# Seconds since the Unix epoch for every row.
# NOTE(review): assumes `dt` has a nanosecond-resolution DatetimeIndex - confirm.
dt['UnixTime'] = dt.index.astype(np.int64) // 10**9
dt = dt.fillna(0)

# DTW silhouette score for every cluster count from 2 to 11
# (each column of `dt` is treated as one time series).
evalu = []
for k in range(10):
    km = TimeSeriesKMeans(metric="dtw", n_clusters=k + 2, random_state=23,
                          verbose=True)
    Y = km.fit_predict(dt.T)
    score = silhouette_score(dt.T, Y, metric="dtw")
    evalu.append(score)

# NOTE(review): the original remark here said "6 clusters is best", yet the
# final model is fitted with 7 clusters - confirm which one is intended.
km = TimeSeriesKMeans(metric="dtw", n_clusters=7, random_state=23,
                      verbose=True)
Y = km.fit_predict(dt.T)

# Positional indices of the series assigned to the first three clusters.
c1, c2, c3 = (np.where(Y == label)[0].tolist() for label in range(3))
seed = 0
numpy.random.seed(seed)

X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
# Keep the series whose label is below 4 (the first three classes).
X_train = X_train[y_train < 4]
numpy.random.shuffle(X_train)
# Standardise the first 50 shuffled series only.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
# Resample every series down to 40 points.
X_train = TimeSeriesResampler(sz=40).fit_transform(X_train)
sz = X_train.shape[1]

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=3, verbose=True, random_state=seed)
y_pred = km.fit_predict(X_train)

plt.figure()
for yi in range(3):
    # First row of a 3x3 grid: one panel per cluster.
    ax = plt.subplot(3, 3, yi + 1)
    for xx in X_train[y_pred == yi]:
        ax.plot(xx.ravel(), "k-", alpha=.2)
    ax.plot(km.cluster_centers_[yi].ravel(), "r-")
    ax.set_xlim(0, sz)
    ax.set_ylim(-4, 4)
    ax.text(0.55, 0.85, 'Cluster %d' % (yi + 1), transform=ax.transAxes)
    if yi == 1:
        # The middle panel carries the title of the whole row.
        ax.set_title("Euclidean $k$-means")
class SKU_Clusterer:
    """Cluster fixed-length SKU windows via autoencoder, t-SNE and k-means.

    Windows of ``n_steps`` rows are read from the CSV files in ``sku_path``,
    compressed by an autoencoder, embedded into 2-D with t-SNE and clustered
    with ``TimeSeriesKMeans``; a k-nearest-neighbours classifier is fitted on
    the resulting labels.

    NOTE(review): several messages are emitted with ``print(..., verbosity=1)``.
    The builtin ``print`` rejects that keyword, so this presumably relies on a
    project-level ``print`` wrapper - confirm.
    NOTE(review): ``load``/``dump`` are used on local model/config files; they
    look like pickle-style (de)serialisers, which must never be pointed at
    untrusted files.
    """

    def __init__(self, *args, **kwargs):
        """Read the configuration from ``kwargs`` (all values are strings).

        Required keys: ``n_clusters``, ``k_means_n_iterations``,
        ``rnn_epochs``, ``n_steps``, ``encoder_output_units``,
        ``decoder_output_units``, ``batch_size``, ``early_stopping``,
        ``sku_path`` and ``cold_start``.  List-valued options are
        ``;``-separated.  When ``cold_start`` is not ``'True'`` the saved
        configuration is loaded immediately.
        """
        # clustering params
        self.classifier = None
        self.clusters_indices = {}
        self.n_clusters = int(kwargs['n_clusters'])
        # A non-positive cluster count means "estimate it" (see train()).
        self.use_kmeans = self.n_clusters > 0
        self.kmeans_iterations = int(kwargs['k_means_n_iterations'])
        self.k_means_metric = kwargs.get('k_means_metric', 'euclidean')
        if self.k_means_metric not in ['dtw', 'euclidean', 'softdtw']:
            print('invalid k_means metric, seting to `euclidean`', verbosity=1)
            self.k_means_metric = 'euclidean'

        # RNN params
        self.n_epochs = self._int_list(kwargs['rnn_epochs'])
        self.n_steps = self._int_list(kwargs['n_steps'])[0]
        self.encoder_output_units = self._int_list(kwargs['encoder_output_units'])
        self.decoder_output_units = self._int_list(kwargs['decoder_output_units'])
        self.batch_size = self._int_list(kwargs['batch_size'])
        # NOTE(review): unlike the other options this is a list wrapped in a
        # list and is not converted - kept as in the original; confirm intent.
        self.early_stopping = [kwargs['early_stopping'].split(';')]
        self.discriminative_cols = kwargs.get('discriminative_columns', None)
        if self.discriminative_cols:
            self.discriminative_cols = self.discriminative_cols.strip().split(';')

        # paths
        self.sku_path = kwargs['sku_path']
        self.autoencoder_path = './models/autoencoder.pkl'
        self.encoder_path = './models/encoder.pkl'
        self.decoder_path = './models/decoder.pkl'
        self.classifier_path = './models/classifier.pkl'
        self.kmeans_path = './models/kmeans_model.pkl'
        self.embedder_path = './models/embedder.pkl'
        self.config_path = './models/clusterer_config.pkl'

        # other params
        self.full_dataset = kwargs.get('full_dataset', False)
        self.cold_start = kwargs['cold_start'] == 'True'
        self.encoding = kwargs.get('encoding', 'utf8')
        if self.full_dataset == 'True':
            self._load_datasets = self._load_datasets_full
        else:
            self._load_datasets = self._load_datasets_partial
        if not self.cold_start:
            self.load_configuration()

    @staticmethod
    def _int_list(raw):
        """Parse a ``;``-separated string of integers into a list of ints."""
        return [int(part) for part in raw.split(';')]

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value in *labels* (smallest wins ties)."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def filter_dataset(self, df):
        """Restrict *df* to the configured discriminative columns.

        Unknown column names are reported and dropped from the configuration.
        When no (valid) discriminative column is configured, *df* is returned
        unchanged.
        """
        chosen_cols = []
        # `discriminative_columns` is optional, so the attribute may be None.
        for c in self.discriminative_cols or []:
            if c not in df.columns:
                print(f'invalid column name: `{c}`, omitting...', verbosity=1)
            else:
                chosen_cols.append(c)
        self.discriminative_cols = chosen_cols
        if self.discriminative_cols != []:
            print(f'RUNNING FILTERING on columns:{", ".join(self.discriminative_cols)}')
            df = df.filter(items=self.discriminative_cols)
        else:
            print('No discriminative columns passed, running algoritm on all columns')
        return df

    def _read_sku_file(self, file):
        """Read one ``;``-separated CSV from ``sku_path`` and filter its columns."""
        df = pd.read_csv(os.path.join(self.sku_path, file),
                         encoding=self.encoding, sep=';')
        return self.filter_dataset(df)

    def _load_datasets_partial(self):
        """Load every file as non-overlapping windows of ``n_steps`` rows.

        Leading rows that do not fill a whole window are discarded.
        """
        datasets = []
        for file in os.listdir(self.sku_path):
            df = self._read_sku_file(file)
            n_splits = df.shape[0] // self.n_steps
            trim = df.shape[0] % self.n_steps
            df = df[trim:]
            for split_idx in range(n_splits):
                chunk = df[split_idx * self.n_steps:(split_idx + 1) * self.n_steps]
                datasets.append(chunk.values)
        return np.array(datasets, dtype=np.float64)

    def _load_datasets_full(self):
        """Load every file as overlapping (stride 1) windows of ``n_steps`` rows.

        NOTE(review): the range stops at ``rows - n_steps``, so the last
        possible window is not produced - kept as in the original; confirm.
        """
        datasets = []
        for file in os.listdir(self.sku_path):
            df = self._read_sku_file(file)
            for offset in range(df.shape[0] - self.n_steps):
                chunk = df[offset:offset + self.n_steps]
                datasets.append(chunk.values)
        return np.array(datasets, dtype=np.float64)

    def load_configuration(self):
        """Restore cluster labels, cluster count and training data from disk.

        Does nothing (apart from a message) when the config file is missing.
        """
        if not os.path.exists(self.config_path):
            print('Config file not found...', verbosity=1)
            return
        # The objects are stored back to back; read them in the saved order.
        with open(self.config_path, "rb") as config:
            self.clusters_indices = load(config)
            self.n_clusters = load(config)
            self.use_kmeans = load(config)
            self.train_dataset = load(config)

    def save_configuration(self):
        """Persist cluster labels, cluster count and training data to disk."""
        with open(self.config_path, "wb") as config:
            dump(self.clusters_indices, config)
            dump(self.n_clusters, config)
            dump(self.use_kmeans, config)
            dump(self.train_dataset, config)

    def load_models(self, cold_start=False):
        """Load whichever saved models exist, unless *cold_start* is set.

        :return: True only when the autoencoder models, the classifier and the
            embedder all exist on disk and *cold_start* is False.
        """
        models_exists = os.path.isfile(self.autoencoder_path) \
            and os.path.isfile(self.encoder_path) \
            and os.path.isfile(self.decoder_path)
        classifier_exists = os.path.isfile(self.classifier_path)
        kmeans_exists = os.path.isfile(self.kmeans_path)
        embedder_exists = os.path.isfile(self.embedder_path)

        if not (models_exists and classifier_exists):
            print('NO MODELS FOUND, COLD START REQUIRED...', verbosity=1)

        if not cold_start and models_exists:
            print('AUTOENCODER MODELS EXISTS, LOADING...')
            self.autoenc = load_model(self.autoencoder_path)
            self.enc = load_model(self.encoder_path)
            self.dec = load_model(self.decoder_path)

        if not cold_start and classifier_exists:
            print('CLASSIFIER MODEL EXISTS, LOADING...')
            with open(self.classifier_path, 'rb') as model_file:
                self.classifier = load(model_file)

        if not cold_start and kmeans_exists:
            print('K_MEANS MODEL EXISTS, LOADING...')
            with open(self.kmeans_path, 'rb') as model_file:
                self.k_means_classifier = load(model_file)

        if not cold_start and embedder_exists:
            with open(self.embedder_path, 'rb') as model_file:
                self.embedder = load(model_file)

        return models_exists and classifier_exists and embedder_exists and not cold_start

    def train(self, dataset=None):
        """Train (or load) the autoencoder, embedder, k-means and classifier.

        :param dataset: array of windows; loaded from ``sku_path`` when None.

        NOTE(review): the source this was recovered from had lost its
        indentation.  Everything below is placed inside the
        ``if not self.load_models(...)`` branch, i.e. nothing is retrained when
        all saved models were loaded - confirm against the intended behaviour.
        """
        if dataset is None:
            dataset = self._load_datasets()
        n_features = dataset.shape[-1]

        if not self.load_models(self.cold_start):
            # Talos scan
            params = {
                'n_steps': [self.n_steps],
                'n_features': [n_features],
                'epochs': self.n_epochs,
                'enc_units': self.encoder_output_units,
                'dec_units': self.decoder_output_units,
                'batch_size': self.batch_size,
                'early_stopping': self.early_stopping,
                'scan': [True]
            }
            results = talos.Scan(dataset, np.zeros_like(dataset), params=params,
                                 model=create_autoencoder_models)
            # Lowest validation loss wins.
            best_params = results.data.sort_values(by=['val_loss'], ascending=True).iloc[0].to_dict()
            best_params['scan'] = False
            print('\n', '='*30, '\nBEST AUTOENCODER HYPERPARAMETERS:\n',
                  '\n'.join([f'{key} = {value}' for key, value in best_params.items()]),
                  '\n', '='*30)

            self.autoenc, self.enc, self.dec = create_autoencoder_models(
                dataset, np.zeros_like(dataset), params=best_params)

            hist = self.autoenc.history.history
            loss = hist['loss']
            val_loss = hist['val_loss']
            plt.figure(figsize=(10, 7))
            plt.plot(loss, label='training_loss')
            plt.plot(val_loss, label='validation_loss')
            plt.legend()
            plt.title('Autoencoder loss')
            plt.savefig('./loss/autoencoder_loss.png')

            self.train_dataset = dataset
            classifier_inputs = self.enc.predict(dataset)
            self.embedder = TSNE(n_components=2, perplexity=40, random_state=42)
            embedded = self.embedder.fit_transform(classifier_inputs)

            if not self.use_kmeans:
                print('CLUSTER COUNT NOT SPECIFIED, CALCULATING CLUSTER NUMBER...', verbosity=1)
                self.u_classifier = DBSCAN(eps=3, n_jobs=-1)
                classes = self.u_classifier.fit_predict(embedded)
                self.n_clusters = len(set(classes))
                self.use_kmeans = True

            self.k_means_classifier = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                                       metric=self.k_means_metric,
                                                       n_init=self.kmeans_iterations,
                                                       verbose=True,
                                                       max_iter=1000)
            # Fit once and reuse the fitted labels (the model used to be
            # fitted a second time through fit_predict on the same data).
            self.k_means_classifier.fit(embedded)
            self.k_means_classifier.transform = self.k_means_classifier.predict  # hotfix
            self.clusters_indices = self.k_means_classifier.labels_

            self.classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
            self.classifier.fit(embedded, self.clusters_indices)

            with open(self.classifier_path, 'wb') as model_file:
                dump(self.classifier, model_file)
            with open(self.embedder_path, 'wb') as model_file:
                dump(self.embedder, model_file)
            with open(self.kmeans_path, 'wb') as model_file:
                dump(self.k_means_classifier, model_file)
            self.save_configuration()

            # =================================================================
            # Cluster visualisation
            # =================================================================
            clusters = self.k_means_classifier.transform(embedded)
            unique_clusters = set(clusters)
            plt.figure()
            for clas in unique_clusters:
                c = generate_color()
                mask = clusters == clas
                filtered = embedded[mask]
                plt.scatter(filtered[:, 0], filtered[:, 1], c=c, label=f'cluster {clas + 1}')
            plt.legend()
            plt.savefig('./clusters/clusters.png')

    def embed(self, dataset):
        """Encode *dataset* and re-fit the embedder on it; return the embedding."""
        flattened = self.enc.predict(dataset)
        embedded = self.embedder.fit_transform(flattened)
        return embedded

    def predict(self, sample):
        """Return the encoder output for *sample*."""
        result = self.enc.predict(sample)
        return result

    def predict_class(self, sample, plot_cluster=False):
        """Assign *sample* to the majority cluster of its nearest neighbours.

        The sample is appended to the training data, everything is re-embedded
        and the six nearest training points vote with their cluster labels.

        :return: ``(cls, distances, indices)`` - the winning cluster label and
            the neighbour distances/indices from ``kneighbors``.
        """
        extended_dataset = np.vstack((
            self.train_dataset,
            sample.reshape(-1, *sample.shape)
        ))
        embedded_space = self.embed(extended_dataset)
        sample_coords = embedded_space[-1]

        nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(embedded_space[:-1])
        distances, indices = nbrs.kneighbors(sample_coords.reshape(1, -1))
        # Majority vote: index the labels by the position of the largest
        # count (not by the position inside the de-duplicated counts).
        cls = self._majority_label(self.clusters_indices[indices])

        print(distances)
        print(indices)
        print(self.clusters_indices[indices])
        print(cls)

        if plot_cluster:
            plt.figure()
            plt.scatter(embedded_space[:, 0], embedded_space[:, 1])
            plt.scatter(sample_coords[0], sample_coords[1], marker='x', c='red')
        return cls, distances, indices

    def compress_dataset(self, dataset):
        """Return the encoder output for *dataset*."""
        return self.enc.predict(dataset)

    def cluster(self, dataset, sample=None, plot_clusters=False):
        """Re-fit the embedder and k-means on *dataset* and return its labels.

        When *sample* is given it is stacked in front of *dataset*, so it is
        row 0 of both returned values.

        :return: ``(dataset, classes)``.
        """
        if sample is not None:
            dataset = np.vstack((sample, dataset))
        compressed_dataset = self.compress_dataset(dataset)
        embedded_dataset = self.embedder.fit_transform(compressed_dataset)
        classes = self.k_means_classifier.fit_predict(embedded_dataset)

        if plot_clusters:
            plt.figure()
            unique_clusters = set(classes)
            for clas in unique_clusters:
                c = generate_color()
                mask = classes == clas
                filtered = embedded_dataset[mask]
                plt.scatter(filtered[:, 0], filtered[:, 1], c=c, label=f'cluster {clas + 1}')
            if sample is not None:
                plt.scatter(embedded_dataset[0, 0], embedded_dataset[0, 1], c='red', marker='x')
            plt.legend()
        return dataset, classes
def k_means_clustering(sd_log):
    """
    k_means clustering of all features using dtw for multivariate time series

    The number of clusters is chosen by the best DTW silhouette score over
    2 .. (number of features - 1) clusters; the clusters are then plotted,
    one subplot per cluster.

    NOTE: the waiting-time column is dropped from ``sd_log.data`` in place.

    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as key and features as values
    :raises ValueError: if fewer than three features remain, because no
        candidate cluster count can be evaluated then
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    data.drop(columns=[sd_log.waiting_time], inplace=True)

    # Get data as numpy array
    X = [sd_log.get_points(col) for col in data.columns]

    # Normalize the data (y = (x - min) / (max - min)); used for plotting only
    data_norm = data.copy()
    for column in data_norm.columns:
        col_min = data_norm[column].min()
        col_max = data_norm[column].max()
        data_norm[column] = (data_norm[column] - col_min) / (col_max - col_min)

    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters by looping through the candidate
    # counts and storing the silhouette score of each configuration.
    candidate_counts = range(2, len(data.columns))
    if not candidate_counts:
        raise ValueError(
            "k_means_clustering needs at least 3 features to choose a cluster "
            "count, got %d" % len(data.columns))
    sil_scores = {}
    for n in candidate_counts:
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = silhouette_score(X, model_tst.predict(X), metric="dtw")

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(list).to_dict()

    print('Plotting Clusters')
    fig, axs = plt.subplots(opt_k)

    # For each label there is, plot every series with that label
    for row_i, (cluster, features) in enumerate(cluster_metrics_dict.items()):
        for feat in features:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(features) > 100:
            # TODO draw mean in red if more than one cluster
            # Mean curve of the cluster: average of its features per time
            # step (NaNs are skipped by pandas).
            cluster_mean = data_norm[features].mean(axis=1)
            axs[row_i].plot(cluster_mean, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))

    plt.show()
    # return dict {cluster_id: features}
    return cluster_metrics_dict
# NOTE(review): the sorted frame is not assigned, so this statement has no
# lasting effect - confirm whether `df_cluster` was meant to be rebound.
df_cluster.sort_values(['cluster'])

# Customer-to-cluster table and plot for the `ks` model fitted earlier.
create_cluster_info(y_pred_ks, cols)
ks_plot_file = 'pgn_customer_cluster_{}.jpg'.format(id_unit_usaha)
plot_clusters(formatted_norm_dataset, y_pred_ks, clusters, ks, ks_plot_file)

# K-means with the DTW metric (DBA barycenters), five clusters.
clusters = 5
dba_km = TimeSeriesKMeans(
    n_clusters=clusters,
    metric="dtw",
    max_iter_barycenter=20,
    verbose=False,
    random_state=seed,
)
y_pred_dbakm = dba_km.fit_predict(formatted_norm_dataset)

# Customer-to-cluster table and plot for the DBA-DTW model.
create_cluster_info(y_pred_dbakm, cols)
plot_clusters(formatted_norm_dataset, y_pred_dbakm, clusters, dba_km,
              "./plot_custers_KMean_DBA_DTW.jpg")
_SEPARATOR = "#############################################################################################"


def _draw_cluster_row(fig, grid_spec, layer, n_clusters, n_measuring,
                      min_data, max_data, data, labels,
                      centers=None, center_style=None, widen_to_centers=False):
    """Draw one subplot per cluster on grid row *layer*; return the title axis.

    When *centers* is given, each cluster centre is drawn with *center_style*;
    with *widen_to_centers* the y-limits are extended to contain the centre.
    The title axis is the second subplot, or the only one for a single cluster.
    """
    title_index = min(1, n_clusters - 1)
    title_axis = None
    for i_cluster in range(n_clusters):
        low, high = min_data, max_data
        if widen_to_centers:
            center = centers[i_cluster].ravel()
            low = min(min_data, center.min())
            high = max(max_data, center.max())
        axis = create_figure_axes(fig, grid_spec, layer, i_cluster, n_measuring,
                                  low, high, data, labels, 'tab:blue')
        if centers is not None:
            axis.plot(centers[i_cluster].ravel(), center_style)
        if i_cluster == title_index:
            title_axis = axis
    return title_axis


def _finish_stage(title_axis, start, title, color, log_label):
    """Put the elapsed time of a clustering stage in its title and on stdout."""
    timer_tick = get_time_tick(start)
    if title_axis is not None:
        title_axis.set_title("%s (%s)" % (title, timer_tick),
                             color=color, y=1, pad=14)
    print(_SEPARATOR)
    print("%s time processing: %s" % (log_label, timer_tick))


def main(argv):
    """Cluster the input time series with the selected methods and save results.

    Parses the command line in *argv*, plots the raw data and value
    histograms, optionally normalises/standardises the data, runs every
    requested clustering method (Euclidean, DTW and soft-DTW k-means, k-Shape,
    GAK kernel k-means) and writes the figure plus one CSV of labels per
    method to ``./out/<timestamp>/``.

    Exits with status 2 on invalid arguments (including a cluster count that
    is not a positive integer).
    """
    # define global timer to obtain global execution time
    start_global = timer()

    # define globals variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data

    #########################################################################
    # Input arguments parsing
    #########################################################################
    help_message = (
        'clustering.py -h \n\n'
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n'
        'by default: processing input data (without any sampling)'
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n'
        'options list: \n'
        ' -c / --clusters <number_clusters> # set number of clusters (default 3) \n\n'
        ' -i / --ifile <input_file> # set input filename \n'
        ' -n / --normalise # normalise input data \n'
        ' -s / --standardise # standardise input data \n\n'
        ' -a / --all # perform all 5 implemented methods of clustering: \n'
        ' euclidean, dtw, soft-dtw, gak k-means and k-shape\n'
        ' -E / --euclidean # perform euclidean k-means clustering \n'
        ' -D / --dtw # perform dtw k-means clustering \n'
        ' -S / --soft-dtw # perform soft-dtw k-means clustering \n'
        ' -K / --k-shape # perform k-shape clustering \n'
        ' -G / --gak # perform GAK k-means clustering \n'
    )

    # Create new object to save arguments
    i_args = Arguments()

    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3

    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help", "clusters=", "ifile=", "normalise", "standardise",
                "all", "euclidean", "dtw", "soft-dtw", "k-shape", "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            i_args.number_clusters = arg
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True

    # getopt delivers the cluster count as text; everything below (grid
    # layout, range(), n_clusters=...) needs a positive integer.
    try:
        n_clusters = int(i_args.number_clusters)
    except (TypeError, ValueError):
        print(help_message)
        sys.exit(2)
    if n_clusters < 1:
        print(help_message)
        sys.exit(2)
    i_args.number_clusters = n_clusters

    # normalise maximum number of subplots levels
    n_rows_plot = min(n_rows_plot, 8)

    #########################################################################
    # Raw data processing stage
    #########################################################################
    # set style to matplotlib plot; newer matplotlib releases only ship the
    # style under its versioned name
    try:
        mpl.style.use('seaborn')
    except OSError:
        mpl.style.use('seaborn-v0_8')

    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)

    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())

    # convert raw data to the format which can be used by tslearn
    # (3-d dimensional array); shorter series are padded with NaN
    formatted_data = to_time_series_dataset(raw_data)
    print(formatted_data.shape)

    # obtain number of measuring
    n_measuring = formatted_data.shape[1]

    # define figure, grid_spec to create layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(n_rows_plot, n_clusters)
    # set A4 size to figure
    fig.set_size_inches(8.5, 11.75)

    # setup count of layers of subplots
    count_layer = 3

    # setup first subplot and draw raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])
    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)

    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()
    # draw title for chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)'
                            % (formatted_data_min, formatted_data_max))

    # obtain and print executing time of data processing stage to console
    timer_tick = get_time_tick(start_global)
    plt.ion()
    plt.show()
    print("Raw data processing time: %s" % timer_tick)

    #########################################################################
    # Data preprocessing stage
    #########################################################################
    start = timer()

    # linearly interpolate NaN values
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)

    # Scaling: to know whether to normalise or standardise we look at the
    # distribution of values of a few randomly chosen measurements.
    random_indexes = numpy.random.choice(n_measuring, n_clusters, replace=False)
    histogram_data = formatted_data[:, random_indexes]

    # the histogram that carries the row title (the only one if n_clusters == 1)
    title_index = min(1, n_clusters - 1)
    for i_histogram in range(n_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram],
            bins=25,
            density=True
        )
        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy")
        if i_histogram == title_index:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"
            preprocessing = '' if preprocessing == '' else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy', y=1, pad=14
            )

    # if no processing data option chosen continue with raw data
    processed_data = formatted_data

    # normalize data: Min-Max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")

    # standardize data: centred around the mean with unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")

    # obtain max/min value of data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2

    timer_tick = get_time_tick(start)
    print(_SEPARATOR)
    print("Data processing stage elapsed time: %s" % timer_tick)

    # label arrays of the methods that ran, in execution order
    results = {}

    def draw_row(labels, **center_options):
        # reads the *current* count_layer each time it is called
        return _draw_cluster_row(fig, grid_spec, count_layer, n_clusters,
                                 n_measuring, min_data, max_data,
                                 processed_data, labels, **center_options)

    #########################################################################
    # Euclidean k-means clustering
    #########################################################################
    if i_args.euclidean_clustering:
        start = timer()
        print("Euclidean k-means")
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=n_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)
        results["euclidean"] = euclidean_clustered_data
        title_axis = draw_row(euclidean_clustered_data,
                              centers=k_means_euclidean.cluster_centers_,
                              center_style="tab:green")
        # increment count of filled layer of subplots
        count_layer += 1
        _finish_stage(title_axis, start, "Euclidean $k$-means", 'tab:green',
                      "Euclidean k-means")

    #########################################################################
    # DTW k-means clustering (Dynamic Time Warping distance between means)
    #########################################################################
    if i_args.dtw_clustering:
        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=n_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6)
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)
        results["dtw"] = dtw_clustered_data
        title_axis = draw_row(dtw_clustered_data,
                              centers=k_means_DTW.cluster_centers_,
                              center_style="tab:red")
        count_layer += 1
        _finish_stage(title_axis, start, "DTW $k$-means", 'tab:red',
                      "DTW k-means")

    #########################################################################
    # Soft-DTW k-means clustering
    #########################################################################
    if i_args.soft_dtw_clustering:
        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=n_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6)
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)
        results["soft_dtw"] = soft_dtw_clustered_data
        title_axis = draw_row(soft_dtw_clustered_data,
                              centers=k_means_soft_DTW.cluster_centers_,
                              center_style="tab:purple")
        count_layer += 1
        _finish_stage(title_axis, start, "Soft-DTW $k$-means", 'tab:purple',
                      "Soft-DTW k-means")

    #########################################################################
    # k-Shape clustering
    #########################################################################
    if i_args.k_shape_clustering:
        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=n_clusters,
                         verbose=True,
                         random_state=seed)
        k_shape_clustered_data = k_shape.fit_predict(processed_data)
        results["k_shape"] = k_shape_clustered_data
        # y-limits are widened per cluster so the centre is always visible
        title_axis = draw_row(k_shape_clustered_data,
                              centers=k_shape.cluster_centers_,
                              center_style="tab:orange",
                              widen_to_centers=True)
        count_layer += 1
        _finish_stage(title_axis, start, "$K$-Shape", 'tab:orange', "K-Shape")

    #########################################################################
    # Global Alignment kernel k-means clustering
    # since kernel is used, there is no centroid of the cluster
    #########################################################################
    if i_args.gak_clustering:
        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=n_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6)
        gak_clustered_data = gak_k_means.fit_predict(processed_data)
        results["gak"] = gak_clustered_data
        title_axis = draw_row(gak_clustered_data)
        count_layer += 1
        _finish_stage(title_axis, start, "Global Alignment kernel $k$-means",
                      'tab:cyan', "GAK k-means")

    #########################################################################
    # Saving results
    #########################################################################
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
    # define the name of the directory to be created
    path = "./out/%s" % now
    print(_SEPARATOR)
    try:
        # also creates ./out itself when it does not exist yet
        os.makedirs(path, exist_ok=True)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)

    try:
        # save figure as pdf to out folder
        fig.savefig("./out/%s/visual_result.pdf" % now)
        # save clustering results, one CSV per executed method
        for method, labels in results.items():
            numpy.savetxt(
                "./out/%s/%s_clustering_result.csv" % (now, method),
                labels,
                delimiter=","
            )
    except (OSError, RuntimeError):
        print("Saving results failed")
    else:
        print("Successfully saved results in the path %s " % path)

    #########################################################################
    # obtain and print global executing time
    timer_tick = get_time_tick(start_global)
    print(_SEPARATOR)
    print("All algorithms elapsed time: % s" % timer_tick)

    # render and show plot
    plt.draw()
    plt.pause(0.001)
    input("Press [enter] to finish.")
    print(_SEPARATOR)
# Info-only mode: show the figures produced so far and stop.
if args.info:
    plt.show(block=args.block)
    sys.exit(1)

# ALSO
# for each of the fgrps, we could take the average or a boxplot of the
# number of claims per month.
# ----

# Select the k-means variant requested on the command line.
if args.kmeans_algo == 0:
    k_title, f_title = "Euclidean $k$-means", "euclidian"
    km = TimeSeriesKMeans(n_clusters=num, random_state=seed, verbose=True)
    y_pred = km.fit_predict(X)
    print(y_pred)
elif args.kmeans_algo == 1:
    k_title, f_title = "DBA", "DBA_k_means"
    km = TimeSeriesKMeans(
        n_clusters=num,
        n_init=2,
        metric="dtw",
        max_iter_barycenter=10,
        random_state=seed,
        verbose=True,
    )
    y_pred = km.fit_predict(X)
    print(y_pred)
elif args.kmeans_algo == 2:
    k_title = "Soft-DTW k-means"
    f_title = "soft_DTW"
def mass_upload(startDate, endDate, id_unit_usaha):
    """Cluster the weekly average FDVC profiles of one unit's customers.

    Reads ``amr_bridge`` rows with ``startDate <= FDATETIME < endDate``,
    averages FDVC per customer (``IDREFPELANGGAN``), day of week and hour,
    keeps the customers of ``id_unit_usaha``, z-normalises every customer
    profile and groups the profiles into 5 clusters with DTW k-means (DBA).
    The normalised profiles joined with their cluster labels are written to
    ``output.csv`` in the current working directory.

    The code that inserted the result into ``PL_CUSTOMER_CLUSTER`` was
    commented out in the original and is still disabled, so despite its name
    this function does not write to the database.

    Parameters
    ----------
    startDate : str
        Inclusive lower bound for ``FDATETIME``.
    endDate : str
        Exclusive upper bound for ``FDATETIME``.
    id_unit_usaha : str
        Business-unit identifier; compared as text against the
        ``ID_UNIT_USAHA`` column.

    Returns
    -------
    int
        ``n_timestamps * n_customers + n_timestamps * n_clusters``, i.e. the
        number of rows (customer values plus centroid values) the disabled
        upload would have inserted.
    """
    print(id_unit_usaha)

    # NOTE(review): the connection URL embeds credentials in source code;
    # move them to configuration or the environment.
    engine = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    # The date bounds are bound parameters rather than concatenated text so
    # that caller-supplied values cannot alter the statement.
    sql = sqlalchemy.text(
        "SELECT a.IDREFPELANGGAN, a.ID_UNIT_USAHA, 1 AS FSTREAMID, "
        "DATEPART(dw, a.FDATETIME) as FDAYOFWEEK, a.FHOUR, "
        "AVG(a.FDVC) as AVG_FDVC "
        "FROM (SELECT IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR, "
        "SUM(FDVC) as FDVC "
        "FROM amr_bridge "
        "WHERE FDATETIME >= :start_date AND FDATETIME < :end_date "
        "GROUP BY IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR) a "
        "GROUP BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, "
        "DATEPART(dw, a.FDATETIME), a.FHOUR "
        "ORDER BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, "
        "DATEPART(dw, a.FDATETIME), a.FHOUR"
    )
    df = pd.read_sql_query(
        sql, engine,
        params={"start_date": startDate, "end_date": endDate})
    print('total Data: ' + str(len(df)))

    # Keep only the requested unit.  A boolean mask replaces the formatted
    # DataFrame.query string; .copy() makes the later fillna act on an
    # independent frame instead of a slice of ``df``.
    columns = ['FDAYOFWEEK', 'FHOUR', 'IDREFPELANGGAN', 'AVG_FDVC']
    unit_mask = df['ID_UNIT_USAHA'] == str(id_unit_usaha)
    df_week1 = df.loc[unit_mask, columns].copy()
    df_week1 = df_week1.fillna(0.0)

    # One column per customer, one row per (day of week, hour).
    df_pivoted1 = df_week1.pivot(index=['FDAYOFWEEK', 'FHOUR'],
                                 columns='IDREFPELANGGAN',
                                 values='AVG_FDVC')
    df_pivoted1 = df_pivoted1.fillna(0.0)

    # Drop customers whose profile has zero mean: z-normalisation carries no
    # information for an all-zero profile.
    df_pivoted1 = df_pivoted1.loc[:, df_pivoted1.mean() != 0]

    seed = 0
    np.random.seed(seed)

    # Convert the frame to tslearn's (n_series, n_timestamps, 1) layout,
    # one series per customer.
    pivoted_columns = list(df_pivoted1.columns)
    pivoted_series = [df_pivoted1[col].values for col in pivoted_columns]
    formatted_dataset = to_time_series_dataset(pivoted_series)
    print("Data shape: {}".format(formatted_dataset.shape))

    formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
        formatted_dataset)
    sz = formatted_norm_dataset.shape[1]
    print("Data shape: {}".format(sz))

    clusters = 5
    totalColumn = formatted_norm_dataset.shape[0]  # number of customers
    totalRow = formatted_norm_dataset.shape[1]     # timestamps per profile
    totalData = totalRow * totalColumn + totalRow * clusters

    dba_km = TimeSeriesKMeans(n_clusters=clusters,
                              metric="dtw",
                              max_iter_barycenter=20,
                              verbose=False,
                              random_state=seed)
    y_pred_dbakm = dba_km.fit_predict(formatted_norm_dataset)

    # One row per customer: normalised values followed by id and cluster.
    df_normalized = pd.DataFrame(formatted_norm_dataset[:, :, 0])
    df_cluster = pd.DataFrame({'idrefpelanggan': pivoted_columns,
                               'cluster': y_pred_dbakm})
    df_normalized_detail = df_normalized.join(df_cluster)
    df_normalized_detail.to_csv("output.csv", index=False)

    # NOTE: persisting customer rows and centroid rows into
    # PL_CUSTOMER_CLUSTER is intentionally not performed here (it was
    # disabled in the original); ``totalData`` still reports how many rows
    # that upload would produce.
    return totalData
# Rebase every column of ``m`` to its first observation so that each series
# starts at 1 and only its relative evolution drives the clustering.
begin_values = m.iloc[0]
for i in range(len(m.columns)):
    # .iloc keeps the lookup positional; a plain begin_values[i] is resolved
    # by label first and only falls back to position on non-integer indexes.
    m.iloc[:, i] = m.iloc[:, i] / begin_values.iloc[i]

# tslearn expects one series per row.
x = m.to_numpy().transpose()

clusters = 10
dba_km = TimeSeriesKMeans(n_clusters=clusters,
                          n_init=2,
                          n_jobs=24,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)
y = dba_km.fit_predict(x)

# Cluster label per series, indexed by the original column name.  The column
# is named at construction instead of writing into ``columns.values``, which
# mutates the Index buffer behind pandas' back.
yy = pd.DataFrame(y.reshape(-1, 1), columns=['sector'])
yy['name'] = m.columns
yy.set_index('name', inplace=True)

# Two plots per row; the row count follows ``clusters`` (5 rows for 10).
n_plot_rows = -(-clusters // 2)
for cluster in range(clusters):
    plt.subplot(n_plot_rows, 2, cluster + 1)
    for xx in x[y == cluster]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
# Sweep the number of clusters over [min_cluster, max_cluster) and record,
# for each count, the DTW silhouette score, the k-means inertia (SSE) and
# the predicted labels.
min_cluster = 2
max_cluster = 21

# Results are keyed by algorithm name; only time-series k-means is evaluated
# here (k-shape and global-alignment-kernel k-means are not run).
silhouette_score_dict = {"time-series-k-means": []}
sse_dict = {"time-series-k-means": []}
label_dict = {"time-series-k-means": {}}

for i in range(min_cluster, max_cluster):
    print(service + "-cluster:" + str(i))

    km = TimeSeriesKMeans(n_clusters=i, verbose=True)
    label = km.fit_predict(X_train)

    score = silhouette_score(X_train, label, metric="dtw")
    silhouette_score_dict["time-series-k-means"].append(score)
    sse_dict["time-series-k-means"].append(km.inertia_)
    label_dict["time-series-k-means"][i] = label

# String renderings of the collected metrics.
s1 = str(silhouette_score_dict)
s2 = str(sse_dict)
# Trim each reconstructed signal to the length of its reference series.
dishwasher_omp = result['dishwasher'].RecSignal[:len(appliance_dict['dishwasher'])]
fridgefreezer_omp = result['fridgefreezer'].RecSignal[:len(appliance_dict['fridgefreezer'])]
kettle_omp = result['kettle'].RecSignal[:len(appliance_dict['kettle'])]
microwave_omp = result['microwave'].RecSignal[:len(appliance_dict['microwave'])]

# Re-lay the washer-dryer coefficient matrix as a tslearn dataset: every
# column of Kcoef becomes one univariate series of shape (rows, 1).
washer_kcoef = result['washerdryer'].Kcoef
half_washer = np.zeros((washer_kcoef.shape[1], washer_kcoef.shape[0], 1))
for i in range(washer_kcoef.shape[1]):
    half_washer[i, :, 0] = washer_kcoef[:, i]
# half_washer += result['washerdryer'].Kcoef

# Split the series into two groups with DTW k-means (DBA barycenters).
dba_km = TimeSeriesKMeans(n_clusters=2,
                          n_init=2,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10)
y_pred = dba_km.fit_predict(half_washer)
washer_labels = dba_km.labels_

# Accumulate, per cluster label, the element-wise sum of its member series.
summed_washer_clusters = {}
for g in np.unique(washer_labels):
    summed_washer_clusters[g] = np.zeros(half_washer.shape[1])
    for member in np.flatnonzero(washer_labels == g):
        summed_washer_clusters[g] += half_washer[member, :, 0]

# %%
# Plot every series coloured by the cluster it was assigned to.
cdict = {0: 'red', 1: 'blue', 2: 'green'}
for series, series_label in zip(half_washer, washer_labels):
    plt.plot(series[:, 0], label=series_label, color=cdict[series_label])
plt.ylim([0, 4000])
plt.legend()
class NonMyopicEarlyClassifier(ClassifierMixin, TimeSeriesBaseEstimator):
    """Early Classification modelling for time series using the model
    presented in [1]_.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to form.

    base_classifier : Estimator or None
        Estimator (instance) to be cloned and used for classifications.
        If None, the chosen classifier is a 1NN with Euclidean metric.

    min_t : int
        Earliest time at which a classification can be performed on a time
        series

    lamb : float
        Value of the hyper parameter lambda used during the computation of the
        cost function to evaluate the probability
        that a time series belongs to a cluster given the time series.

    cost_time_parameter : float
        Parameter of the cost function of time. This function is of the form :
        f(time) = time * cost_time_parameter

    random_state: int
        Random state used for the clustering and for the train/validation
        split performed in `fit`

    Attributes
    ----------
    classifiers_ : dict
        Maps each time stamp t in [min_t, maximum_time_stamp] to the
        classifier trained on series truncated to their first t time stamps,
        that is, (maximum_time_stamp - min_t + 1) elements.

    pyhatyck_ : array like of shape (maximum_time_stamp - min_t + 1, \
n_cluster, __n_classes, __n_classes)
        Contains the probabilities of being classified as class y_hat given
        class y and cluster ck for a trained classifier. The penultimate
        dimension of the array is associated to the true class of the series
        and the last dimension to the predicted class. Entries of a cluster
        that has no sample in the validation half are left at 0.

    pyck_ : array like of shape (__n_classes, n_cluster)
        Contains the probabilities of being of true class y given a cluster ck

    _X_fit_dims : tuple
        Shape of the training dataset

    Examples
    --------
    >>> dataset = to_time_series_dataset([[1, 2, 3, 4, 5, 6],
    ...                                   [1, 2, 3, 4, 5, 6],
    ...                                   [1, 2, 3, 4, 5, 6],
    ...                                   [1, 2, 3, 3, 2, 1],
    ...                                   [1, 2, 3, 3, 2, 1],
    ...                                   [1, 2, 3, 3, 2, 1],
    ...                                   [3, 2, 1, 1, 2, 3],
    ...                                   [3, 2, 1, 1, 2, 3]])
    >>> y = [0, 0, 0, 1, 1, 1, 0, 0]
    >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=1000.,
    ...                                  cost_time_parameter=.1,
    ...                                  random_state=0)
    >>> model.fit(dataset, y)  # doctest: +ELLIPSIS
    NonMyopicEarlyClassifier(...)
    >>> print(type(model.classifiers_))
    <class 'dict'>
    >>> print(model.pyck_)
    [[0. 1. 1.]
     [1. 0. 0.]]
    >>> preds, pred_times = model.predict_class_and_earliness(dataset)
    >>> preds
    array([0, 0, 0, 1, 1, 1, 0, 0])
    >>> pred_times
    array([4, 4, 4, 4, 4, 4, 1, 1])
    >>> pred_probas, pred_times = model.predict_proba_and_earliness(dataset)
    >>> pred_probas
    array([[1., 0.],
           [1., 0.],
           [1., 0.],
           [0., 1.],
           [0., 1.],
           [0., 1.],
           [1., 0.],
           [1., 0.]])
    >>> pred_times
    array([4, 4, 4, 4, 4, 4, 1, 1])

    References
    ----------
    .. [1] A. Dachraoui, A. Bondu & A. Cornuejols. Early classification of time
       series as a non myopic sequential decision making problem.
       ECML/PKDD 2015
    """

    def __init__(self, n_clusters=2, base_classifier=None,
                 min_t=1, lamb=1., cost_time_parameter=1., random_state=None):
        super(NonMyopicEarlyClassifier, self).__init__()
        self.base_classifier = base_classifier
        self.n_clusters = n_clusters
        self.min_t = min_t
        self.lamb = lamb
        self.cost_time_parameter = cost_time_parameter
        self.random_state = random_state

    @property
    def classes_(self):
        """Class labels known to the fitted classifiers, or None if the
        model has not been fitted yet."""
        if hasattr(self, 'classifiers_'):
            return self.classifiers_[self.min_t].classes_
        else:
            return None

    def fit(self, X, y):
        """
        Fit early classifier.

        The series are clustered, then split in two halves (stratified by
        cluster): the first half trains one classifier per time stamp, the
        second half is used to estimate, per time stamp and per cluster, the
        misclassification probabilities stored in `pyhatyck_`.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Training data, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        y : array-like of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary

        Returns
        -------
        self : returns an instance of self.
        """

        X = check_array(X, allow_nd=True)
        X = check_dims(X)
        X = to_time_series_dataset(X)
        y_arr = np.array(y)
        label_set = np.unique(y_arr)

        self.cluster_ = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                         random_state=self.random_state)
        if self.base_classifier is not None:
            clf = self.base_classifier
        else:
            clf = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                                 metric="euclidean")
        self.__n_classes_ = len(label_set)
        self._X_fit_dims = X.shape
        sz = X.shape[1]
        self.classifiers_ = {t: clone(clf)
                             for t in range(self.min_t, sz + 1)}
        # Zero-initialised (not np.empty) so that the slot of a cluster
        # skipped below holds a defined value instead of arbitrary memory.
        self.pyhatyck_ = np.zeros((sz - self.min_t + 1,
                                   self.n_clusters,
                                   self.__n_classes_, self.__n_classes_))
        c_k = self.cluster_.fit_predict(X)
        X1, X2, c_k1, c_k2, y1, y2 = train_test_split(
            X, c_k, y_arr,
            test_size=0.5,
            stratify=c_k,
            random_state=self.random_state
        )

        label_to_ind = {lab: ind for ind, lab in enumerate(label_set)}
        y_ = np.array([label_to_ind.get(lab, self.__n_classes_ + 1)
                       for lab in y_arr])

        # Co-occurrence counts of (class, cluster), then normalised per
        # cluster to obtain P(y | c_k).
        vector_of_ones = np.ones((X.shape[0], ))
        self.pyck_ = coo_matrix(
            (vector_of_ones, (y_, c_k)),
            shape=(self.__n_classes_, self.n_clusters),
        ).toarray()
        self.pyck_ /= self.pyck_.sum(axis=0, keepdims=True)

        for t in range(self.min_t, sz + 1):
            self.classifiers_[t].fit(X1[:, :t], y1)
            for k in range(0, self.n_clusters):
                index = (c_k2 == k)
                # Test whether the cluster has any validation sample; the
                # mask length itself is never zero.
                if np.any(index):
                    X2_current_cluster = X2[index, :t]
                    y2_current_cluster = y2[index]
                    y2_hat = self.classifiers_[t].predict(
                        X2_current_cluster
                    )
                    conf_matrix = confusion_matrix(y2_current_cluster, y2_hat,
                                                   labels=label_set)
                    # normalize parameter seems to be quite recent in sklearn,
                    # so let's do it ourselves
                    normalizer = conf_matrix.sum(axis=0, keepdims=True)
                    normalizer[normalizer == 0] = 1  # Avoid divide by 0
                    conf_matrix = conf_matrix / normalizer

                    # pyhatyck_ stores
                    # P_{t+\tau}(\hat{y} | y, c_k) \delta_{y \neq \hat{y}}
                    # elements so it should have a null diagonal because of
                    # the \delta_{y \neq \hat{y}} term
                    np.fill_diagonal(conf_matrix, 0)
                    self.pyhatyck_[t - self.min_t, k] = conf_matrix
        return self

    def get_cluster_probas(self, Xi):
        r"""Compute cluster probability :math:`P(c_k | Xi)`.

        This quantity is computed using the following formula:

        .. math::

            P(c_k | Xi) = \frac{s_k(Xi)}{\sum_j s_j(Xi)}

        where

        .. math::

            s_k(Xi) = \frac{1}{1 + \exp{-\lambda \Delta_k(Xi)}}

        with

        .. math::

            \Delta_k(Xi) = \frac{\bar{D} - d(Xi, c_k)}{\bar{D}}

        and :math:`\bar{D}` is the average of the distances between `Xi` and
        the cluster centers.

        Parameters
        ----------
        Xi: numpy array, shape (t, d)
            A time series observed up to time t

        Returns
        -------
        probas : numpy array, shape (n_clusters, )

        Examples
        --------
        >>> from tslearn.utils import to_time_series
        >>> dataset = to_time_series_dataset([[1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [3, 2, 1, 1, 2, 3],
        ...                                   [3, 2, 1, 1, 2, 3]])
        >>> y = [0, 0, 0, 1, 1, 1, 0, 0]
        >>> ts0 = to_time_series([1, 2])
        >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=0.,
        ...                                  random_state=0)
        >>> probas = model.fit(dataset, y).get_cluster_probas(ts0)
        >>> probas.shape
        (3,)
        >>> probas  # doctest: +ELLIPSIS
        array([0.33..., 0.33..., 0.33...])
        >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=10000.,
        ...                                  random_state=0)
        >>> probas = model.fit(dataset, y).get_cluster_probas(ts0)
        >>> probas.shape
        (3,)
        >>> probas
        array([0.5, 0.5, 0. ])
        >>> ts1 = to_time_series([3, 2])
        >>> model.get_cluster_probas(ts1)
        array([0., 0., 1.])
        """
        Xi = check_array(Xi)
        # Compare Xi with every cluster center truncated to Xi's length.
        diffs = Xi[np.newaxis, :] - self.cluster_.cluster_centers_[:, :len(Xi)]
        distances_clusters = np.linalg.norm(diffs, axis=(1, 2))
        average_distance = np.mean(distances_clusters)
        delta_k = 1. - distances_clusters / average_distance
        s_k = 1. / (1. + np.exp(-self.lamb * delta_k))
        return s_k / s_k.sum()

    def _expected_costs(self, Xi):
        r"""Compute expected future costs from an incoming time series `Xi`.

        This cost is computed, for a time horizon :math:`\tau`, as:

        .. math::

            \sum_k P(c_k | Xi) \sum_y P(y | c_k) \sum_\hat{y} P_{t+\tau}(\hat{y} | y, c_k) \delta_{y \neq \hat{y}}

        where:

        * :math:`P(c_k | Xi)` is obtained through a call to
          `get_cluster_probas`
        * :math:`P(y | c_k)` is stored in `pyck_`
        * :math:`P_{t+\tau}(\hat{y} | y, c_k) \delta_{y \neq \hat{y}}` is
          stored in `pyhatyck_`

        The time cost `_cost_time` of each candidate time stamp is added to
        the result.

        Parameters
        ----------
        Xi: numpy array, shape (t, d)
            A time series observed up to time t

        Returns
        --------
        cost : numpy array of shape (n_timestamps - t + 1, )
            Expected future costs for all time stamps from t to n_timestamps,
            where n_timestamps is the length of the training series
            (`self._X_fit_dims[1]`)

        Examples
        --------
        >>> from tslearn.utils import to_time_series
        >>> dataset = to_time_series_dataset([[1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [3, 2, 1, 1, 2, 3],
        ...                                   [3, 2, 1, 1, 2, 3]])
        >>> y = [0, 0, 0, 1, 1, 1, 0, 0]
        >>> ts1 = to_time_series([3, 2])
        >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=10000.,
        ...                                  cost_time_parameter=1.,
        ...                                  random_state=0)
        >>> costs = model.fit(dataset, y)._expected_costs(ts1)
        >>> costs.shape
        (5,)
        >>> costs  # doctest: +ELLIPSIS
        array([2., 3., 4., 5., 6.])
        """
        proba_clusters = self.get_cluster_probas(Xi=Xi)
        truncated_t = Xi.shape[0]
        # pyhatyck_ is indexed by: t, k, y, yhat
        sum_pyhatyck = np.sum(
            self.pyhatyck_[truncated_t - self.min_t:],
            axis=-1
        )
        sum_pyhatyck = np.transpose(sum_pyhatyck, axes=(0, 2, 1))
        # sum_pyhatyck is now indexed by: t, y, k
        sum_global = np.sum(sum_pyhatyck * self.pyck_[np.newaxis, :], axis=1)
        cost = np.dot(sum_global, proba_clusters)
        return cost + self._cost_time(np.arange(truncated_t,
                                                self._X_fit_dims[1] + 1))

    def _get_prediction_time(self, Xi):
        """Compute optimal prediction time for the incoming time series Xi.

        Time stamps are scanned from `min_t` on; the first one at which the
        minimum expected cost is attained immediately (horizon 0) is
        returned, or the full training length if that never happens.
        """
        time_prediction = None
        for t in range(self.min_t, self._X_fit_dims[1] + 1):
            tau_star = np.argmin(self._expected_costs(Xi=Xi[:t]))
            if (t == self._X_fit_dims[1]) or (tau_star == 0):
                time_prediction = t
                break
        return time_prediction

    def _predict_single_series(self, Xi):
        """
        This function classifies a single time series Xi

        Parameters
        ----------
        Xi: vector
            a time series that is probably incomplete but that nonetheless we
            want to classify

        Returns
        -------
        pred
            the class which is predicted
        int
            the time of the prediction
        """
        t = self._get_prediction_time(Xi)
        pred = self.classifiers_[t].predict([Xi[:t]])[0]
        return pred, t

    def _predict_single_series_proba(self, Xi):
        """
        This function computes class probabilities for a single time series
        Xi

        Parameters
        ----------
        Xi: vector
            a time series that is probably incomplete but that nonetheless we
            want to classify

        Returns
        -------
        array
            the class probabilities returned by the classifier trained for
            the prediction time
        int
            the time of the prediction
        """
        t = self._get_prediction_time(Xi)
        pred = self.classifiers_[t].predict_proba([Xi[:t]])[0]
        return pred, t

    def predict_class_and_earliness(self, X):
        """
        Provide predicted class as well as prediction timestamps.

        Prediction timestamps are timestamps at which a prediction is made in
        early classification setting.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        Returns
        -------
        array, shape (n_samples,)
            Predicted classes.
        array-like of shape (n_series, )
            Prediction timestamps.
        """

        X = check_array(X, allow_nd=True)
        check_is_fitted(self, '_X_fit_dims')
        X = check_dims(X, X_fit_dims=self._X_fit_dims,
                       check_n_features_only=True)
        y_pred = []
        time_prediction = []
        for i in range(0, X.shape[0]):
            cl, t = self._predict_single_series(X[i])
            y_pred.append(cl)
            time_prediction.append(t)
        return np.array(y_pred), np.array(time_prediction)

    def predict(self, X):
        """
        Provide predicted class.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        Returns
        -------
        array, shape (n_samples,)
            Predicted classes.
        """
        return self.predict_class_and_earliness(X)[0]

    def predict_proba_and_earliness(self, X):
        """
        Provide probability estimates as well as prediction timestamps.

        Prediction timestamps are timestamps at which a prediction is made in
        early classification setting.
        The returned estimates for all classes are ordered by the label of
        classes.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        Returns
        -------
        array-like of shape (n_series, n_classes)
            Probability of the sample for each class in the model,
            where classes are ordered as they are in ``self.classes_``.
        array-like of shape (n_series, )
            Prediction timestamps.
        """

        X = check_array(X, allow_nd=True)
        check_is_fitted(self, '_X_fit_dims')
        X = check_dims(X, X_fit_dims=self._X_fit_dims,
                       check_n_features_only=True)
        y_pred = []
        time_prediction = []
        for i in range(0, X.shape[0]):
            probas, t = self._predict_single_series_proba(X[i])
            y_pred.append(probas)
            time_prediction.append(t)
        return np.array(y_pred), np.array(time_prediction)

    def predict_proba(self, X):
        """
        Probability estimates.

        The returned estimates for all classes are ordered by the label of
        classes.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        Returns
        -------
        array-like of shape (n_series, n_classes)
            Probability of the sample for each class in the model,
            where classes are ordered as they are in ``self.classes_``.
        """
        return self.predict_proba_and_earliness(X)[0]

    def _cost_time(self, t):
        """Cost of waiting until time `t`: linear in `t`."""
        return t * self.cost_time_parameter

    def early_classification_cost(self, X, y):
        r"""
        Compute early classification score.

        The score is computed as:

        .. math::

            1 - acc + \alpha \frac{1}{n} \sum_i t_i

        where :math:`\alpha` is the trade-off parameter
        (`self.cost_time_parameter`) and :math:`t_i` are prediction timestamps.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.
        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True labels for X.

        Returns
        -------
        float
            Early classification cost (a positive number, the lower the better)

        Examples
        --------
        >>> dataset = to_time_series_dataset([[1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [3, 2, 1, 1, 2, 3],
        ...                                   [3, 2, 1, 1, 2, 3]])
        >>> y = [0, 0, 0, 1, 1, 1, 0, 0]
        >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=1000.,
        ...                                  cost_time_parameter=.1,
        ...                                  random_state=0)
        >>> model.fit(dataset, y)  # doctest: +ELLIPSIS
        NonMyopicEarlyClassifier(...)
        >>> preds, pred_times = model.predict_class_and_earliness(dataset)
        >>> preds
        array([0, 0, 0, 1, 1, 1, 0, 0])
        >>> pred_times
        array([4, 4, 4, 4, 4, 4, 1, 1])
        >>> model.early_classification_cost(dataset, y)
        0.325
        """
        y_pred, pred_times = self.predict_class_and_earliness(X)
        acc = accuracy_score(y, y_pred)
        return (1. - acc) + np.mean(self._cost_time(pred_times))

    def _more_tags(self):
        # Because some of the data validation checks rely on datasets that are
        # too small to pass here (only 1 item in one of the clusters, hence no
        # stratified split possible)
        return {"no_validation": True}
#
# for yi in range(2):
#     plt.subplot(3, 3, 4 + yi)
#     for xx in X_train[y_pred == yi]:
#         plt.plot(xx.ravel(), "k-", alpha=.2)
#     plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
#     plt.xlim(0, sz)
#     plt.ylim(-4, 4)
#     if yi == 1:
#         plt.title("DBA $k$-means")

# Soft-DTW-k-means
print("Soft-DTW k-means")
# Single source of truth for the cluster count: the plotting loop below must
# not iterate past the number of fitted centers (it previously ran over 3
# clusters while only 2 were fitted, which raises IndexError on
# cluster_centers_[2]).
n_sdtw_clusters = 2
# NOTE(review): recent tslearn releases appear to name this metric parameter
# "gamma" rather than "gamma_sdtw" -- confirm against the installed version.
sdtw_km = TimeSeriesKMeans(n_clusters=n_sdtw_clusters,
                           metric="softdtw",
                           metric_params={"gamma_sdtw": .01},
                           verbose=True,
                           random_state=seed)
y_pred = sdtw_km.fit_predict(xie_1)

# Third row of the 3x3 grid: one panel per Soft-DTW cluster, member series
# in translucent black and the cluster center in red.
for yi in range(n_sdtw_clusters):
    plt.subplot(3, 3, 7 + yi)
    # NOTE(review): labels come from fitting on `xie_1` but the mask is
    # applied to `X_train`; this only works if both hold the same series in
    # the same order -- confirm which dataset is meant to be plotted.
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(sdtw_km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    if yi == 1:
        plt.title("Soft-DTW $k$-means")

# Top-right panel: every series of X_train overlaid, in green.
plt.subplot(3, 3, 3)
for xx in X_train:
    plt.plot(xx.ravel(), "g-", alpha=.2)
def _sample_windows(signals, states, window_size, augment, series_length):
    """Cut random windows out of each recording and label them.

    ``augment`` window centres are drawn per recording, uniformly in
    ``[window_size, series_length - window_size)``.  ``signals`` is indexed
    as [recording, feature, time] and ``states`` as [recording, time].

    Returns
    -------
    (x_window, y_window)
        Windows of shape (n_windows, n_features, 2 * (window_size // 2)) and,
        for each window, the rounded mean of its states.
    """
    half = window_size // 2
    centers = np.random.randint(window_size, series_length - window_size,
                                len(signals) * augment)
    x_window = np.array([
        signals[i // augment, :, c - half:c + half]
        for i, c in enumerate(centers)
    ])
    y_window = np.round(
        np.mean(
            np.array([
                states[i // augment, c - half:c + half]
                for i, c in enumerate(centers)
            ]), -1))
    return x_window, y_window


def main(args):
    """Evaluate DTW-based kNN classification and k-means clustering.

    Loads the pickled train/test signals and state labels of the dataset
    selected by ``args.data``, samples random windows from them, and runs
    three shuffled repetitions of:

    * a DTW k-nearest-neighbour classifier (``args.K`` neighbours, majority
      vote over the neighbours' labels) scored with accuracy, AUC and AUPRC;
    * DTW k-means on the evaluation windows scored with the silhouette and
      Davies-Bouldin indices.

    Mean and standard deviation of every score are printed.

    Parameters
    ----------
    args : namespace
        Must provide ``data`` (one of ``'simulation'``, ``'wf'``, ``'har'``)
        and ``K`` (number of neighbours).

    Raises
    ------
    ValueError
        If ``args.data`` is not one of the supported dataset names.
    """
    # Per-dataset settings: window_size, data directory, number of states
    # (also used as number of clusters), windows sampled per recording.
    dataset_settings = {
        'simulation': (50, './data/simulated_data/', 4, 5),
        'wf': (2500, './data/waveform_data/processed', 4, 500),
        'har': (5, './data/HAR_data/', 6, 100),
    }
    if args.data not in dataset_settings:
        raise ValueError(
            "Unknown dataset %r, expected one of %s"
            % (args.data, sorted(dataset_settings)))
    window_size, path, n_cluster, augment = dataset_settings[args.data]

    # NOTE: pickle.load executes arbitrary code from the file; these files
    # must come from a trusted preprocessing step.
    with open(os.path.join(path, 'x_train.pkl'), 'rb') as f:
        x = pickle.load(f)
    with open(os.path.join(path, 'state_train.pkl'), 'rb') as f:
        y = pickle.load(f)
    with open(os.path.join(path, 'x_test.pkl'), 'rb') as f:
        x_test = pickle.load(f)
    with open(os.path.join(path, 'state_test.pkl'), 'rb') as f:
        y_test = pickle.load(f)

    T = x.shape[-1]
    x_window, y_window = _sample_windows(x, y, window_size, augment, T)
    if args.data == 'wf':
        # Keep every window of the minority states 1 and 2, plus 200 windows
        # drawn at random (with replacement) from all windows.
        minority_index = np.logical_or(y_window == 1, y_window == 2)
        rand_index = np.random.randint(0, len(y_window), 200)
        y_window = np.concatenate(
            [y_window[minority_index], y_window[rand_index]], 0)
        x_window = np.concatenate(
            [x_window[minority_index], x_window[rand_index]], 0)
        x_window = x_window.transpose((0, 2, 1))  # shape:[n_samples, t_len, d]
        x_window = x_window[:, ::2, :]  # Decimate measurements for efficiency
    else:
        x_window = x_window.transpose((0, 2, 1))  # shape:[n_samples, t_len, d]

    # The test windows are sampled with the training series length T.
    x_test, y_test = _sample_windows(x_test, y_test, window_size, augment, T)
    x_test = x_test.transpose((0, 2, 1))  # shape:[n_samples, t_len, d]

    accuracy, s_score, db_score, auc, auprc = [], [], [], [], []
    for cv in range(3):
        shuffled_inds = list(range(len(x_window)))
        random.shuffle(shuffled_inds)
        x_window = x_window[shuffled_inds]
        y_window = y_window[shuffled_inds]
        if args.data == 'wf':
            # For 'wf' the evaluation set is a 30% hold-out of the sampled
            # training windows rather than the separate test windows.
            n_train = int(0.7 * len(x_window))
            x_train = x_window[:n_train]
            y_train = y_window[:n_train]
            x_test = x_window[n_train:]
            y_test = y_window[n_train:]
        else:
            x_train = x_window
            y_train = y_window

        knn = KNeighborsTimeSeries(n_neighbors=args.K,
                                   metric='dtw').fit(x_train)
        kmeans = TimeSeriesKMeans(n_clusters=n_cluster, metric='dtw')
        cluster_labels = kmeans.fit_predict(x_test)

        dist, ind = knn.kneighbors(x_test, return_distance=True)

        # Majority vote over the neighbours' LABELS.  Counting the neighbour
        # indices themselves (each occurs once) would always select the
        # lowest index instead of the most frequent class.
        train_labels = y_train.astype(int)
        predictions = np.array(
            [np.bincount(train_labels[neighbors]).argmax()
             for neighbors in ind])

        y_onehot = np.zeros((len(y_test), n_cluster))
        y_onehot[np.arange(len(y_onehot)), y_test.astype(int)] = 1
        prediction_onehot = np.zeros((len(y_test), n_cluster))
        prediction_onehot[np.arange(len(prediction_onehot)),
                          predictions.astype(int)] = 1

        accuracy.append(accuracy_score(y_test, predictions))
        auc.append(roc_auc_score(y_onehot, prediction_onehot))
        auprc.append(average_precision_score(y_onehot, prediction_onehot))

        # Clustering indices are computed on the flattened windows.
        flat_test = x_test.reshape((len(x_test), -1))
        s_score.append(silhouette_score(flat_test, cluster_labels))
        db_score.append(davies_bouldin_score(flat_test, cluster_labels))

    print('\nSummary performance:')
    print('Accuracy: ', np.mean(accuracy) * 100, '+-', np.std(accuracy) * 100)
    print('AUC: ', np.mean(auc), '+-', np.std(auc))
    print('AUPRC: ', np.mean(auprc), '+-', np.std(auprc))
    print('Silhouette score: ', np.mean(s_score), '+-', np.std(s_score))
    print('Davies Bouldin score: ', np.mean(db_score), '+-', np.std(db_score))
def dtw_clustering(data, n_clusters=5, max_iter=1000, random_state=None):
    """Cluster time series with DTW k-means and return the cluster labels.

    Parameters
    ----------
    data : array-like
        Time series dataset accepted by ``TimeSeriesKMeans.fit_predict``.
    n_clusters : int, default 5
        Number of clusters to form.
    max_iter : int, default 1000
        Maximum number of k-means iterations.
    random_state : int or None, default None
        Seed for the centroid initialisation; ``None`` keeps the run
        unseeded.

    Returns
    -------
    array
        Cluster index assigned to each series of ``data``.
    """
    # TODO find n_clusters using elbow method
    model = TimeSeriesKMeans(n_clusters=n_clusters,
                             metric="dtw",
                             max_iter=max_iter,
                             random_state=random_state)
    return model.fit_predict(data)
def kmeans(data, clusters, year_of_interest, njobs):
    """Cluster the columns of ``data`` with DBA (DTW) k-means.

    ``data`` is transposed so that each column becomes one series, NaNs are
    replaced with zeros, and the series are clustered with
    ``TimeSeriesKMeans(metric="dtw")``. A figure with one panel per cluster
    is drawn and then closed again without being shown or saved.

    Args:
        data: 2-D array-like; each column is treated as one time series.
        clusters: Number of clusters to form.
        year_of_interest: Only used in the plot title.
        njobs: Forwarded to ``TimeSeriesKMeans`` as ``n_jobs``.

    Returns:
        A ``(cluster_dict, cluster_centers)`` tuple. ``cluster_dict`` maps
        ``'cluster_<k>'`` (1-based) to a dict of that cluster's series, keyed
        by the last value of each series. ``cluster_centers`` maps the same
        keys to the fitted cluster centre.

    Note:
        The global NumPy random state is re-seeded and *all* open matplotlib
        figures are closed as a side effect.
    """
    seed = 5
    np.random.seed(seed)

    # One row per series; NaNs would break the DTW computation.
    X_train = np.nan_to_num(data.T)
    sz = X_train.shape[1]

    # DBA-k-means
    print("DBA k-means")
    dba_km = TimeSeriesKMeans(n_clusters=clusters,
                              n_init=3,
                              metric="dtw",
                              verbose=True,
                              max_iter_barycenter=10,
                              random_state=seed,
                              n_jobs=njobs)
    y_pred = dba_km.fit_predict(X_train)

    # The grid used to be fixed at 10x5, which makes plt.subplot raise for
    # more than 50 clusters. Keep that layout as the minimum and add rows
    # only when they are needed.
    n_cols = 5
    n_rows = max(10, -(-clusters // n_cols))  # ceil(clusters / n_cols)

    cluster_dict = {}
    cluster_centers = {}
    plt.figure(figsize=(10, 10))
    for yi in range(clusters):
        plt.subplot(n_rows, n_cols, yi + 1)

        # Keyed by the last value of each series -- presumably an identifier
        # carried in the final position; series sharing that value overwrite
        # each other. TODO confirm against the caller.
        time_series = {}
        for xx in X_train[y_pred == yi]:
            time_series[xx[-1]] = xx
            plt.plot(xx.ravel(), "darkblue", alpha=.2)

        center = dba_km.cluster_centers_[yi]
        cluster_dict[f'cluster_{yi+1}'] = time_series
        cluster_centers[f'cluster_{yi+1}'] = center

        plt.plot(center.ravel(), "r-")
        plt.xlim(0, sz)
        plt.ylim(0, 1)
        plt.text(0.55, 0.85, 'Cluster %d' % (yi + 1),
                 transform=plt.gca().transAxes)
        if yi == 1:
            plt.title(f"DBA $k$-means {year_of_interest}")

    plt.tight_layout()
    # Insert plt.show() here to inspect the figure interactively.
    plt.clf()
    plt.close('all')
    return cluster_dict, cluster_centers
class Kmean(cs):
    """Partition the data with the k-means algorithm.

    Parameters:
        * ss : SeriesSupp
            instance of the time-series manager

    Variables:
        * seed: int
            Initialisation value for the algorithm's random state.
        * counter: Counter
            Distribution of the objects among the clusters.
        * km: TimeSeriesKMeans
            Instance of the algorithm.
        * clust_name: String
            Name of the algorithm (used when displaying plots).
        * metric: String
            Metric in use, mainly softdtw here because it is very
            efficient and fast.
    """

    def __init__(self, ss):
        super().__init__(ss)
        self.clust_name = "Kmean"
        self.metric = "softdtw"
        self.km = None
        self.counter = None
        self.seed = 0
        np.random.seed(self.seed)

    def k_init(self, v = True):
        """
        Initialise the algorithm instance with the current parameters.

        Parameters:
            * v: boolean
                Verbose; display information about the partitioning.

        Returns:
            NA
        """
        options = {
            "n_clusters": self.n,
            "metric": self.metric,
            "metric_params": {"gamma_sdtw": .01},
            "verbose": v,
            "random_state": self.seed,
        }
        self.km = TimeSeriesKMeans(**options)

    def k_fit(self):
        """
        Run the partitioning and store the labels in ``ts_clust``.

        Parameters:
            NA

        Returns:
            NA
        """
        labels = self.km.fit_predict(self.ts)
        self.ts_clust = labels

    def cluster_counter(self):
        """
        Count the objects within each cluster.

        Parameters:
            NA

        Returns:
            NA
        """
        self.counter = Counter(self.ts_clust)
distance = dtw.distance(time_series_scaled[i], time_series_scaled[j], psi=1) distance_matrix[i, j] = distance # now, let's call our clustering algorithms, with a pre-computed distance matrix if False: # commented out, at the moment plt.figure(figsize=(10, 7)) plt.title("Dendrograms") dend = shc.dendrogram( shc.linkage( time_series_scaled, method='ward', metric=lambda u, v: distance_matrix(time_series_scaled.index(u), time_series_scaled.index(v)))) plt.axhline(y=cutoff_distance, color='r', linestyle='--') plt.savefig(os.path.join(folder_name, "hierarchical-dendogram.pdf")) print("Performing agglomerative clustering...") n_clusters = 10 clusterer = TimeSeriesKMeans(n_clusters=n_clusters, metric='dtw', random_state=0) clusterer.fit_predict(time_series_scaled) print("Clusters:") for c in range(0, clusterer.labels_.max() + 1): print("- Cluster #%d has %d time series" % (c, sum(clusterer.labels_ == c)))
print(f"\tLabels Calculated, Elapsed: {time.time() - t}") #sc = silhouette_score(X, labels_bis, metric="softdtw") import pickle pickle.dump(distortions, open("output/ts_distortions.pkl", "wb")) plt.plot(range(2, 13), distortions, 'bx-') plt.xlabel('k') plt.ylabel('Distortion') plt.title('The Elbow Method showing the optimal k') plt.show() """ # Setting the algorithm as the cluster with the elbow in the curve (from the above loop, it was 3) km_bis = TimeSeriesKMeans(n_clusters=3, metric="softdtw") labels_bis = km_bis.fit_predict(X) # Plot the clusters centroids = km_bis.cluster_centers_ color_palette = {"0": "#01AFB8", "1": "#196E9F", "2": "#D3D3D3"} for i in range(0,3): col = color_palette[str(i)] plt.scatter(X[labels_bis == i , 0] , X[labels_bis == i , 1] , label = i, color = col) plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'black') plt.legend() plt.annotate("The black dots indicate the cluster's centroid", xy = (0,0)) plt.show() # Append the clusters to the severity dataframe to_merge = pd.DataFrame({"FIPS":fips, "label":labels_bis}) mh_historical = pd.merge(mh_historical, to_merge, on= "FIPS", how = "left")
def clusters():
    """
    Display the cluster plots for the researcher.

    This webpage is only for the role researcher.

    :return: Rendered ``researcher/clusters.html`` page
    """
    check_access_right(forbidden='user', redirect_url='control.index')

    query = request.args
    currentVideo, vid_dict, n_clusters = get_video_information(
        query.get('vid'), query.get('cluster'))
    found, data = collect_mongodbobjects(currentVideo[0])

    # Cluster counts offered in the page's selector.
    clustervals = np.arange(1, 10, 1)

    if found == False or data.empty:
        return render_template("researcher/clusters.html",
                               the_div="There is no data for this video!",
                               the_script="",
                               vid_dict=vid_dict,
                               currentVideo=currentVideo,
                               currentCluster=n_clusters,
                               clustervals=clustervals,
                               currentVariable='-',
                               variable_list=[])

    data, currentVariable, variable_list = extract_variable(
        data, query.get('variable'))
    interpolators, max_t = get_interpolators(data, currentVariable)
    xs = np.arange(0, int(max_t) + 1.5, 1)

    # One resampled series per user, each wrapped in a single-element list.
    user_timeseries = [[interpolate(xs)] for interpolate in interpolators]

    seed = np.random.randint(0, int(1e5), 1)[0]
    np.random.seed(seed)

    # Never ask for more clusters than there are series.
    series_count = np.array(user_timeseries).shape[0]
    n_clusters = min(n_clusters, series_count)

    # Euclidean k-means
    km = TimeSeriesKMeans(n_clusters=n_clusters,
                          verbose=True,
                          random_state=seed)
    y_pred = km.fit_predict(user_timeseries)

    # TODO MAYBE: intra-cluster correlation (ICC) with rpy2. Might not work
    # with matrices.

    # One figure per cluster: thin lines for members, thick line for centre.
    plots = []
    for cluster_idx in range(n_clusters):
        fig = figure()
        center = km.cluster_centers_[cluster_idx].ravel()
        center_mean = np.mean(center)

        member_count = 0
        spread = 0
        for series_idx in range(len(y_pred)):
            if y_pred[series_idx] != cluster_idx:
                continue
            member_count = member_count + 1
            series = user_timeseries[series_idx][0]
            series_len = len(series)
            for pos in range(series_len):
                spread = spread + eucl(series[pos], center[pos]) / series_len
            fig.line(range(0, series_len), series, line_width=0.3)
        spread = np.sqrt(spread)

        title = Title()
        title.text = "".join([
            "C#", str(cluster_idx + 1),
            ", n: ", str(member_count),
            ", μ: ", str(np.round(center_mean, decimals=3)),
            ", σ: ", str(np.round(spread, decimals=3)),
            ", σ²: ", str(np.round(spread**2, decimals=3)),
        ])
        fig.title = title
        fig.line(range(0, len(center)), center, line_width=2)
        plots.append(fig)

    # Get plot codes
    grid = gridplot(plots, ncols=3, plot_width=350, plot_height=300)
    script, div = components(grid)

    return render_template("researcher/clusters.html",
                           the_div=div,
                           the_script=script,
                           vid_dict=vid_dict,
                           currentVideo=currentVideo,
                           currentCluster=n_clusters,
                           clustervals=clustervals,
                           variable_list=variable_list,
                           currentVariable=currentVariable)
# making an DataFrame to store words in column names and dates in indexes. tfidf_monthly_dataframe = pd.DataFrame( tfidf_avg_monthly.toarray(), columns=vocabulary["word"], index=pd.to_datetime({ "year": months_grouped.year, "month": months_grouped.month, "day": 1 }), ) # time series - each row(word) is one time serie # each time series is an array of 72 months. time_series = to_time_series(tfidf_monthly_dataframe.values.transpose()) N_clusters = 7 model = TimeSeriesKMeans(N_clusters) vocabulary["cluster"] = model.fit_predict(time_series) # mapping cluster numbers to colors colors = pd.DataFrame(pl.cm.jet(np.linspace(0, 1, N_clusters))) vocabulary.sort_values(["cluster", "relevance"], inplace=True, ascending=False) # getting the most relevant words for each topic topics = (vocabulary[["cluster", "word"]].groupby("cluster").agg({ "word": lambda words: ", ".join(words[:15]), })).reset_index().rename({'Index': 'cluster'}) clusters_centers = pd.DataFrame( model.cluster_centers_.reshape((N_clusters, -1)).transpose(), columns=topics["word"], index=pd.to_datetime({ "year": months_grouped.year,
else: pngfile = "ts8plot_fgrp" + str( args.fgrp) + "_" + args.start_month + "_" + str(args.months) + "m.png" if os.path.exists(pngfile): os.remove(pngfile) fig.savefig(pngfile, dpi=300) print("Saved", pngfile) if args.info: plt.show(block=True) sys.exit(1) # ---- km = TimeSeriesKMeans(n_clusters=num, verbose=True, random_state=seed) y_pred = km.fit_predict(X) print(y_pred) plt.figure(figsize=(8, 2 * num)) for yi in range(num): plt.subplot(num, 1, yi + 1) for xx in X[y_pred == yi]: plt.plot(xx.ravel(), "k-", alpha=.2) plt.plot(km.cluster_centers_[yi].ravel(), "r-") #plt.xlim(0, sz) #plt.ylim(0, 4) if yi == 0: plt.title("Euclidean $k$-means") plt.tight_layout() # ----
# Stack the per-series arrays along a new last axis ...
newarray = np.dstack(data)
print(newarray.shape)
# ... then roll that axis to the front so the first axis indexes the series.
newarray = np.rollaxis(newarray, -1)
print(newarray.shape)

seed = 0
# Keep only the first 280 time series and rescale them with
# TimeSeriesScalerMeanVariance.
X_train = TimeSeriesScalerMeanVariance().fit_transform(newarray[:280])
# Optional: make the time series shorter.
# X_train = TimeSeriesResampler(sz=40).fit_transform(X_train)
sz = X_train.shape[1]

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=4, verbose=True, random_state=seed)
y_pred = km.fit_predict(X_train)

plt.figure()
for yi in range(4):
    # Select the panel once per cluster, *before* plotting its members.
    # Doing it inside the member loop meant that an empty cluster never
    # switched panels, so its centre was drawn on the previous cluster's axes.
    plt.subplot(2, 2, yi + 1)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    if yi == 1:
        plt.title("Euclidean $k$-means")
# Candidate cluster counts for the DTW-KMeans sweep.
K_range = np.arange(args.Kmin, args.Kmax, args.stepsize)

# Three performance measures are tracked per candidate K.
Sum_of_squared_distances, ch_indexs, silhouette_scores = [], [], []

for n_clusters in K_range:
    # soft-DTW k-means; the fixed random_state of 10 keeps runs reproducible.
    clusterer = TimeSeriesKMeans(
        n_clusters=n_clusters,
        metric="softdtw",
        metric_params={"gamma": .01},
        verbose=False,
        random_state=10,
    )
    cluster_labels = clusterer.fit_predict(gene_expression_matrix)
    centers_shape = clusterer.cluster_centers_.shape
    print('The Shape of Cluster Centers are {}'.format(centers_shape))

    # Elbow method: record the inertia reached for this K.
    inertia = clusterer.inertia_
    Sum_of_squared_distances.append(inertia)
    print("For n_clusters =", n_clusters,
          "The sum of squared distance is :", inertia)

    # Calinski-Harabasz score: relates the within-cluster dispersion to the
    # between-cluster dispersion.
    ch_score = calinski_harabasz_score(gene_expression_matrix, cluster_labels)
    ch_indexs.append(ch_score)
    print("For n_clusters =", n_clusters,
          "The calinski_harabasz_score is :", ch_score)
# Streamlit page: cluster the time series of one location with DTW k-means
# and draw one plot per cluster. Statement order matters here because
# Streamlit renders widgets and plots in the order they are called.

# Turns off the 'deprecation.showPyplotGlobalUse' option; st.pyplot() is
# called without an explicit figure further down.
st.set_option('deprecation.showPyplotGlobalUse', False)

# The job file holds a pair; only ts_data is used in this section.
data, ts_data, = joblib.load("data/data.job")

# Sidebar controls: which location to show and how many clusters to fit.
location = st.sidebar.selectbox('Location:', ts_data['Location'].unique())
clusters = st.sidebar.slider('Clusters:', 2, 6)

# Convert the selected location's series into a tslearn dataset.
ts = to_time_series_dataset(
    ts_data[ts_data.Location == location].TimeSeries.values)

st.subheader(f"Location: {location}, Devices: {len(ts)}, Clusters: {clusters}")
st.text("")

km = TimeSeriesKMeans(n_clusters=clusters, metric="dtw", n_jobs=7)
labels = km.fit_predict(ts)

# Attach the fitted label to each row of the selected location.
df = ts_data[ts_data.Location == location].copy()
df['Cluster'] = labels.T

for cluster in np.sort(np.unique(labels)):
    cdf = df[df.Cluster == cluster]

    # Faint line per member series, indexed by position ("Hour").
    for k, s in cdf.TimeSeries.items():
        s = pd.DataFrame(s, columns=['CPU Idle'])
        s['Hour'] = s.index
        sns.lineplot(data=s, x="Hour", y="CPU Idle", alpha=0.1)

    # Cluster centre drawn on top of its members.
    sns.lineplot(data=km.cluster_centers_[cluster], legend=False)
    plt.title(f"Cluster: {cluster}", size=12)
    st.pyplot()
    # st.dataframe(cdf[['Device', 'Day']].reset_index(drop=True), width=500)
# NOTE(security): pickle can execute arbitrary code while loading; only read
# label files that come from a trusted source.
# The file is opened in a `with` block so the handle is closed right after
# loading instead of being left to the garbage collector.
with open(data_path + "training_labels.pck", "rb") as labels_file:
    ytrain = pickle.load(labels_file)

# Variance comparison: mean/variance scaling of the first 500 series.
# For a shapes comparison use
# TimeSeriesScalerMinMax().fit_transform(xtrain[:260]) instead.
x_train = TimeSeriesScalerMeanVariance().fit_transform(xtrain[:500])
x_train = TimeSeriesResampler(sz=500).fit_transform(x_train)
sz = x_train.shape[1]

print("DBA k-means")
dba_km = TimeSeriesKMeans(n_clusters=10,
                          n_init=1,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)
y_pred = dba_km.fit_predict(x_train)

# One stacked panel per cluster: members in faint black, centre in red.
plt.figure()
for yi in range(10):
    plt.subplot(10, 1, yi + 1)
    for xx in x_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.text(0.55, 0.85, 'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("DBA $k$-means")
plt.show()
def _load_user_series(data_dir, user_id):
    """Return the (f_1, f_2, f_3) rows of one user's CSV where f_1 and f_2 are present."""
    frame = pd.read_csv(data_dir + str(user_id) + '.csv')
    valid = frame[frame['f_1'].notnull() & frame['f_2'].notnull()]
    return valid[['f_1', 'f_2', 'f_3']].values


def _run_kmeans(data_dir, n_samples=500):
    """Cluster the first ``n_samples`` users with DTW k-means, save and plot the labels."""
    print('Working on K-means clustering')

    # Only take the first `n_samples` unique IDs.
    ts_dataset = [list(_load_user_series(data_dir, i))
                  for i in range(n_samples)]

    # Preparing time-series dataset
    formatted_dataset = to_time_series_dataset(ts_dataset)

    # Silhouette score for each candidate number of clusters.
    silhouette_scores = []
    n_clusters = [2, 3, 4, 5, 6]
    for cluster in n_clusters:
        km = TimeSeriesKMeans(n_clusters=cluster,
                              metric="dtw",
                              verbose=True,
                              max_iter=5)
        y_pred = km.fit_predict(formatted_dataset)
        silhouette_scores.append(
            silhouette_score(formatted_dataset, y_pred, metric="dtw"))
    sns.lineplot(x=n_clusters, y=silhouette_scores, sort=False)

    # Optimal clusters
    km = TimeSeriesKMeans(n_clusters=2,
                          metric="dtw",
                          verbose=True,
                          max_iter=5)
    y_pred = km.fit_predict(formatted_dataset)
    df = pd.DataFrame(data=y_pred, columns=['Cluster No.'])
    df.to_csv('./kmeans_clustering.csv', index=False)

    # Visualise clusters
    sz = formatted_dataset.shape[1]
    plt.figure(figsize=(20, 20))
    for yi in range(2):
        plt.subplot(3, 3, 2 + yi)
        for xx in formatted_dataset[y_pred == yi]:
            plt.plot(xx.ravel(), "k-", alpha=.2)
        plt.plot(km.cluster_centers_[yi].ravel(), "r-")
        plt.xlim(0, sz)
        plt.ylim(-500000, 500000)
        plt.text(0.55, 0.85, 'Cluster %d' % (yi + 1),
                 transform=plt.gca().transAxes)
        if yi == 1:
            plt.title("DTW $k$-means")
    plt.tight_layout()
    plt.show()


def _build_distance_matrix(data_dir, n_samples):
    """Compute the pairwise fastdtw distances and write them to distance_matrix.csv."""
    # Read every CSV once up front; the files used to be re-read inside the
    # n_samples x n_samples loop.
    sequences = [_load_user_series(data_dir, i) for i in range(n_samples)]
    distance_matrix = np.zeros(shape=(n_samples, n_samples))
    for i in range(n_samples):
        for j in range(n_samples):
            if i == j:
                continue  # the diagonal stays at zero
            distance, _path = fastdtw(sequences[i], sequences[j],
                                      dist=euclidean)
            distance_matrix[i, j] = distance
    savetxt('distance_matrix.csv', distance_matrix, delimiter=',')


def _run_hierarchical(data_dir, n_samples=500):
    """Cluster users hierarchically from a DTW distance matrix, save and plot the result."""
    print('Working on Hierarchical clustering')

    # When True, distance_matrix.csv is expected to exist already and the
    # (slow) pairwise DTW computation is skipped.
    manual_dist_matrix = True
    if not manual_dist_matrix:
        _build_distance_matrix(data_dir, n_samples)

    distance_matrix = np.genfromtxt('distance_matrix.csv', delimiter=',')
    linkage_matrix = hierarchical_clustering(distance_matrix)

    # Select maximum number of clusters.
    cluster_labels = fcluster(linkage_matrix, 4, criterion='maxclust')
    print(np.unique(cluster_labels))

    # Gather every user's rows, tagged with the ID taken from the file name.
    categorization_df = []
    for file_name in os.listdir(data_dir):
        if not file_name.endswith('.csv'):
            continue  # ignore stray non-CSV files in the data folder
        csv_file = pd.read_csv(os.path.join(data_dir, file_name))
        csv_file['ID'] = file_name[:-4]
        categorization_df.append(csv_file)
    df = pd.concat(categorization_df, axis=0, ignore_index=True)

    # Filter out null values, then keep the same users that were clustered.
    filtered_df = df[df['f_1'].notnull() & df['f_2'].notnull()].copy()
    filtered_df['ID'] = filtered_df['ID'].astype('int')
    df_vis = filtered_df[filtered_df['ID'] <= n_samples - 1].sort_values(
        by='ID').reset_index(drop=True)

    # Column selection on a GroupBy needs a list; the former tuple form
    # (['f_1', 'f_2', 'f_3'] without the inner brackets) is rejected by
    # current pandas.
    df_vis_fil = df_vis.groupby('ID')[['f_1', 'f_2',
                                       'f_3']].mean().reset_index()
    df_vis_fil['Cluster'] = cluster_labels
    df_vis_fil.to_csv('./hier_clustering.csv', index=False)

    # Plotting visualisation: 3D scatterplot of the per-user feature means.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    x = np.array(df_vis_fil['f_1'])
    y = np.array(df_vis_fil['f_2'])
    z = np.array(df_vis_fil['f_3'])
    ax.scatter(x, y, z, marker="s", c=df_vis_fil["Cluster"], cmap="RdBu")
    plt.show()


def main(args):
    """Run the user categorisation selected by ``args.method``.

    Args:
        args: Object with a ``method`` attribute: ``'K'`` runs DTW k-means,
            ``'H'`` runs hierarchical clustering. Any other value only
            prints a usage hint.
    """
    data_dir = './Data/User Categorization/'
    # if / elif / else: previously the final `else` belonged to the 'H' test
    # alone, so a successful 'K' run also printed the usage hint.
    if args.method == 'K':
        _run_kmeans(data_dir)
    elif args.method == 'H':
        _run_hierarchical(data_dir)
    else:
        print('Please input K or H clustering method correctly')
for i in appliance_id_list: sub_df = df[df['appliance_id'] == i].iloc[:, 2:] num = sub_df.shape[0] #if num<=10 ---k=1 #if num>10 ---k=2 if (num <= 10): k = 1 else: k = 2 ts_array = sub_df.values ts_scaled = TimeSeriesScalerMeanVariance().fit_transform(ts_array) km = TimeSeriesKMeans(n_clusters=k, metric="dtw", verbose=True, random_state=0) y_pred = km.fit_predict(ts_scaled) # n=np.argmax(np.bincount(y_pred)) ts_array_all.append(km.cluster_centers_.ravel()) #找到每个设备的聚类中心后,在对聚类中心做一次聚类 ts_array_all = np.array(ts_array_all) ts_scaled = TimeSeriesScalerMeanVariance().fit_transform(ts_array_all) km = TimeSeriesKMeans(n_clusters=6, metric="dtw", verbose=True, random_state=0) y_pred = km.fit_predict(ts_scaled) #画图的部分 sample_size = 10000 #ts_array_all的行数 def get_proportion(y_pred): df_y = pd.DataFrame(y_pred, columns=['y'])
(first_time_series_test, second_time_series_test)) multivariate_time_series_test = to_time_series(multivariate_test) print(multivariate_time_series_test.shape) #clustering from tslearn.clustering import TimeSeriesKMeans, KernelKMeans, silhouette_score #fit the algorithm on train data #tune the hyperparameters possible metrics: euclidean, dtw, softdtw km_dba = TimeSeriesKMeans(n_clusters=2, metric="softdtw", max_iter=5, max_iter_barycenter=5, random_state=0).fit(multivariate_time_series_train) km_dba.cluster_centers_.shape #prediction on train data prediction_train = km_dba.fit_predict(multivariate_time_series_train, y=None) len(prediction_train) #prediction on test data prediction_test = km_dba.fit_predict(multivariate_time_series_test, y=None) len(prediction_test) prediction_test #accuracy of the clustering on the train data silhouette_score(multivariate_time_series_train, prediction_train, metric="softdtw") #accuracy of the clustering on the test data silhouette_score(multivariate_time_series_test, prediction_test, metric="softdtw")
def GetClustersMonthlyAvg(self,
                          sites,
                          variableCode,
                          n_cluster=3,
                          methodCode=None,
                          qualityControlLevelCode=None,
                          timeUTC=False):
    """
    Group the sites into clusters by running DTW k-means on the monthly
    averages of a given variable.

    Args:
        sites: response from the GetSites() function. Performance of the
            function can be improved if the result of the
            GetSitesByVariable() function is passed instead.
        variableCode: string representing the variable code of the time
            series to cluster for the given sites.
        n_cluster: integer representing the number of clusters to form. It
            is capped at the number of sites for which a series was found.
        methodCode: method code for data extraction for the given variable.
        qualityControlLevelCode: The ID of the quality control level.
            Typically 0 is used for raw data and 1 is used for quality
            controlled data. To get a list of possible quality control
            level IDs, see the qualityControlLevelCode column in the output
            of GetSiteInfo(). If qualityControlLevelCode is not specified,
            the observations are not filtered by quality control level.
        timeUTC: Boolean to use the UTC time instead of the time of the
            observation.

    Returns:
        A list with one entry per site that has the variable, of the form
        [monthly averages array, cluster_id]:

        [[[0.141875, 0.1249375, 0.0795, 0.12725, 0.0877, 0.0, 0.09375, 0.1815, 0.15437499999999998, 0.164625, 0.1614, 0.20900000000000002], 1],
        [[0.1, 0.08662500000000001, 0.0414025, 0.048, 0.052, 0.0, 0.1105, 0.015, 0.06625, 0.10587500000000001, 0.0505, 0.046125], 0],
        [[0.2265, 0.27225, 0.17407499999999998, 0.13475, 0.14525, 0.129, 0.17825, 0.210625, 0.103125, 0.0, 0.23675], 2]]

        An empty list is returned when no site provides the variable. If a
        KeyError interrupts the processing (for example a site without a
        'fullSiteCode' key), the entries gathered so far are returned
        without a cluster id.

    Example::

        url_testing = "http://hydroportal.cuahsi.org/para_la_naturaleza/cuahsi_1_1.asmx?WSDL"
        water = WaterMLOperations(url = url_testing)
        sites = water.GetSites()
        firstSiteFullSiteCode = sites[0]['fullSiteCode']
        siteInfo = water.GetSiteInfo(firstSiteFullSiteCode)['siteInfo']
        clusters = water.GetClustersMonthlyAvg(sites,siteInfo[0]['variableCode'])
    """
    timeseries = []
    timeSerie_cluster = []

    # Date fields to read from the site info, depending on timeUTC.
    if timeUTC is True:
        begin_key, end_key = 'beginDateTimeUTC', 'endDateTimeUTC'
    else:
        begin_key, end_key = 'beginDateTime', 'endDateTime'

    try:
        for site in sites:
            site_full_code = site['fullSiteCode']
            try:
                siteInfo = self.GetSiteInfo(site_full_code)['siteInfo']
                for sinfo in siteInfo:
                    if sinfo['variableCode'] != variableCode:
                        continue
                    variableResponse = self.GetValues(
                        site_full_code,
                        sinfo['fullVariableCode'],
                        sinfo[begin_key].split('T')[0],
                        sinfo[end_key].split('T')[0],
                        methodCode=methodCode,
                        qualityControlLevelCode=qualityControlLevelCode)
                    m_avg = self.GetMonthlyAverage(variableResponse)
                    timeseries.append(to_time_series(m_avg))
                    timeSerie_cluster.append([m_avg])
                    # Only the first matching variable of a site is used.
                    break
            except Exception as e:
                # Best effort: a site that cannot be queried is skipped.
                print(e)
                print("the current site does not contain siteInformation")

        # Nothing to cluster: fitting k-means on an empty dataset would fail.
        if not timeseries:
            return timeSerie_cluster

        formatted_time_series = to_time_series_dataset(timeseries)
        # k-means cannot form more clusters than there are series.
        model = TimeSeriesKMeans(n_clusters=min(n_cluster, len(timeseries)),
                                 metric="dtw",
                                 max_iter=10)
        y_pred = model.fit_predict(formatted_time_series)
        for tc, y in zip(timeSerie_cluster, y_pred):
            # Plain int (not a numpy integer), matching the documented output.
            tc.append(int(y))
        return timeSerie_cluster
    except KeyError:
        return timeSerie_cluster