from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from tslearn.preprocessing import TimeSeriesScalerMinMax


def tsclusteringN(ts_data, names):
    """Cluster time series and print the silhouette score for several cluster counts."""
    # Normalise each series to the [0, 1] range
    ts_dataset = TimeSeriesScalerMinMax().fit_transform(ts_data)
    metric = 'dtw'
    for n in range(2, 6):
        print('Number of clusters =', n)
        # With metric 'dtw' or 'softdtw', series of different lengths are allowed
        km = TimeSeriesKMeans(n_clusters=n, metric=metric, verbose=False,
                              random_state=1).fit(ts_dataset)
        # Clustering result
        print('Cluster labels =', km.labels_)
        # Silhouette values lie in [-1, 1]. The best cluster count has a score
        # close to 1 and, in a silhouette plot, the smallest spread in width
        # between clusters. Here we only check the score itself.
        print('Silhouette score =',
              silhouette_score(ts_dataset, km.labels_, metric=metric))
        print()
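# A minimal usage sketch for tsclusteringN; the toy data below (two noisy
# sine/cosine groups) is made up purely for illustration.
import numpy as np
from tslearn.utils import to_time_series_dataset

rng = np.random.default_rng(0)
t = np.linspace(0, 2 * np.pi, 50)
series = ([np.sin(t) + rng.normal(scale=0.1, size=50) for _ in range(10)]
          + [np.cos(t) + rng.normal(scale=0.1, size=50) for _ in range(10)])
ts_data = to_time_series_dataset(series)
tsclusteringN(ts_data, names=['series_{}'.format(i) for i in range(len(series))])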
import matplotlib.pyplot as plt
import seaborn as sns
from tslearn.clustering import TimeSeriesKMeans, silhouette_score


def visualize_n_cluster(train_ts, n_lists=[3, 4, 5, 6], metric='dtw', seed=2021, vis=True):
    if vis:
        fig = plt.figure(figsize=(20, 5))
        plt.title('Distribution of buildings per cluster count', fontsize=15, y=1.2)
        plt.axis('off')
    # Fit on a frozen copy of the features so the cluster columns added in
    # earlier iterations do not leak into later fits.
    features = train_ts.copy()
    for idx, n in enumerate(n_lists):
        ts_kmeans = TimeSeriesKMeans(n_clusters=n, metric=metric, random_state=seed)
        labels = ts_kmeans.fit_predict(features)
        train_ts['cluster(n={})'.format(n)] = labels
        score = round(silhouette_score(features, labels, metric='euclidean'), 3)
        vc = train_ts['cluster(n={})'.format(n)].value_counts()
        if vis:
            ax = fig.add_subplot(1, len(n_lists), idx + 1)
            sns.barplot(x=vc.index, y=vc, palette='Pastel1')
            ax.set(title='n_cluster={0}\nscore:{1}'.format(n, score))
    if vis:
        plt.tight_layout()
        plt.show()
    return train_ts
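# A toy call to visualize_n_cluster, assuming a wide-format frame with one
# row per building and one column per time step (synthetic data, plotting off).
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
toy = pd.DataFrame(rng.random((30, 24)),
                   index=['building_{}'.format(i) for i in range(30)])
clustered = visualize_n_cluster(toy, n_lists=[2, 3], vis=False)
print(clustered.filter(like='cluster').head())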
def elbowmethod(df: pd.DataFrame):
    """
    Find the best number of clusters and plot the silhouette score
    against the number of clusters.

    Args:
        df (pd.DataFrame): Columns are (country (index), year_week, value)

    Returns:
        int: Best number of clusters
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score

    X = df.values[..., np.newaxis]
    scores = []
    K = range(2, df.shape[0])
    for k in K:
        # Build and fit the model
        model = TimeSeriesKMeans(n_clusters=k, metric="softdtw", max_iter=50)
        model.fit(X)
        # Score on the same 3D array the model was fitted on
        scores.append(silhouette_score(X, model.labels_, metric="softdtw"))

    plt.plot(K, scores, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette score per number of clusters')
    plt.show()

    # K starts at 2, so shift the argmax accordingly
    best_num_cluster = int(np.argmax(scores)) + 2
    return best_num_cluster
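# Hypothetical call: ten series of twelve weekly values each (random data,
# for illustration only).
import numpy as np
import pandas as pd

rng = np.random.default_rng(2)
toy = pd.DataFrame(rng.random((10, 12)),
                   index=['country_{}'.format(i) for i in range(10)])
print('Best number of clusters:', elbowmethod(toy))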
def shape_score(self, data, labels, metric='dtw'):
    """
    Compute the silhouette score for a clustering result.

    :param data: the time series dataset that was clustered
    :param labels: the cluster label assigned to each series
    :param metric: the distance metric to use (default: 'dtw')
    :return: the silhouette score
    """
    score = silhouette_score(data, labels, metric=metric)
    return score
import pandas as pd
import tslearn.clustering as clust  # assumed alias for the 'clust' module used below
from tslearn.clustering import silhouette_score


def cluster_time_series(ts_sample, cluster_alg, n_clusters, cluster_metric, score=False):
    # DataFrame to store cluster results
    clust_df = pd.DataFrame(ts_sample.tslist.tolist(),
                            index=ts_sample.tslist.index).reset_index()
    clust_df.columns.values[3:] = ts_sample.sample_dates

    # Fit model
    if cluster_alg == "GAKM":
        km = clust.GlobalAlignmentKernelKMeans(n_clusters=n_clusters)
    elif cluster_alg == "TSKM":
        km = clust.TimeSeriesKMeans(n_clusters=n_clusters, metric=cluster_metric)
    else:
        raise ValueError("cluster_alg must be 'GAKM' or 'TSKM'")

    # Add predicted cluster labels to the results DataFrame
    labels = km.fit_predict(ts_sample.ts_dataset)
    clust_df['cluster'] = labels
    if score:
        s = silhouette_score(ts_sample.ts_dataset, labels)
        return clust_df, s
    return clust_df
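# The ts_sample argument is assumed to expose .tslist (a pandas Series of
# per-location value lists under a three-level index), .sample_dates and
# .ts_dataset. The stand-in below is invented purely to show that contract.
from types import SimpleNamespace
import numpy as np
import pandas as pd
from tslearn.utils import to_time_series_dataset

rng = np.random.default_rng(3)
values = [list(rng.random(5)) for _ in range(6)]
idx = pd.MultiIndex.from_product([[0], range(2), range(3)],
                                 names=['band', 'row', 'col'])
ts_sample = SimpleNamespace(tslist=pd.Series(values, index=idx),
                            sample_dates=['d{}'.format(i) for i in range(5)],
                            ts_dataset=to_time_series_dataset(values))
clust_df = cluster_time_series(ts_sample, "TSKM", n_clusters=2,
                               cluster_metric="dtw")
print(clust_df.head())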
metric="softdtw", max_iter=5, max_iter_barycenter=5, random_state=0).fit(multivariate_time_series_train) km_dba.cluster_centers_.shape #prediction on train data prediction_train = km_dba.fit_predict(multivariate_time_series_train, y=None) len(prediction_train) #prediction on test data prediction_test = km_dba.predict(multivariate_time_series_test) len(prediction_test) prediction_test #accuracy of the clustering on the train data silhouette_score(multivariate_time_series_train, prediction_train, metric="softdtw") #accuracy of the clustering on the test data silhouette_score(multivariate_time_series_test, prediction_test, metric="softdtw") ############################################ k=2 ######################################### #select randomly time series from first cluster cluster1 = multivariate_time_series_train[prediction_train == 0] random.shuffle(cluster1) sample1 = cluster1[50:65]
def subsequence_clustering(sequence, changepoints, y_label='y', norm=False):
    """
    Clusters subsequences of a time series indicated by the changepoints variable.
    Uses the silhouette score to determine the number of clusters.

    :param sequence: np array of the time series
    :param changepoints: detected changepoints on which subsequences are built
    :param y_label: name of the y-label in the plot
    :param norm: normalise the data using MinMaxScaler
    :return: dict mapping each cluster to its subsequences
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    sub_ids = []
    x_index = []
    X = []
    i = 0
    end_p = [len(sequence) - 1]
    for cp in changepoints + end_p:
        X.append(sequence[i:cp])
        index = 'sub_' + str(i) + '_' + str(cp)
        sub_ids.append(index)
        x_index.append([x_id for x_id in range(i, cp + 1)])
        i = cp

    # Normalise the data (y = (x - min) / (max - min))
    if norm:
        X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters by looping through different
    # configurations and storing the respective silhouette values:
    sil_scores = {}
    for n in range(2, len(changepoints)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = silhouette_score(X, model_tst.predict(X), metric="dtw")

    opt_k = max(sil_scores, key=sil_scores.get)
    print('Number of clusters in subsequence clustering: ' + str(opt_k))
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # Build a helper df to map subsequences to their cluster labels
    df_cluster = pd.DataFrame(list(zip(sub_ids, x_index, model.labels_)),
                              columns=['metric', 'x_index', 'cluster'])
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(list).to_dict()

    print('Plotting Clusters')
    # Plot changepoints as vertical lines
    for cp in changepoints:
        plt.axvline(x=cp, ls=':', lw=2, c='0.65')

    # Preprocessing for cluster-based plotting
    x_scat = []
    y_scat = []
    cluster = []
    for index, row in df_cluster.iterrows():
        x_seq = row['x_index']
        x_scat.extend(x_seq)
        y_seq = sequence[x_seq[0]:x_seq[-1] + 1]
        y_scat.extend(y_seq)
        label_seq = [row['cluster']]
        cluster.extend(label_seq * len(x_seq))
        # plt.scatter(x_seq, y_seq, label=label_seq)

    # Cluster-based plotting
    x_scat = np.array(x_scat)
    y_scat = np.array(y_scat)
    cluster = np.array(cluster)
    for c in np.unique(cluster):
        i = np.where(cluster == c)
        plt.scatter(x_scat[i], y_scat[i], label=c)
    plt.legend()
    plt.title('Subsequence k-means Clustering')
    plt.xlabel('Time index')
    plt.ylabel(y_label)
    plt.show()
    return cluster_metrics_dict
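# A toy invocation with a synthetic step-like sequence and hand-picked
# changepoints (values chosen only for illustration).
import numpy as np

rng = np.random.default_rng(4)
seq = np.concatenate([np.zeros(30), np.ones(30), np.zeros(30), np.ones(30)])
seq = seq + rng.normal(scale=0.05, size=len(seq))
clusters = subsequence_clustering(seq, changepoints=[30, 60, 90])
print(clusters)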
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from scipy.cluster.hierarchy import fcluster
from tslearn.clustering import TimeSeriesKMeans, silhouette_score
from tslearn.utils import to_time_series_dataset
# hierarchical_clustering() is a project-local helper (defined elsewhere in
# this script) that turns a distance matrix into a linkage matrix.


def main(args):
    data_dir = './Data/User Categorization/'
    if args.method == 'K':
        print('Working on K-means clustering')
        ts_dataset = []
        # Only take the first 500 unique IDs
        n_samples = 500
        for i in range(n_samples):
            csv_file = pd.read_csv(data_dir + str(i) + '.csv')
            time_series_df = csv_file[(~csv_file['f_1'].isnull())
                                      & (~csv_file['f_2'].isnull())]
            time_series_seq = list(time_series_df[['f_1', 'f_2', 'f_3']].values)
            ts_dataset.append(time_series_seq)

        # Prepare the time series dataset
        formatted_dataset = to_time_series_dataset(ts_dataset)

        silhouette_scores = []
        n_clusters = [2, 3, 4, 5, 6]
        for cluster in n_clusters:
            km = TimeSeriesKMeans(n_clusters=cluster, metric="dtw",
                                  verbose=True, max_iter=5)
            y_pred = km.fit_predict(formatted_dataset)
            s_score = silhouette_score(formatted_dataset, y_pred, metric="dtw")
            silhouette_scores.append(s_score)
        sns.lineplot(x=n_clusters, y=silhouette_scores, sort=False)

        # Optimal number of clusters
        km = TimeSeriesKMeans(n_clusters=2, metric="dtw", verbose=True, max_iter=5)
        y_pred = km.fit_predict(formatted_dataset)
        df = pd.DataFrame(data=y_pred, columns=['Cluster No.'])
        df.to_csv('./kmeans_clustering.csv', index=False)

        # Visualise clusters
        sz = formatted_dataset.shape[1]
        plt.figure(figsize=(20, 20))
        for yi in range(2):
            plt.subplot(3, 3, 2 + yi)
            for xx in formatted_dataset[y_pred == yi]:
                plt.plot(xx.ravel(), "k-", alpha=.2)
            plt.plot(km.cluster_centers_[yi].ravel(), "r-")
            plt.xlim(0, sz)
            plt.ylim(-500000, 500000)
            plt.text(0.55, 0.85, 'Cluster %d' % (yi + 1),
                     transform=plt.gca().transAxes)
            if yi == 1:
                plt.title("DTW $k$-means")
        plt.tight_layout()
        plt.show()

    elif args.method == 'H':
        # Hierarchical clustering
        print('Working on Hierarchical clustering')
        # Build the distance matrix (set manual_dist_matrix=False to recompute
        # it instead of loading it from disk)
        manual_dist_matrix = True
        n_samples = 500
        if not manual_dist_matrix:
            distance_matrix = np.zeros(shape=(n_samples, n_samples))
            for i in range(n_samples):
                for j in range(n_samples):
                    sequence_1_df = pd.read_csv(data_dir + str(i) + '.csv')
                    sequence_2_df = pd.read_csv(data_dir + str(j) + '.csv')
                    seq_1 = sequence_1_df[(~sequence_1_df['f_1'].isnull())
                                          & (~sequence_1_df['f_2'].isnull())]
                    seq_2 = sequence_2_df[(~sequence_2_df['f_1'].isnull())
                                          & (~sequence_2_df['f_2'].isnull())]
                    x = seq_1[['f_1', 'f_2', 'f_3']].values
                    y = seq_2[['f_1', 'f_2', 'f_3']].values
                    distance, path = fastdtw(x, y, dist=euclidean)
                    if i != j:
                        distance_matrix[i, j] = distance
            np.savetxt('distance_matrix.csv', distance_matrix, delimiter=',')

        distance_matrix = np.genfromtxt('distance_matrix.csv', delimiter=',')
        linkage_matrix = hierarchical_clustering(distance_matrix)

        # Select the maximum number of clusters
        cluster_labels = fcluster(linkage_matrix, 4, criterion='maxclust')
        print(np.unique(cluster_labels))

        categorization_df = []
        files_list = os.listdir(data_dir)
        for files in files_list:
            csv_file = pd.read_csv(data_dir + str(files))
            unique_id = files[:-4]
            csv_file['ID'] = unique_id
            categorization_df.append(csv_file)
        df = pd.concat(categorization_df, axis=0, ignore_index=True)

        # Filter out null values
        filtered_df = df[(~df['f_1'].isnull()) & (~df['f_2'].isnull())]
        df_vis = filtered_df.sort_values(by='ID')
        df_vis['ID'] = df_vis['ID'].astype('int')
        df_vis = df_vis[df_vis['ID'] <= 499].sort_values(by='ID').reset_index(drop=True)
        df_vis_fil = df_vis.groupby('ID')[['f_1', 'f_2', 'f_3']].mean().reset_index()
        df_vis_fil['Cluster'] = cluster_labels
        df_vis_fil.to_csv('./hier_clustering.csv', index=False)

        # Plot a 3D scatterplot of the clusters
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        x = np.array(df_vis_fil['f_1'])
        y = np.array(df_vis_fil['f_2'])
        z = np.array(df_vis_fil['f_3'])
        ax.scatter(x, y, z, marker="s", c=df_vis_fil["Cluster"], cmap="RdBu")
        plt.show()
    else:
        print('Please input K or H clustering method correctly')
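# The snippet above assumes an argparse namespace with a 'method' flag; a
# minimal wiring sketch (the flag name is the only thing taken from the
# original, the rest is an assumption):
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--method', choices=['K', 'H'], default='K',
                        help="K for k-means, H for hierarchical clustering")
    main(parser.parse_args())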
def k_means_clustering(sd_log):
    """
    k-means clustering of all features using DTW for multivariate time series

    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as keys and features as values
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    tmp = sd_log.waiting_time
    data.drop(columns=[sd_log.waiting_time], inplace=True)

    # Get the data as a list of numpy arrays, one per feature
    X = []
    for col in data.columns:
        X.append(sd_log.get_points(col))

    # Normalise the data (y = (x - min) / (max - min));
    # data_norm is kept for plotting, X is scaled for clustering
    data_norm = data.copy()
    for column in data_norm.columns:
        data_norm[column] = (data_norm[column] - data_norm[column].min()) / (
            data_norm[column].max() - data_norm[column].min())
    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    # Find the optimal number of clusters by looping through different
    # configurations and storing the respective silhouette values:
    sil_scores = {}
    for n in range(2, len(data.columns)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = silhouette_score(X, model_tst.predict(X), metric="dtw")

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # Build a helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])

    # Make some helper dictionaries and lists
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(list).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_dropped = [cluster for cluster in cluster_len_dict
                        if cluster_len_dict[cluster] == 1]
    clusters_final = [cluster for cluster in cluster_len_dict
                      if cluster_len_dict[cluster] > 1]

    print('Plotting Clusters')
    fig, axs = plt.subplots(opt_k)
    row_i = 0
    # For each label, plot every series with that label
    for cluster in cluster_metrics_dict:
        for feat in cluster_metrics_dict[cluster]:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(cluster_metrics_dict[cluster]) > 100:
            # TODO draw the cluster mean in red if there is more than one member
            tmp = np.nanmean(np.vstack(cluster), axis=1)
            axs[row_i].plot(tmp, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))
        row_i += 1
    plt.show()

    # Return dict {cluster_id: features}
    return cluster_metrics_dict
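# A stand-in for the sd_log contract assumed above (.data frame,
# .waiting_time column name, .get_points(col) accessor); everything here is
# invented for illustration.
from types import SimpleNamespace
import numpy as np
import pandas as pd

rng = np.random.default_rng(5)
frame = pd.DataFrame(rng.random((50, 4)), columns=['a', 'b', 'c', 'wt'])
sd_log = SimpleNamespace(data=frame.copy(), waiting_time='wt',
                         get_points=lambda col: frame[col].values)
print(k_means_clustering(sd_log))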
    return xs, ys


if __name__ == '__main__':
    TIMELINE = 'C:/Users/Vivian Imbriotis/Desktop/2018-04-26_01_CFEB106/1-3-2018-04-26_01_CFEB106_1/metadata matlab/2018-04-26_01_CFEB106_Timeline.mat'
    SPKS = 'C:/Users/Vivian Imbriotis/Desktop/2018-04-26_01_CFEB106/1-3-2018-04-26_01_CFEB106_1/npy/spks.npy'
    TIMEPOINTS = 200
    MAX_K = 6

    stamps = get_frame_times(TIMELINE)
    scores = []
    models = []
    data = np.load(SPKS)
    for i in range(2, MAX_K + 1):
        model = get_fitted_model(data, clusters=i,
                                 timepoints=TIMEPOINTS, verbose=True)
        models.append(model)
    for model in models:
        scores.append(silhouette_score(data[:, 0:TIMEPOINTS], model.labels_,
                                       metric='dtw', n_jobs=-1, verbose=True))
    plt.plot(list(range(2, MAX_K + 1)), scores)
    plt.title("Performance of KMeans on axonal data")
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette Score')
    plt.show()
import csv

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score  # assumed; it accepts metric="precomputed"
from sklearn_extra.cluster import KMedoids


def test_elbow(X, dtw_value, seed):
    print(len(X))
    distortions = []
    silhouette_value = []
    dists = dtw_value
    print(dists)
    if seed == -1:
        for seed in range(0, 21):
            cur_silhouette = [seed]
            cur_distortions = [seed]
            for i in range(2, 15):
                print(i)
                km = KMedoids(n_clusters=i, random_state=seed,
                              metric="precomputed", init='k-medoids++',
                              max_iter=30000)
                km.fit(dists)
                # Record the sum of errors (inertia)
                cur_distortions.append(km.inertia_)
                y_pred = km.fit_predict(dists)
                np.fill_diagonal(dists, 0)
                score = silhouette_score(dists, y_pred, metric="precomputed")
                cur_silhouette.append(score)
            distortions.append(cur_distortions)
            silhouette_value.append(cur_silhouette)
        with open(r".//res//grid_distortions_destination.csv", "w",
                  encoding='UTF-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in distortions:
                writer.writerow(row)
                print(row)
        with open(r".//res//grid_silhouette_destination.csv", "w",
                  encoding='UTF-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in silhouette_value:
                writer.writerow(row)
                print(row)
    else:
        csv_reader = csv.reader(
            open(".//res//grid_distortions_destination.csv", encoding='UTF-8'))
        for row in csv_reader:
            distortions.append([float(item) for item in row])
        csv_reader = csv.reader(
            open(".//res//grid_silhouette_destination.csv", encoding='UTF-8'))
        for row in csv_reader:
            silhouette_value.append([float(item) for item in row])

    chosen_distortions = distortions[seed][1:]
    chosen_silhouette = silhouette_value[seed][1:]
    plt.figure(1)
    plt.plot(range(2, 15), chosen_distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.savefig(r'.//res//grid_distortions_destination.png')
    plt.close()
    plt.figure(1)
    plt.bar(range(2, 15), chosen_silhouette, color='grey')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.savefig(r'.//res//grid_silhouette_destination.png')
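# A toy invocation of test_elbow. The DTW distance matrix is built with
# tslearn's cdist_dtw and the res/ output directory is created first; both
# steps are assumptions, not part of the original pipeline.
import os
import numpy as np
from tslearn.metrics import cdist_dtw

os.makedirs('res', exist_ok=True)
rng = np.random.default_rng(6)
X_toy = rng.random((20, 30))
test_elbow(X_toy, cdist_dtw(X_toy), seed=-1)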
min_cluster = 2  # assumed: silhouette scores need at least two clusters
max_cluster = 21
silhouette_score_dict = {}
sse_dict = {}
label_dict = {}
silhouette_score_dict["time-series-k-means"] = []
sse_dict["time-series-k-means"] = []
label_dict["time-series-k-means"] = {}
# silhouette_score_dict["k-shape"] = []
# silhouette_score_dict["global-alignment-kernel-k-means"] = []
for i in range(min_cluster, max_cluster):
    print(service + "-cluster:" + str(i))
    km = TimeSeriesKMeans(n_clusters=i, verbose=True)
    label = km.fit_predict(X_train)
    silhouette_score_dict["time-series-k-means"].append(
        silhouette_score(X_train, label, metric="dtw"))
    sse_dict["time-series-k-means"].append(km.inertia_)
    label_dict["time-series-k-means"][i] = label
    # km = GlobalAlignmentKernelKMeans(n_clusters=i, verbose=True)
    # label = km.fit_predict(X_train)
    # silhouette_score_dict["global-alignment-kernel-k-means"].append(
    #     silhouette_score(X_train, label, metric="dtw"))
    # km = KShape(n_clusters=i, verbose=True)
    # label = km.fit_predict(X_train)
    # silhouette_score_dict["k-shape"].append(
    #     silhouette_score(X_train, label, metric="dtw"))
s1 = str(silhouette_score_dict)
s2 = str(sse_dict)
service = service.replace("/", "-")
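# A possible follow-up (not in the original): pick the cluster count with
# the highest silhouette value from the scan above.
scan = silhouette_score_dict["time-series-k-means"]
best_k = min_cluster + scan.index(max(scan))
print("best k for " + service + ": " + str(best_k))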
dt['UnixTime'] = dt.index.astype(np.int64) // 10**9
dt = dt.fillna(0)

# Scan cluster counts from 2 to 11 and record the silhouette score for each
evalu = []
for k in range(10):
    km = TimeSeriesKMeans(n_clusters=k + 2, verbose=True, random_state=23,
                          metric="dtw")
    Y = km.fit_predict(dt.T)
    evalu.append(silhouette_score(dt.T, Y, metric="dtw"))

# The scan suggested 6 clusters as best, but 7 are used here
km = TimeSeriesKMeans(n_clusters=7, verbose=True, random_state=23, metric="dtw")
Y = km.fit_predict(dt.T)
c1 = np.where(Y == 0)[0].tolist()
c2 = np.where(Y == 1)[0].tolist()
c3 = np.where(Y == 2)[0].tolist()
c4 = np.where(Y == 3)[0].tolist()
c5 = np.where(Y == 4)[0].tolist()
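# A possible follow-up (not in the original): list the column names that fall
# into each extracted cluster (rows of dt.T correspond to columns of dt).
for cid, members in enumerate([c1, c2, c3, c4, c5]):
    print('cluster', cid, '->', [dt.columns[i] for i in members])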