def elbowmethod(df: pd.DataFrame):
    """Find the best number of clusters and plot the clustering score
    against the number of clusters.

    Despite the name, this scores each k by silhouette score (higher is
    better), not by distortion.

    Args:
        df (pd.DataFrame): columns are (country(index), year_week, value).

    Returns:
        int: best number of clusters.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score

    scores = []
    K = range(2, df.shape[0])
    X = df.values[..., np.newaxis]
    for k in K:
        # Building and fitting the model
        model = TimeSeriesKMeans(n_clusters=k, metric="softdtw", max_iter=50)
        model.fit(X)
        scores.append(silhouette_score(X, model.labels_, metric="softdtw"))

    plt.plot(K, scores, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel('Silhouette score')
    plt.title('Choosing K by silhouette score')
    plt.show()

    best_num_cluster = np.argmax(scores) + 2  # K starts at 2
    return best_num_cluster
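# A minimal usage sketch for elbowmethod. The data below is synthetic and
# purely illustrative; the function only needs a small wide-format frame
# with one series per row.
def _demo_elbowmethod():
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(0)
    # 6 countries x 20 weekly values of random data
    demo = pd.DataFrame(rng.random((6, 20)),
                        index=[f"country_{i}" for i in range(6)])
    best_k = elbowmethod(demo)  # plots the scores and returns the best k
    print(best_k)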
def do_kmeans(days, km_size):
    """From a time series (as a DataFrame called ``days``), create
    ``km_size`` clusters using the k-means algorithm.

    Parameters
    ----------
    * days: time series to cluster
    * km_size: number of clusters needed

    Returns
    ----------
    * km: k-means object generated for the clustering; it contains info
      about the algorithm
    * y_pred: results of the clustering; it contains the clusters themselves
    """
    # Arrange data for our lib: one value array per distinct day
    unq = days["n_day_"].unique()
    values = [days[days["n_day_"] == day]["val_"].values for day in unq]
    formatted_dataset = to_time_series_dataset(values)

    # Configure our k-means
    km = TimeSeriesKMeans(n_clusters=km_size, metric="euclidean",
                          random_state=42, verbose=False)
    y_pred = km.fit_predict(formatted_dataset)
    return km, y_pred
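# A hedged usage sketch for do_kmeans with synthetic data, assuming the
# long-format columns "n_day_" (series id) and "val_" that the function
# expects, and that to_time_series_dataset / TimeSeriesKMeans are imported
# at module level as elsewhere in this file.
def _demo_do_kmeans():
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(42)
    days = pd.DataFrame({
        "n_day_": np.repeat(np.arange(8), 24),  # 8 series of 24 points each
        "val_": rng.random(8 * 24),
    })
    km, y_pred = do_kmeans(days, km_size=2)
    print(y_pred)  # one cluster index per series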
def cluster_time_series(time_series_data, cluster_method='HDBSCAN',
                        metric='euclidean', n_clusters=4,
                        min_cluster_size=2, min_sample=1):
    # One row per series: the columns of the input are the series to cluster
    features = time_series_data.T
    if cluster_method == 'HDBSCAN':
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                    min_samples=min_sample)
        clusterer.fit(features)
        features['cluster'] = clusterer.labels_
    elif cluster_method == 'kmeans':
        kmeans = TimeSeriesKMeans(n_clusters=n_clusters, metric=metric,
                                  max_iter=5, max_iter_barycenter=5,
                                  random_state=0)
        features['cluster'] = kmeans.fit_predict(features)
    elif cluster_method == 'AgglomerativeClustering':
        # Respect the n_clusters argument instead of a hard-coded 4
        ac = AgglomerativeClustering(n_clusters=n_clusters)
        features['cluster'] = ac.fit_predict(features)
    feature_dict = features.groupby('cluster').groups
    return feature_dict, features
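# A minimal sketch of cluster_time_series on synthetic data, assuming a
# DataFrame whose *columns* are the individual series (the function
# transposes it). The kmeans branch is shown; the HDBSCAN branch is called
# the same way.
def _demo_cluster_time_series():
    import numpy as np
    import pandas as pd
    rng = np.random.default_rng(0)
    # 30 time steps x 10 series
    data = pd.DataFrame(rng.random((30, 10)),
                        columns=[f"series_{i}" for i in range(10)])
    feature_dict, features = cluster_time_series(
        data, cluster_method='kmeans', n_clusters=3)
    print(feature_dict)  # {cluster label: index of member series}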
def get_dds_km(cl_lab, ds, z='LEV0', h0=0, h1=24, nc=4):
    # %%
    dds = get_ds_for_dtw_kmeans(cl_lab, ds, z, h0=h0, h1=h1)
    from tslearn.clustering import TimeSeriesKMeans
    km = TimeSeriesKMeans(nc, metric='dtw',
                          metric_params={'sakoe_chiba_radius': 4},
                          random_state=789)
    km.fit(dds.values)
    # Plot each cluster center
    for c in km.cluster_centers_:
        plt.plot(c)
    plt.show()
    # %%
    labs = km.predict(dds.values)
    lb = xr.zeros_like(dds['date'], dtype=int) + labs
    # %%
    dds['labs'] = lb
    # %%
    # Bar plot of the number of days assigned to each cluster
    dds['labs'].reset_coords(drop=True). \
        to_dataframe()['labs'].value_counts(). \
        sort_index(). \
        plot.bar()
    plt.show()
    # %%
    # dds['hour'] = xr.zeros_like(dds['time'], dtype=float) + \
    #     np.arange(0, 24, .5)
    # dds['nday'] = xr.zeros_like(
    #     dds['date'], dtype=int) + \
    #     np.arange(len(dds['date']))
    # dds = dds.swap_dims({'date': 'nday'})
    # dds = dds.swap_dims({'time': 'hour'})
    return dds, km
def visualize_n_cluster(train_ts, n_lists=[3, 4, 5, 6], metric='dtw',
                        seed=2021, vis=True):
    # Fit on a fixed copy of the raw features: the cluster columns added
    # below must not leak into later fits
    features = train_ts.copy()
    if vis:
        fig = plt.figure(figsize=(20, 5))
        plt.title('Distribution of buildings per cluster count',
                  fontsize=15, y=1.2)
        plt.axis('off')
    for idx, n in enumerate(n_lists):
        ts_kmeans = TimeSeriesKMeans(n_clusters=n, metric=metric,
                                     random_state=seed)
        train_ts['cluster(n={})'.format(n)] = ts_kmeans.fit_predict(features)
        score = round(
            silhouette_score(features,
                             train_ts['cluster(n={})'.format(n)],
                             metric='euclidean'), 3)
        vc = train_ts['cluster(n={})'.format(n)].value_counts()
        if vis:
            ax = fig.add_subplot(1, len(n_lists), idx + 1)
            sns.barplot(x=vc.index, y=vc, palette='Pastel1')
            ax.set(title='n_cluster={0}\nscore:{1}'.format(n, score))
    if vis:
        plt.tight_layout()
        plt.show()
    return train_ts
def train_model(train_data, k=300, distance_matrix='eucl'):
    if distance_matrix == 'dtw':
        model = TimeSeriesKMeans(n_clusters=k, metric='dtw',
                                 n_init=10).fit(train_data)
    else:
        # Default metric is Euclidean
        model = TimeSeriesKMeans(n_clusters=k, n_init=10).fit(train_data)
    return model
def xgb_softdtw(train, test, return_pred, num_cluster):
    ts = TimeSeries_Clustering.make_timeseries(train)
    formatted_dataset = TimeSeries_Clustering.to_time_series_dataset(ts)
    X_train, sz = normalize_data(formatted_dataset)
    sdtw_km = TimeSeriesKMeans(n_clusters=num_cluster, metric="softdtw",
                               # named "gamma_sdtw" in older tslearn releases
                               metric_params={"gamma": .01},
                               verbose=True, random_state=0)
    y_pred = sdtw_km.fit_predict(X_train)
    scores = TimeSeries_Clustering.compute_scores(sdtw_km, X_train, y_pred)
    plt.boxplot(scores)
    TimeSeries_Clustering.plot_data(sdtw_km, X_train, y_pred, sz,
                                    sdtw_km.n_clusters, centroid=True)

    # Map each user to its cluster and persist the assignment
    y_pred_df = pd.DataFrame(y_pred)
    userindex = np.sort(train.user_id.unique())
    y_pred_df['user_id'] = userindex
    test = test.merge(y_pred_df, on='user_id').rename(
        {0: 'cluster_softdtw'}, axis='columns')
    train = train.merge(y_pred_df, on='user_id').rename(
        {0: 'cluster_softdtw'}, axis='columns')
    user_cluster = test.drop_duplicates(subset=['user_id'], keep='first')[
        ['user_id', 'cluster_softdtw']]
    # Raw string avoids the invalid "\c" / "\s" escape sequences
    user_cluster.to_csv(r'\clusters\softdtw_' + str(test.batch.unique())
                        + '.csv')

    # Train one XGBoost model per cluster and average the metrics
    sum_recall = 0
    sum_precision = 0
    sum_fscore = 0
    clusters = test.cluster_softdtw.unique()
    for cluster in clusters:
        train_i = train[train['cluster_softdtw'] == cluster]
        test_i = test[test['cluster_softdtw'] == cluster]
        recall, precision, fscore = do_xgboost(train_i, test_i,
                                               return_pred, num_cluster)
        sum_recall += recall
        sum_precision += precision
        sum_fscore += fscore
    n = len(clusters)
    print('FINISH SOFTDTW')
    return sum_recall / n, sum_precision / n, sum_fscore / n
def __init__(self, k):
    self.k = k
    self.model = TimeSeriesKMeans(n_clusters=k, n_init=2, metric="dtw",
                                  verbose=False, max_iter_barycenter=10,
                                  random_state=0)
def visualize_n_cluster(train_ts, n_lists=[3, 4, 5, 6], metric='dtw',
                        seed=2021, vis=True):
    # Fit on a fixed copy so the cluster columns added below do not leak
    # into later fits
    features = train_ts.copy()
    for idx, n in enumerate(n_lists):
        ts_kmeans = TimeSeriesKMeans(n_clusters=n, metric=metric,
                                     random_state=seed)
        train_ts['cluster(n={})'.format(n)] = ts_kmeans.fit_predict(features)
        score = round(
            silhouette_score(features,
                             train_ts['cluster(n={})'.format(n)],
                             metric='euclidean'), 3)
        print('n_cluster={0} silhouette:{1}'.format(n, score))
    return train_ts
def cl_KMeansDTW(pivot, original, pixels, metrics, k, cc):
    kmeans = TimeSeriesKMeans(n_clusters=k, metric="dtw", random_state=42)
    method = 'KDTW_{0}_{1}'.format(k, cc * 10)
    assignments = kmeans.fit_predict(pivot.values)
    return (add_computed_typed(method, assignments, original),
            add_computed_typed_pixels(method, assignments, pixels),
            add_metrics_typed(method, metrics, assignments, pivot))
def get_cluster_labels(actions, x, n_clusters):
    km = TimeSeriesKMeans(n_clusters=n_clusters, metric='dtw').fit(x['train'])
    actions_split = {}
    for split in ['train', 'dev', 'test']:
        # .copy() avoids pandas' SettingWithCopyWarning on the slice
        actions_split[split] = actions[actions['type'] == split].copy()
        labels = km.predict(x[split])
        actions_split[split].loc[:, 'label'] = labels
    actions = pd.concat(
        [actions_split[split] for split in ['train', 'dev', 'test']])
    return actions
def extract_clusters(self, new_features):
    print("Extracting clusters ...")
    km = TimeSeriesKMeans(n_clusters=2, random_state=42)
    km.fit(new_features)
    new_features['km_clusters'] = km.labels_
    print("Extracting clusters ... DONE.")
    return new_features
def cluster_examples(
    train: pd.DataFrame,
    test: pd.DataFrame,
    dataset_name: str,
    nclusters: int = 5,
    n_examples: int = 3,
):
    """Explore: cluster series and look at examples from each cluster."""
    clusterer = TimeSeriesKMeans(n_clusters=nclusters)
    group_cols = [train.columns[c] for c in grouping_cols[dataset_name]]
    train_groups = train.groupby(group_cols)
    test_groups = test.groupby(group_cols)
    max_l = max(
        [len(trg) + len(teg)
         for (_, trg), (_, teg) in zip(train_groups, test_groups)]
    )
    timeseries = []
    keys = []
    for (group_name, train_group), (_, test_group) in zip(train_groups,
                                                          test_groups):
        t_values = train_group.iloc[:, target_cols[dataset_name]].astype(float)
        # Series.append was removed in pandas 2.0; use pd.concat instead
        t_values = pd.concat(
            [t_values,
             test_group.iloc[:, target_cols[dataset_name]].astype(float)]
        )
        # Pad every series to the same length, then interpolate the NaNs
        t_padded = pd.concat(
            [t_values, pd.Series([np.nan] * (max_l - t_values.shape[0]))]
        )
        t_padded = t_padded.interpolate()
        assert len(t_padded) == max_l
        timeseries.append(t_padded)
        keys.append(group_name)
    timeseries_dataset = ts_utils.to_time_series_dataset(timeseries)
    clusters = clusterer.fit_predict(timeseries_dataset)
    plot_hist(clusters, "Distribution of Clusters")
    for i in range(nclusters):
        print(f"Looking at examples from cluster {i}")
        idxs = np.where(clusters == i)[0]
        examples = np.random.choice(idxs, size=n_examples, replace=False)
        for j, ex in enumerate(examples):
            query_list = [
                f'{grp_col}=="{key}"'
                for grp_col, key in zip(group_cols, keys[ex])
            ]
            values = train.query(" & ".join(query_list)).iloc[
                :, target_cols[dataset_name]
            ]
            # values = values.append(
            #     test.query(' & '.join(query_list)).iloc[:, target_cols[dataset_name]]
            # )
            plot_ts(values, f"Example {j} of cluster {i}")
def multi_plot(row, col, fs_tuple, sy_bool, sx_bool, X, num_cluster,
               lineW, ts=False, labels=None):
    if ts:
        # model generation: derive the cluster labels from the data itself
        model = TimeSeriesKMeans(n_clusters=num_cluster, tol=1e-05,
                                 metric='euclidean')
        fitted_model = model.fit(X)
        labels = fitted_model.predict(X)

    f, axes = plt.subplots(row, col, figsize=fs_tuple,
                           sharey=sy_bool, sharex=sx_bool)
    labelsize = 10
    fontsize = 10
    cluster_pool = np.unique(labels)
    for index, i_cluster in enumerate(cluster_pool):
        sub_mat = X[labels == i_cluster, :]
        # unravel (np.unravel_index takes `shape`; `dims` was removed)
        figrow, figcol = np.unravel_index(index, shape=(row, col))
        # plot, then modify the axes; note that with a single row or a
        # single column, plt.subplots returns a 1-D array of axes, so the
        # indexing differs per branch
        if row > 1 and col > 1:
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figrow, figcol].plot(sub_mat[i_curve, :], 'r',
                                          linewidth=lineW)
            for i_col in range(col):
                axes[-1, i_col].set_xticks([0, 16, 32, 48, 64, 80])
                axes[-1, i_col].set_xticklabels(
                    [str(300 + 80 * i) for i in np.arange(6)])
                axes[-1, i_col].set_xlabel('Wavelength [nm]',
                                           fontsize=fontsize)
                axes[-1, i_col].tick_params(axis='x', labelsize=labelsize)
            for i_row in range(row):
                axes[i_row, 0].set_yticks([-1, 0, 1])
                axes[i_row, 0].tick_params(axis='y', labelsize=labelsize)
        elif row > 1 and col == 1:
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figrow].plot(sub_mat[i_curve, :], 'r', linewidth=lineW)
            axes[-1].set_xticks([0, 16, 32, 48, 64, 80])
            axes[-1].set_xticklabels(
                [str(300 + 80 * i) for i in np.arange(6)])
            axes[-1].set_xlabel('Wavelength [nm]', fontsize=fontsize)
            axes[-1].tick_params(axis='x', labelsize=labelsize)
            for i_row in range(row):
                axes[i_row].set_yticks([-1, 0, 1])
                axes[i_row].tick_params(axis='y', labelsize=labelsize)
        elif row == 1 and col > 1:
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figcol].plot(sub_mat[i_curve, :], 'r', linewidth=lineW)
            for i_col in range(col):
                axes[i_col].set_xticks([0, 16, 32, 48, 64, 80])
                axes[i_col].set_xticklabels(
                    [str(300 + 80 * i) for i in np.arange(6)])
                axes[i_col].set_xlabel('Wavelength [nm]', fontsize=fontsize)
                axes[i_col].tick_params(axis='x', labelsize=labelsize)
            axes[0].set_yticks([-1, 0, 1])
            axes[0].tick_params(axis='y', labelsize=labelsize)
    return (f, axes)
def clustering(df, n_cluster: int = 2, metric: str = 'softdtw',
               init='k-means++', random_state=1234, verbose=False, n_init=1):
    tsk = TimeSeriesKMeans(n_clusters=n_cluster, metric=metric, init=init,
                           random_state=random_state, verbose=verbose,
                           n_init=n_init)
    # Roll the time axis by six steps, then cluster one series per row
    df = np.roll(df, -6, axis=0)
    M = to_time_series_dataset(df.T)
    cluster_labels = tsk.fit_predict(M)
    return cluster_labels
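# A hedged example for clustering(), assuming the input is array-like with
# shape (timesteps, series), since the function rolls axis 0 and then
# transposes. Synthetic data; the Euclidean metric keeps the demo fast.
def _demo_clustering():
    import numpy as np
    rng = np.random.default_rng(1234)
    data = rng.random((48, 12))  # 48 time steps, 12 series
    labels = clustering(data, n_cluster=3, metric='euclidean')
    print(labels)                # one label per series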
def k_init(self, v=True):
    """Initialise the algorithm instance with the current parameters.

    Parameters:
        * v: boolean
            Verbose; prints information about the clustering.

    Returns:
        NA
    """
    self.km = TimeSeriesKMeans(n_clusters=self.n,
                               metric=self.metric,
                               # named "gamma_sdtw" in older tslearn releases
                               metric_params={"gamma": .01},
                               verbose=v,
                               random_state=self.seed)
def cluster_annotation_dimension(data, n_clusters=3):
    # Smooth each series with a window-2 moving average before clustering
    data = np.array([list(moving_average(d, 2)) for d in data])
    clustering = TimeSeriesKMeans(n_clusters=n_clusters, metric="euclidean")
    clustering.fit(data)
    clusters = []
    for cluster_ix in range(n_clusters):
        cl = np.where(clustering.labels_ == cluster_ix)[0]
        clusters.append(data[cl])
    return clusters
def plot_KMeansDTW_elbow(pivot):
    maxK = 2
    maxSil = -1
    for i in range(2, 20):
        kmeans = TimeSeriesKMeans(n_clusters=i, metric="dtw",
                                  random_state=42)
        labels = kmeans.fit_predict(pivot)
        silhouette = silhouette_score(pivot, labels, sample_size=10000)
        print("For n_clusters =", i, "the average silhouette_score is:",
              silhouette, "SSE is:", kmeans.inertia_)
        if silhouette > maxSil:
            maxSil = silhouette
            maxK = i
    return maxK
def init(X, l, k):
    # A good initial start improves the convergence speed
    seed = 0
    km = TimeSeriesKMeans(n_clusters=k, metric="euclidean", max_iter=10,
                          random_state=seed)
    km.fit(X)
    # One-hot membership matrix from the k-means labels, plus noise
    G_init = np.zeros((km.labels_.size, km.labels_.max() + 1))
    G_init[np.arange(km.labels_.size), km.labels_] = 1
    G_init = G_init + np.random.rand(G_init.shape[0], G_init.shape[1])
    # `psi` is assumed to be defined at module level
    F_init = km.cluster_centers_[:, :, 0] + 2**psi * np.random.rand(k, l)
    return F_init, G_init
def clustering_TimeSeriesKMeans(tsdata, n_clusters, random_state, n_init,
                                metric="softdtw",
                                # named "gamma_sdtw" in older tslearn releases
                                metric_params={"gamma": 0.01}):
    np.random.seed(random_state)
    # Instantiate the TimeSeriesKMeans class
    dtw_km = TimeSeriesKMeans(
        n_clusters=n_clusters,
        n_init=n_init,
        metric=metric,
        metric_params=metric_params,
        verbose=True,
        random_state=random_state
    )
    y_pred = dtw_km.fit_predict(tsdata)
    return y_pred
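# A small sketch of clustering_TimeSeriesKMeans on random data; the shapes
# are illustrative only and the gamma value is the one hard-wired above.
def _demo_clustering_TimeSeriesKMeans():
    import numpy as np
    rng = np.random.RandomState(0)
    tsdata = rng.randn(10, 25, 1)  # 10 series, 25 steps, univariate
    y_pred = clustering_TimeSeriesKMeans(tsdata, n_clusters=2,
                                         random_state=0, n_init=1)
    print(y_pred)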
def tsKMeans_num_cluster(X, n_trials, max_n_cluster):
    min_n_cluster = 2
    v_clusters = np.arange(min_n_cluster, max_n_cluster)
    n_seeds = n_trials
    # Record one silhouette score per (cluster count, seed) pair
    sc_recorder = np.zeros((len(v_clusters), n_seeds))
    for i_seed in range(n_seeds):
        for num_cluster in v_clusters:
            model = TimeSeriesKMeans(n_clusters=num_cluster, tol=1e-05,
                                     metric='euclidean',
                                     random_state=i_seed)
            fitted_model = model.fit(X)
            y_pred = fitted_model.predict(X)
            s_sc = sklearn.metrics.silhouette_score(X, y_pred,
                                                    metric='euclidean')
            sc_recorder[num_cluster - min_n_cluster, i_seed] = s_sc
    return sc_recorder
def test_serialize_timeserieskmeans():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)

    dba_km = TimeSeriesKMeans(n_clusters=3,
                              n_init=2,
                              metric="dtw",
                              verbose=True,
                              max_iter_barycenter=10)
    _check_not_fitted(dba_km)
    dba_km.fit(X)
    _check_params_predict(dba_km, X, ['predict'])

    sdtw_km = TimeSeriesKMeans(n_clusters=3,
                               metric="softdtw",
                               metric_params={"gamma": .01},
                               verbose=True)
    _check_not_fitted(sdtw_km)
    sdtw_km.fit(X)
    _check_params_predict(sdtw_km, X, ['predict'])
def get_fitted_model(dataset, clusters=3, timepoints=0, verbose=False,
                     return_dataset=True):
    if timepoints != 0:
        dataset = dataset[:, :timepoints]
    if verbose:
        print(f'Segmenting data into {clusters} clusters...')
    model = TimeSeriesKMeans(
        n_clusters=clusters,
        n_init=10,
        metric='dtw',   # Dynamic time warping
        verbose=verbose,
        n_jobs=-1       # Use all cores
    )
    model.fit(dataset)
    if return_dataset:
        # Also hand back the (possibly truncated) dataset the model saw;
        # the parameter was previously accepted but ignored
        return model, dataset
    return model
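# Example call for get_fitted_model, assuming `dataset` is a 2-D array of
# shape (n_series, n_timepoints). DTW is slow on large inputs, so this
# sketch keeps the data tiny and synthetic.
def _demo_get_fitted_model():
    import numpy as np
    rng = np.random.RandomState(7)
    dataset = rng.randn(12, 30)
    model, used = get_fitted_model(dataset, clusters=2, timepoints=20,
                                   verbose=True)
    print(model.labels_)  # cluster assignment per series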
def test_variable_length_clustering():
    # TODO: here we just check that they can accept variable-length TS, not
    # that they do clever things
    X = to_time_series_dataset([[1, 2, 3, 4],
                                [1, 2, 3],
                                [2, 5, 6, 7, 8, 9],
                                [3, 5, 6, 7, 8]])
    rng = np.random.RandomState(0)

    clf = KernelKMeans(n_clusters=2, random_state=rng)
    clf.fit(X)

    clf = TimeSeriesKMeans(n_clusters=2, metric="dtw", random_state=rng)
    clf.fit(X)

    clf = TimeSeriesKMeans(n_clusters=2, metric="softdtw", random_state=rng)
    clf.fit(X)
def dmp_cluster_planning_history(self, k=2):
    self.dmp_traj_clusters = []
    from tslearn.clustering import TimeSeriesKMeans
    kmeans = TimeSeriesKMeans(n_clusters=k).fit(self.history.paths)

    # Fit all of these to different DMPs, then cluster
    for i in range(self.history.paths.shape[0]):
        path_data = self.history.paths[i]
        formatted_path_data = path_data.reshape(
            (1, path_data.shape[1], path_data.shape[0]))
        center = formatted_path_data[0]
        new_dmp_traj_cluster = DMPTrajCluster(
            formatted_path_data,
            center,
            execution_times=[self.history.execution_times[i]],
            rl=True)
        self.dmp_traj_clusters.append(new_dmp_traj_cluster)

    # Cluster based on the DMP weights
    weight_list = [dmp.dmp.w for dmp in self.dmp_traj_clusters]
    weights_to_cluster = np.zeros(
        (len(self.dmp_traj_clusters),
         weight_list[0].shape[1] * weight_list[0].shape[0]))
    for i in range(len(weight_list)):
        weights_to_cluster[i] = weight_list[i].flatten()
    self.gmm = mixture.GaussianMixture(
        n_components=3, covariance_type='full').fit(weights_to_cluster)
def traj_cluster_planning_history(self, k=2):
    self.dmp_traj_clusters = []
    from tslearn.clustering import TimeSeriesKMeans
    kmeans = TimeSeriesKMeans(n_clusters=k).fit(self.history.paths)
    # Sort into groups and make each group a dmp_traj_cluster; all into one
    # cluster for now
    for i in range(k):
        center = kmeans.cluster_centers_[i]
        relevant_labels = np.where(kmeans.labels_ == i)[0]
        if len(relevant_labels) == 0:
            print("Empty cluster")
            continue
        execution_times = np.array(
            self.history.execution_times)[relevant_labels]
        path_data = self.history.paths[relevant_labels]
        formatted_path_data = np.zeros(
            (path_data.shape[0], path_data.shape[2], path_data.shape[1]))
        # Gets ugly if there's only one now
        n_training_paths = path_data.shape[0]
        # Use a fresh index here: reusing `i` would clobber the outer
        # cluster loop variable
        for j in range(n_training_paths):
            formatted_path_data[j] = path_data[j].T
        new_dmp_traj_cluster = DMPTrajCluster(
            formatted_path_data, center,
            execution_times=execution_times, rl=True)
        self.dmp_traj_clusters.append(new_dmp_traj_cluster)
def fit_tskmeans(self):
    data_scaled = self.data_scaled
    center = self.center_num
    km = TimeSeriesKMeans(n_clusters=center, metric="softdtw", max_iter=5,
                          verbose=False, random_state=0).fit(data_scaled)
    self.fitted_cluster = km
    self.labels = km.labels_
def tsclusteringN(ts_data, names):
    # Clustering
    # Normalise to [0, 1]
    ts_dataset = TimeSeriesScalerMinMax().fit_transform(ts_data)
    metric = 'dtw'
    n_clusters = [n for n in range(2, 6)]
    for n in n_clusters:
        print('number of clusters =', n)
        # With the 'dtw' or 'softdtw' metric, series of different lengths
        # are fine
        km = TimeSeriesKMeans(n_clusters=n, metric=metric, verbose=False,
                              random_state=1).fit(ts_dataset)
        # Clustering result
        print('cluster labels =', km.labels_)
        # Silhouette values range from -1 to 1. The optimal cluster count
        # has a silhouette value close to 1 and, in a silhouette plot, the
        # smallest spread between clusters. Here we only check the value.
        print('silhouette score =',
              silhouette_score(ts_dataset, km.labels_, metric=metric))
        print()
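# Usage sketch for tsclusteringN with variable-length series, which the
# DTW metric tolerates. The data is synthetic; `names` is not used by the
# function body above and is passed only to match the signature.
def _demo_tsclusteringN():
    from tslearn.utils import to_time_series_dataset
    ts_data = to_time_series_dataset([[1, 2, 3, 4],
                                      [1, 2, 3],
                                      [2, 2, 3, 4],
                                      [4, 3, 2, 1],
                                      [4, 3, 2],
                                      [3, 3, 2, 1]])
    tsclusteringN(ts_data, names=None)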
def knn(p_id):
    # Despite the name, this loads a serialized TimeSeriesKMeans model
    (exists, test) = get_session_data(p_id)
    if not exists:
        print("Running KNN Model")
        s_id = crud.get_latest_session_table_by_id(db, p_id).split('_')[3]
        model = TimeSeriesKMeans.from_json('res/knn_model.txt')
        pred = model.predict(test)
        # Build the expected 80/20 ground-truth labels, padding the split
        # so the counts add up to len(test)
        leng = len(test)
        more = int(0.8 * leng)
        less = int(0.2 * leng)
        if more + less < leng:
            more += 1
        elif more + less > leng:
            more -= 1
        a = np.zeros((more,), dtype=int)
        b = np.ones((less,), dtype=int)
        true = np.concatenate([a, b])
        res = 0
        if (pred == 0).sum() < (pred == 1).sum():
            res = 1
        print("-----------------------------")
        print(exists)
        print("-----------------------------")
        print(pred)
        print(confusion_matrix(true, pred))
        res = models.Result(session_id=int(s_id), patient_id=int(p_id),
                            result=int(res), model_id=0)
        return crud.create_patient_result(db, res)
    else:
        # model_id 0 matches the result created above
        return crud.get_last_result(db, p_id, 0)
        # return crud.get_last_result_by_patient_id(db, p_id)
def _kmeans_init_shapelets(X, n_shapelets, shp_len, n_draw=10000):
    n_ts, sz, d = X.shape
    # Draw random subseries of length shp_len, then use the k-means
    # centroids of those draws as the initial shapelets
    indices_ts = numpy.random.choice(n_ts, size=n_draw, replace=True)
    indices_time = numpy.random.choice(sz - shp_len + 1, size=n_draw,
                                       replace=True)
    subseries = numpy.zeros((n_draw, shp_len, d))
    for i in range(n_draw):
        subseries[i] = X[indices_ts[i],
                         indices_time[i]:indices_time[i] + shp_len]
    return TimeSeriesKMeans(n_clusters=n_shapelets,
                            metric="euclidean",
                            verbose=False).fit(subseries).cluster_centers_
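# Quick check of _kmeans_init_shapelets on random data: it draws random
# subseries and returns n_shapelets centroids of shape (shp_len, d).
def _demo_kmeans_init_shapelets():
    import numpy
    X = numpy.random.RandomState(0).randn(20, 40, 1)
    shapelets = _kmeans_init_shapelets(X, n_shapelets=3, shp_len=10,
                                       n_draw=200)
    print(shapelets.shape)  # (3, 10, 1)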