def cal_k_shape(self, data, num_cluster):
    """
    Cluster a time series dataset with k-Shape and return the labels.

    :param data: time series dataset handed to ``KShape.fit_predict``
    :param num_cluster: number of clusters to form
    :return: cluster labels produced by ``fit_predict``
    """
    model = KShape(
        n_clusters=num_cluster,
        n_init=5,
        verbose=True,
        random_state=self.seed,
    )
    return model.fit_predict(data)
def clustering_Kshape(tsdata, n_clusters, random_state, n_init, max_iter=100):
    """Run k-Shape on ``tsdata`` and return the predicted cluster labels.

    The global NumPy generator is seeded with ``random_state`` and the same
    value is forwarded to ``KShape``.

    NOTE: the data needs to be normalised for the cross-correlation that
    k-Shape computes; no scaling is applied here, so the caller is expected
    to pass already-normalised series.
    """
    np.random.seed(random_state)

    model = KShape(
        n_clusters=n_clusters,
        n_init=n_init,
        verbose=True,
        random_state=random_state,
        max_iter=max_iter,
    )
    return model.fit_predict(tsdata)
def do_kshape(days, km_size):
    """
    Split the daily series found in ``days`` into ``km_size`` k-Shape clusters.

    ``days`` is indexed by column name: rows sharing the same ``n_day_`` value
    form one series, made of their ``val_`` values.

    Returns the fitted ``KShape`` model together with the predicted labels.
    """
    # One series per distinct day, in order of first appearance.
    day_ids = days["n_day_"].unique()
    series_per_day = []
    for day_id in day_ids:
        rows_of_day = days[days["n_day_"] == day_id]
        series_per_day.append(rows_of_day["val_"].values)
    dataset = to_time_series_dataset(series_per_day)

    # Fixed seed keeps the clustering reproducible between runs.
    model = KShape(n_clusters=km_size, random_state=42, verbose=False)
    labels = model.fit_predict(dataset)
    return model, labels
def kshape(container: DataFrameContainer,
           data_column: str,
           n_clusters: int,
           max_iter: int = 300,
           tol: float = 1e-6,
           n_init: int = 1,
           verbose: bool = True,
           random_state: Union[int, None] = None,
           init: Union[str, np.ndarray] = 'random',
           centroid_seeds: Union[np.ndarray, None] = None) -> DataFrameContainer:
    """
    Cluster the series stored in one dataframe column with k-Shape.

    The predicted label of every row is written to a new ``KSHAPE_CLUSTER``
    column of ``container.dataframe`` (the container is modified in place and
    also returned).

    :param container: container whose ``dataframe`` holds the series
    :param data_column: name of the column holding one series per row
    :param n_clusters: number of clusters, forwarded to ``KShape``
    :param max_iter: forwarded to ``KShape``
    :param tol: forwarded to ``KShape``
    :param n_init: forwarded to ``KShape``
    :param verbose: forwarded to ``KShape``
    :param random_state: forwarded to ``KShape``
    :param init: forwarded to ``KShape``; ignored when ``centroid_seeds`` is
        given
    :param centroid_seeds: arrays of shape [n_clusters, ts_size]; when given
        they replace ``init`` as the initial centroids
    :return: the same ``container`` with the ``KSHAPE_CLUSTER`` column set
    """
    if centroid_seeds is not None:
        # Turn the 2-D seeds [n_clusters, ts_size] into the 3-D layout
        # [n_clusters, ts_size, 1] by appending a trailing axis.
        init = np.swapaxes(np.array([centroid_seeds]).T, 0, 1)

    ks = KShape(n_clusters=n_clusters, max_iter=max_iter, tol=tol,
                n_init=n_init, verbose=verbose, random_state=random_state,
                init=init)

    # Stack the per-row series into a single 2-D array, one row per series.
    X = np.vstack(container.dataframe[data_column].values)
    y = ks.fit_predict(X)
    container.dataframe['KSHAPE_CLUSTER'] = y
    return container
def plot_best_shape(self, data, num_cluster):
    """
    Fit k-Shape on ``data`` and plot each cluster's members and centroid.

    Member series are drawn as translucent black lines, the centroid of the
    cluster as a red line, and a ``Cluster <k>`` label (1-based) is added.

    :param data: time series dataset; it is indexed with a boolean mask, so
        it must support NumPy-style masking
    :param num_cluster: number of clusters to form and to plot
    :return: None
    """
    model = KShape(
        n_clusters=num_cluster,
        n_init=5,
        verbose=True,
        random_state=self.seed,
    )
    labels = model.fit_predict(data)

    # NOTE(review): no subplot is selected per cluster, so everything is
    # drawn on the current axes -- confirm this is intended.
    for cluster_idx in range(num_cluster):
        members = data[labels == cluster_idx]
        for series in members:
            plt.plot(series.ravel(), "k-", alpha=.3)
        centroid = model.cluster_centers_[cluster_idx]
        plt.plot(centroid.ravel(), "r-")
        plt.text(0.55, 0.85, 'Cluster %d' % (cluster_idx + 1),
                 transform=plt.gca().transAxes)

    plt.tight_layout()
    plt.show()
from tslearn.clustering import KShape
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

seed = 0
numpy.random.seed(seed)

# Load the cached "Trace" dataset and keep only the series labelled below 4.
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)

# Keep only 50 time series and rescale them with the mean/variance scaler.
scaler = TimeSeriesScalerMeanVariance()
X_train = scaler.fit_transform(X_train[:50])
sz = X_train.shape[1]

# k-Shape clustering
ks = KShape(n_clusters=3, verbose=True, random_state=seed)
y_pred = ks.fit_predict(X_train)

# One subplot per cluster: members in translucent black, centroid in red.
plt.figure()
for cluster_idx in range(3):
    plt.subplot(3, 1, cluster_idx + 1)
    members = X_train[y_pred == cluster_idx]
    for series in members:
        plt.plot(series.ravel(), "k-", alpha=.2)
    centroid = ks.cluster_centers_[cluster_idx]
    plt.plot(centroid.ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (cluster_idx + 1))

plt.tight_layout()
plt.show()
def mass_upload(startDate, endDate, id_unit_usaha):
    """Cluster customer load profiles with k-Shape and store the centroids.

    Steps:

    1. Read from ``amr_bridge`` the average ``FDVC`` per customer
       (``IDREFPELANGGAN``), day of week and hour for
       ``startDate <= FDATETIME < endDate``.
    2. Keep the rows whose ``ID_UNIT_USAHA`` equals ``id_unit_usaha``, pivot
       them to one column per customer and drop the columns whose mean is 0.
    3. Rescale every customer series with ``TimeSeriesScalerMeanVariance``
       and cluster them into 5 groups with ``KShape``.
    4. Insert every point of every centroid as one row of
       ``PL_CUSTOMER_CLUSTER`` (one commit per centroid).

    :param startDate: inclusive lower bound on ``FDATETIME``; also stored as
        ``DATE_STAMP`` of the inserted rows
    :param endDate: exclusive upper bound on ``FDATETIME``
    :param id_unit_usaha: business-unit id used to filter the data; also
        stored as ``AREA_ID`` of the inserted rows
    :return: ``rows * customers + rows * clusters`` where ``rows`` is the
        length of one series
    """
    print(id_unit_usaha)

    # NOTE(review): the credentials are hard-coded in the connection URL;
    # they should come from configuration or the environment.
    engine = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    # The date bounds are bound parameters rather than concatenated text so
    # that caller-supplied values cannot alter the statement.
    sql = sqlalchemy.text(
        "SELECT a.IDREFPELANGGAN, a.ID_UNIT_USAHA, 1 AS FSTREAMID,"
        " DATEPART(dw, a.FDATETIME) as FDAYOFWEEK, a.FHOUR,"
        " AVG(a.FDVC) as AVG_FDVC"
        " FROM(SELECT IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR,"
        " SUM(FDVC) as FDVC"
        " FROM amr_bridge"
        " WHERE FDATETIME >= :start_date"
        " and FDATETIME < :end_date"
        " GROUP BY IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR) a"
        " GROUP BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA,"
        " DATEPART(dw, a.FDATETIME), a.FHOUR"
        " ORDER BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA,"
        " DATEPART(dw, a.FDATETIME), a.FHOUR"
    )
    df = pd.read_sql_query(
        sql, engine,
        params={"start_date": startDate, "end_date": endDate})
    print('total Data: ' + str(len(df)))

    # Rows of the requested business unit (compared as text, as before).
    columns = ['FDAYOFWEEK', 'FHOUR', 'IDREFPELANGGAN', 'AVG_FDVC']
    unit_mask = df['ID_UNIT_USAHA'] == str(id_unit_usaha)
    df_week1 = df.loc[unit_mask, columns].fillna(0.0)

    # One column per customer, one row per (day of week, hour).
    df_pivoted1 = df_week1.pivot(index=['FDAYOFWEEK', 'FHOUR'],
                                 columns='IDREFPELANGGAN',
                                 values='AVG_FDVC').fillna(0.0)

    # Drop the customers whose series has a mean of exactly 0.
    zero_cols = df_pivoted1.columns[df_pivoted1.mean() == 0]
    df_pivoted1 = df_pivoted1.drop(zero_cols, axis=1)
    cols = list(df_pivoted1.columns)

    seed = 0
    np.random.seed(seed)

    # Convert the data frame to the tslearn format: one series per customer.
    pivoted_series = [df_pivoted1[col].values for col in cols]
    formatted_dataset = to_time_series_dataset(pivoted_series)
    print("Data shape: {}".format(formatted_dataset.shape))

    formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
        formatted_dataset)
    sz = formatted_norm_dataset.shape[1]
    print("Data shape: {}".format(sz))

    clusters = 5
    totalColumn = formatted_norm_dataset.shape[0]
    totalRow = formatted_norm_dataset.shape[1]
    totalData = totalRow * totalColumn + totalRow * clusters

    # NOTE(review): only the centroids are persisted below; the per-customer
    # labels stay on the fitted model and are not written anywhere.
    ks = KShape(n_clusters=clusters, verbose=True, random_state=seed)
    ks.fit(formatted_norm_dataset)

    Base = declarative_base()

    class PL_CUSTOMER_CLUSTER(Base):
        __tablename__ = 'PL_CUSTOMER_CLUSTER'
        ID = Column(Integer, primary_key=True)
        DATE_STAMP = Column(DateTime)
        IDREFPELANGGAN = Column(String(30))
        HOUR_NUM = Column(Integer)
        CLUSTER_NUM = Column(Integer)
        FDVC_NORMALIZED = Column(Float)
        AREA_ID = Column(String(5))

    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        for i in range(clusters):
            print("cluster: " + str(i))
            CLUSTER_NAME = "CENTROID_ID" + str(i)
            for j in range(totalRow):
                fdvc_norm = ks.cluster_centers_[i][j][0]
                item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate,
                                           IDREFPELANGGAN=CLUSTER_NAME,
                                           HOUR_NUM=j,
                                           CLUSTER_NUM=i,
                                           FDVC_NORMALIZED=fdvc_norm,
                                           AREA_ID=id_unit_usaha)
                session.add(item)
                print("fdvc:" + str(fdvc_norm) + "Hour:" + str(j))
            # commit once per centroid
            session.commit()
    finally:
        # Always release the connection, even when an insert fails.
        session.close()

    return totalData
# scale mean around zero
input_waves = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(waves)

# The prediction file and the model file live in the same per-cluster-count
# directory and share the same frequency-band suffix.
clustDir = templatePath + str(numCluster) + "/"
freqTag = str(prefiltFreq[0]) + "-" + str(prefiltFreq[1]) + "Hz.h5"
predFile = clustDir + str(numCluster) + "_cluster_predictions_" + freqTag

# run clustering or skip and load results if desired
if skipClustering:
    # The context manager closes the file even if reading fails.
    with h5py.File(predFile, "r") as clustFile:
        pred = np.array(list(clustFile["cluster_index"]))
        centroids = list(clustFile["centroids"])
else:
    print("Clustering...")
    ks = KShape(n_clusters=numCluster, n_init=1, random_state=0)
    pred = ks.fit_predict(input_waves)

    # Store labels, centroids and inertia next to each other.
    with h5py.File(predFile, "w") as clustFile:
        clustFile.create_dataset("cluster_index", data=pred)
        clustFile.create_dataset("centroids", data=ks.cluster_centers_)
        clustFile.create_dataset("inertia", data=ks.inertia_)

    # Save the fitted model; only possible in this branch, where it exists.
    modelFile = clustDir + str(numCluster) + "_cluster_model_" + freqTag
    ks.to_hdf5(modelFile)
print("Data shape: {}".format(formatted_dataset.shape)) formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform( formatted_dataset) sz = formatted_norm_dataset.shape[1] print("Data shape: {}".format(sz)) formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform( formatted_dataset) totalColumn = formatted_norm_dataset.shape[0] totalRow = formatted_norm_dataset.shape[1] clusters = 5 ks = KShape(n_clusters=clusters, verbose=True, random_state=seed) y_pred_ks = ks.fit_predict(formatted_norm_dataset) formatted_norm_dataset.shape data = formatted_norm_dataset data.shape formatted_norm_dataset_2d = formatted_norm_dataset[:, :, 0] formatted_norm_dataset_2d.shape #pd.DataFrame(A.T.reshape(2, -1), columns=cols) df_normalized = pd.DataFrame(formatted_norm_dataset_2d) df_normalized #df_normalized = df_normalized.pivot() # formatted_norm_dataset[0] df_cluster = pd.DataFrame(y_pred_ks, index=pivoted_columns,
def k_shape(X_train, n_clusters, verbose=True, seed=0):
    """Fit k-Shape on ``X_train``.

    Returns a ``(model, labels)`` tuple: the fitted ``KShape`` instance and
    the labels returned by ``fit_predict``.
    """
    model = KShape(n_clusters=n_clusters, verbose=verbose, random_state=seed)
    labels = model.fit_predict(X_train)
    return model, labels
class Kshape(cs):
    """Data partitioning class based on the K-shape algorithm.

    Parameters:
        * ss : SeriesSupp
            instance of the time series manager

    Variables:
        * seed: int
            Seed value used for the algorithm and the NumPy generator.
        * counter: Counter
            distribution of the objects among the clusters
        * km: KShape
            Instance of the algorithm (``None`` until ``k_init`` is called)
        * clust_name: String
            Name of the algorithm (used when displaying plots)
        * metric: String
            Name of the metric, "shape" here
    """

    def __init__(self, ss):
        super().__init__(ss)
        self.clust_name = "Kshape"
        self.metric = "shape"
        self.km = None
        self.counter = None
        self.seed = 0
        np.random.seed(self.seed)

    def k_init(self, v=True):
        """
        Create the algorithm instance from the current parameters.

        Parameters:
            * v: boolean
                Verbose, prints information about the partitioning

        Returns:
            NA
        """
        # NOTE(review): self.n is not set in this class -- presumably it is
        # provided by the base class; confirm.
        self.km = KShape(
            n_clusters=self.n,
            verbose=v,
            random_state=self.seed,
        )

    def k_fit(self):
        """
        Run the partitioning and store the labels in ``self.ts_clust``.

        Parameters:
            NA

        Returns:
            NA
        """
        # NOTE(review): self.ts is not set in this class -- presumably it is
        # provided by the base class; confirm.
        labels = self.km.fit_predict(self.ts)
        self.ts_clust = labels

    def cluster_counter(self):
        """
        Count the objects inside each cluster.

        Parameters:
            NA

        Returns:
            NA
        """
        self.counter = Counter(self.ts_clust)
print(df)

# Build the clustering input: one time series per row of df.
my_time_series = [df.iloc[row].values for row in range(df.shape[0])]
formatted_dataset = to_time_series_dataset(my_time_series)
print(formatted_dataset.shape)

ks = KShape(n_clusters=100, verbose=True)
y_pred = ks.fit_predict(formatted_dataset)
print(y_pred)

# Flatten the centroids to 2-D (cluster, time) before writing them out.
centroid = ks.cluster_centers_
centroid = centroid.reshape(centroid.shape[:2])
print(centroid.shape)
pd.DataFrame(centroid).to_csv('Results/centroid_lv10.csv')

# Distance of every series to every centroid (rows: series, columns:
# centroids), computed on the 2-D view of the dataset.
flat_dataset = formatted_dataset.reshape(formatted_dataset.shape[:2])
D = cdist(flat_dataset, centroid)
print(D.shape)

# For each centroid, the index of the series with the smallest distance.
selected_ts = np.argmin(D, axis=0)
pd.DataFrame(selected_ts).to_csv('Results/selected_ts_lv12.csv')
pd.DataFrame(y_pred).to_csv('Results/clusterprediction_lv12.csv')
class Kshape():
    """
    Two-cluster k-Shape classification of fixed-length temperature windows.

    Input
        time_span
        data

    data is a pd.DataFrame with columns [DEVICE_DATETIME, TEMPERATURE], where
    DEVICE_DATETIME is the index. data must be sorted by index, ascending.
    data is sampled every 10 seconds, so time_span = 1 means that one time
    series covers 1 minute (6 readings).
    batch is the number of points every time series is resampled to.
    """

    def __init__(
            self,
            time_span=1,
            batch=60,
            data=None,
    ):
        # Stored as a number of readings: 6 readings per unit of time_span.
        self.time_span = time_span * 6
        self.data = data
        self.batch = batch
        self.km = KShape(n_clusters=2,
                         max_iter=50,
                         verbose=True,
                         random_state=0)

    def _to_windows(self, frame, n_windows):
        """Scale the first ``n_windows * time_span`` readings of ``frame``
        and cut them into ``n_windows`` series resampled to ``batch`` points.
        """
        if n_windows == 0:
            raise ValueError(
                "at least %d rows are needed to build one time series"
                % self.time_span)
        # Positional slice: robust even if index labels are duplicated.
        used = frame.iloc[:self.time_span * n_windows]
        ts = np.array(used.TEMPERATURE).reshape(1, -1)
        # All readings are scaled together, as a single series.
        ts = TimeSeriesScalerMeanVariance().fit_transform(ts)
        ts = np.array(ts).reshape(n_windows, -1)
        return TimeSeriesResampler(sz=self.batch).fit_transform(ts)

    def Preprocess(self, x=None):
        """
        Shape the readings into windows of ``time_span`` readings, each
        resampled to ``batch`` points.

        With ``x`` omitted, ``self.data`` is processed and the result is also
        stored in ``self.ts`` (``self.n_data`` / ``self.n_use`` are updated).
        With ``x`` given, ``x`` is processed instead and ``self.x_data`` /
        ``self.x_use`` are updated.

        Returns the windowed dataset.
        Raises ValueError when there are fewer than ``time_span`` rows.
        """
        if x is None:
            self.n_data = len(self.data) // self.time_span
            self.n_use = self.time_span * self.n_data
            self.ts = self._to_windows(self.data, self.n_data)
            return self.ts

        self.x_data = len(x) // self.time_span
        self.x_use = self.time_span * self.x_data
        return self._to_windows(x, self.x_data)

    def classification(self):
        """
        Classify with KShape.

        Readings left over after the last full window are turned into one
        extra series with TimeSeriesResampler and predicted separately.
        After classification a CLUSTER column is created in self.data.
        """
        self.Preprocess()
        self.y_pred = self.km.fit_predict(self.ts)

        # Build the CLUSTER column: every reading of a window gets the label
        # of that window.
        self.cluster = []
        for label in self.y_pred:
            self.cluster.extend([label] * self.time_span)

        # When readings are left over, build a single time series from them
        # with the resampler and predict it.
        if self.n_use != len(self.data):
            self.ts_c = self.data.iloc[self.n_use:]
            self.ts_c = np.array(self.ts_c.TEMPERATURE).reshape(1, -1)
            self.ts_batch = TimeSeriesResampler(sz=self.batch).fit_transform(
                self.ts_c)
            # predict returns a 1-element array; index it explicitly instead
            # of converting the whole array to int.
            tail_label = int(self.km.predict(self.ts_batch)[0])
            self.y_pred_c = [tail_label] * self.ts_c.shape[1]
            self.cluster.extend(self.y_pred_c)

        self.data["CLUSTER"] = self.cluster

    def draw_graph(self, x=None):
        """Scatter-plot TEMPERATURE over DEVICE_DATETIME coloured by CLUSTER.

        Plots ``self.data`` when ``x`` is omitted, otherwise ``x``.
        """
        frame = self.data if x is None else x

        fig, ax = plt.subplots()
        sns.scatterplot(data=frame,
                        x="DEVICE_DATETIME",
                        y="TEMPERATURE",
                        hue="CLUSTER")
        locator = mdates.AutoDateLocator(minticks=4, maxticks=10)
        formatter = mdates.ConciseDateFormatter(locator=locator)
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
        plt.show()

    def predict(self, x):
        """Predict the cluster of every reading of ``x`` with the fitted
        model, store the labels in ``x["CLUSTER"]`` and plot the result.
        """
        ts = self.Preprocess(x=x)
        pred = self.km.predict(ts)

        cluster = []
        for label in pred[:self.x_data]:
            cluster.extend([label] * self.time_span)

        # When readings are left over, build a single time series from them
        # with the resampler and predict it.
        if self.x_use != len(x):
            self.x_c = x.iloc[self.x_use:]
            self.x_c = np.array(self.x_c.TEMPERATURE).reshape(1, -1)
            self.x_batch = TimeSeriesResampler(sz=self.batch).fit_transform(
                self.x_c)
            tail_label = int(self.km.predict(self.x_batch)[0])
            cluster.extend([tail_label] * self.x_c.shape[1])

        x["CLUSTER"] = cluster
        self.draw_graph(x=x)
# # print("cosine:", distance)
# plt.imshow(distance)
# plt.show()

# In[2]
np.random.shuffle(all_data)
print(all_data.shape)

# For this method to operate properly, prior scaling is required
scaler = TimeSeriesScalerMeanVariance()
x_train = scaler.fit_transform(all_data)
sz = x_train.shape[1]

# kShape clustering
seed = 0
ks = KShape(n_clusters=2, verbose=True, random_state=seed)
y_pred = ks.fit_predict(x_train)
print(x_train.shape)
print(y_pred.shape)

# Two-column grid: column yi holds the members of cluster yi, one per row.
plt.figure()
for yi in range(2):
    members = x_train[y_pred == yi]
    N = len(members)
    for ind, xx in enumerate(members):
        plt.subplot(N, 2, 2 * ind + yi + 1)
        plt.plot(xx)
    # plt.plot(xx.ravel(), "k-", alpha=.2)
    # plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    # plt.xlim(0, sz)
from tslearn.clustering import KShape

# Load the series from CSV and convert them to the tslearn dataset format.
hum_sub = np.loadtxt('../../HUM_subs.csv', delimiter=',', skiprows=0)
print(hum_sub.shape)
X = to_time_series_dataset(hum_sub)
print(X.shape)

# Rescale every series with the mean/variance scaler before clustering.
scaler = TimeSeriesScalerMeanVariance()
X = scaler.fit_transform(X)
sz = X.shape[1]

seed = 4
np.random.seed(seed)
nclust = 3
ks = KShape(n_clusters=nclust, verbose=True, random_state=seed)
y_pred = ks.fit_predict(X)

# Labels are printed 1-based.
print(y_pred + 1)
print(len(y_pred))

# One subplot per cluster: members in translucent black, centroid in red.
plt.figure()
for cluster_idx in range(nclust):
    plt.subplot(nclust, 1, cluster_idx + 1)
    members = X[y_pred == cluster_idx]
    for series in members:
        plt.plot(series.ravel(), "k-", alpha=0.2)
    centroid = ks.cluster_centers_[cluster_idx]
    plt.plot(centroid.ravel(), "r-")
    plt.xlim(0, sz)
def main(argv):
    """Cluster the input time series with the selected algorithms.

    Parses the command-line options in ``argv``, imports and plots the raw
    data, interpolates NaN values, optionally normalises/standardises the
    data, runs every selected clustering algorithm (Euclidean, DTW and
    soft-DTW k-means, k-Shape, GAK kernel k-means), draws one row of subplots
    per algorithm and finally saves the figure and the label arrays under
    ``./out/<timestamp>/``.

    The label arrays are also published through module-level globals.

    :param argv: command-line arguments without the program name
    :return: None; exits with status 2 on invalid options
    """
    # define global timer to obtain global execution time
    start_global = timer()

    # define globals variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data

    banner = "#############################################################################################"

    # ---------------------------------------------------------------------
    # Input arguments parsing
    # ---------------------------------------------------------------------

    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling)' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        ' -c / --clusters <number_clusters> # set number of clusters (default 3) \n\n' \
        ' -i / --ifile <input_file> # set input filename \n' \
        ' -n / --normalise # normalise input data \n' \
        ' -s / --standardise # standardise input data \n\n' \
        ' -a / --all # perform all 5 implemented methods of clustering: \n' \
        ' euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        ' -E / --euclidean # perform euclidean k-means clustering \n' \
        ' -D / --dtw # perform dtw k-means clustering \n' \
        ' -S / --soft-dtw # perform soft-dtw k-means clustering \n' \
        ' -K / --k-shape # perform k-shape clustering \n' \
        ' -G / --gak # perform GAK k-means clustering \n'

    # Create new object to save arguments
    i_args = Arguments()

    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3

    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help",
                "clusters=",
                "ifile=",
                "normalise",
                "standardise",
                "all",
                "euclidean",
                "dtw",
                "soft-dtw",
                "k-shape",
                "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            # getopt yields strings; the value is used as an integer count
            # (range(), grid columns, n_clusters), so convert it here.
            try:
                i_args.number_clusters = int(arg)
            except ValueError:
                print(help_message)
                sys.exit(2)
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True

    # normalise maximum number of subplots levels
    n_rows_plot = 8 if n_rows_plot > 8 else n_rows_plot

    # Column whose subplot carries the title of a row: the second one, or
    # the only one when a single cluster is requested.
    title_column = min(1, i_args.number_clusters - 1)

    # ---------------------------------------------------------------------
    # Raw data processing stage
    # ---------------------------------------------------------------------

    # set style to matplotlib plot
    mpl.style.use('seaborn')

    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)

    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())

    # convert raw data to the format which can be used by tslearn
    # (3-d dimensional array)
    # BUILT functionality: adjust all time series to one size
    # (NaN values are appended to the shorter ones)
    formatted_data = to_time_series_dataset(raw_data)

    # print shape of new array
    print(formatted_data.shape)

    # obtain number of measuring
    n_measuring = formatted_data.shape[1]

    # define figure, grid_spec to create layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(
        n_rows_plot,
        i_args.number_clusters
    )

    # set A4 size to figure
    fig.set_size_inches(8.5, 11.75)

    # setup count of layers of subplots
    count_layer = 3

    # setup first subplot and draw raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])
    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)

    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()

    # draw title for chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)'
                            % (formatted_data_min, formatted_data_max))

    # obtain and print executing time of data processing stage to console
    timer_tick = get_time_tick(start_global)
    plt.ion()
    plt.show()
    print("Raw data processing time: %s" % timer_tick)

    # ---------------------------------------------------------------------
    # Data preprocessing stage
    # ---------------------------------------------------------------------
    start = timer()

    # Convert NaNs to value predicted by interpolation
    # linearly interpolate for NaN/NaNs
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)

    # Scaling
    # to know should we use normalization or standardization, we need to see
    # the distribution of values.
    # take one random measuring per cluster column to draw histograms
    random_indexes = numpy.random.choice(n_measuring,
                                         i_args.number_clusters,
                                         replace=False)

    # create new arrays with values of randomly chosen measurements
    histogram_data = formatted_data[:, random_indexes]

    # draw histograms
    for i_histogram in range(i_args.number_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram],
            bins=25,
            density=True
        )
        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy"
                            )
        if i_histogram == title_column:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"
            preprocessing = '' if preprocessing == '' \
                else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy',
                y=1,
                pad=14
            )

    # if no processing data option chosen continue with raw data
    processed_data = formatted_data

    # since for this concrete challenge data the distributions are more/less
    # Gaussian/Normal we can use standardization

    # normalize data: Min-Max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")

    # standardize data: scaling technique where the values are centered around
    # the mean with a unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")

    # obtain max value of data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2

    timer_tick = get_time_tick(start)
    print(banner)
    print("Data processing stage elapsed time: %s" % timer_tick)

    # ---------------------------------------------------------------------
    # Implementing Euclidean k-means clustering algorithm
    # ---------------------------------------------------------------------
    if i_args.euclidean_clustering:
        start = timer()
        print("Euclidean k-means")

        # define parameters of the model of the algorithm
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=i_args.number_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )

        # calculate cluster's label array
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)

        # draw subplots with attributed clusters of time series as well as
        # cluster centers' lines
        for i_cluster in range(i_args.number_clusters):
            f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer,
                                                i_cluster, n_measuring,
                                                min_data, max_data,
                                                processed_data,
                                                euclidean_clustered_data,
                                                'tab:blue')
            f_ax_euclidean.plot(
                k_means_euclidean.cluster_centers_[i_cluster].ravel(),
                "tab:green"
            )
            if i_cluster == title_column:
                middle_axis = f_ax_euclidean

        # increment count of filled layer of subplots
        count_layer += 1

        # obtain processing time, print it to console and
        # add it to the title of the series of subplots
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Euclidean $k$-means (%s)" % timer_tick,
            color='tab:green',
            y=1,
            pad=14
        )
        print(banner)
        print("Euclidean k-means time processing: %s" % timer_tick)

    # ---------------------------------------------------------------------
    # Implementing DTW k-means clustering algorithm
    # use dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    # ---------------------------------------------------------------------
    if i_args.dtw_clustering:
        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6
                                       )
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer,
                                          i_cluster, n_measuring,
                                          min_data, max_data,
                                          processed_data,
                                          dtw_clustered_data,
                                          'tab:blue')
            f_ax_dtw.plot(
                k_means_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:red"
            )
            if i_cluster == title_column:
                middle_axis = f_ax_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "DTW $k$-means (%s)" % timer_tick,
            color='tab:red',
            y=1,
            pad=14
        )
        print(banner)
        print("DTW k-means time processing: %s" % timer_tick)

    # ---------------------------------------------------------------------
    # Implementing soft DTW k-means clustering algorithm
    # use soft dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    # ---------------------------------------------------------------------
    if i_args.soft_dtw_clustering:
        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6
                                            )
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer,
                                               i_cluster, n_measuring,
                                               min_data, max_data,
                                               processed_data,
                                               soft_dtw_clustered_data,
                                               'tab:blue')
            f_ax_soft_dtw.plot(
                k_means_soft_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:purple"
            )
            if i_cluster == title_column:
                middle_axis = f_ax_soft_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Soft-DTW $k$-means (%s)" % timer_tick,
            color='tab:purple',
            y=1,
            pad=14
        )
        print(banner)
        print("Soft-DTW k-means time processing: %s" % timer_tick)

    # ---------------------------------------------------------------------
    # Implementing k-Shape clustering algorithm
    # ---------------------------------------------------------------------
    if i_args.k_shape_clustering:
        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=i_args.number_clusters,
                         verbose=True,
                         random_state=seed
                         )
        k_shape_clustered_data = k_shape.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            # Widen the axis range so that the centroid always fits.
            center = k_shape.cluster_centers_[i_cluster].ravel()
            min_axe_value = min(min_data, center.min())
            max_axe_value = max(max_data, center.max())
            f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer,
                                              i_cluster, n_measuring,
                                              min_axe_value, max_axe_value,
                                              processed_data,
                                              k_shape_clustered_data,
                                              'tab:blue')
            f_ax_k_shape.plot(
                k_shape.cluster_centers_[i_cluster].ravel(),
                "tab:orange"
            )
            if i_cluster == title_column:
                middle_axis = f_ax_k_shape

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "$K$-Shape (%s)" % timer_tick,
            color='tab:orange',
            y=1,
            pad=14
        )
        print(banner)
        print("K-Shape time processing: %s" % timer_tick)

    # ---------------------------------------------------------------------
    # Implementing Global Alignment kernel k-means clustering algorithm
    # since kernel is used, there is no centroid of the cluster
    # ---------------------------------------------------------------------
    if i_args.gak_clustering:
        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6
                                   )
        gak_clustered_data = gak_k_means.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer,
                                                  i_cluster, n_measuring,
                                                  min_data, max_data,
                                                  processed_data,
                                                  gak_clustered_data,
                                                  'tab:blue')
            if i_cluster == title_column:
                middle_axis = f_ax_gak_k_means

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Global Alignment kernel $k$-means (%s)" % timer_tick,
            color='tab:cyan',
            y=1,
            pad=14)
        print(banner)
        print("GAK k-means time processing: %s" % timer_tick)

    # ---------------------------------------------------------------------
    # Saving results
    # ---------------------------------------------------------------------

    # return string with current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    # define the name of the directory to be created
    path = "./out/%s" % now
    print(banner)

    try:
        # makedirs also creates ./out when it does not exist yet
        os.makedirs(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)

    try:
        # save figure as pdf to out folder
        fig.savefig("./out/%s/visual_result.pdf" % now)

        # save clustering results
        if i_args.euclidean_clustering:
            numpy.savetxt(
                "./out/%s/euclidean_clustering_result.csv" % now,
                euclidean_clustered_data,
                delimiter=","
            )
        if i_args.dtw_clustering:
            numpy.savetxt(
                "./out/%s/dtw_clustering_result.csv" % now,
                dtw_clustered_data,
                delimiter=","
            )
        if i_args.soft_dtw_clustering:
            numpy.savetxt(
                "./out/%s/soft_dtw_clustering_result.csv" % now,
                soft_dtw_clustered_data,
                delimiter=","
            )
        if i_args.k_shape_clustering:
            numpy.savetxt(
                "./out/%s/k_shape_clustering_result.csv" % now,
                k_shape_clustered_data,
                delimiter=","
            )
        if i_args.gak_clustering:
            numpy.savetxt(
                "./out/%s/gak_clustering_result.csv" % now,
                gak_clustered_data,
                delimiter=","
            )
    except (OSError, RuntimeError):
        # File-system failures (e.g. missing directory) surface as OSError.
        print("Saving results failed")
    else:
        print("Successfully saved results in the path %s " % path)

    # obtain and print global executing time
    timer_tick = get_time_tick(start_global)
    print(banner)
    print("All algorithms elapsed time: % s" % timer_tick)

    # render and show plot
    # plt.show()
    plt.draw()
    plt.pause(0.001)
    input("Press [enter] to finish.")
    print(banner)
# In[111]:

# Reshape the per-cell series into a 3-D array of shape
# (n_series, len(starttimes), 1); NaNs are replaced with 0 first.
data = np.reshape(np.nan_to_num(users), (-1, len(starttimes), 1))

# Fixed seed so that repeated runs give the same clustering.
seed = 0
np.random.seed(seed)
sz = data.shape[1]  # length of each series (number of time steps)

# One row per cell; one extra column per tested cluster count.
# NOTE(review): only the first 1000 cell names are used, so `data` is
# presumably expected to hold exactly 1000 series -- confirm upstream.
output = pd.DataFrame()
output['cellname'] = full_cells[0:1000]

for cluster_number in cluster_numbers:
    print(cluster_number)
    ks = KShape(n_clusters=cluster_number, verbose=True, random_state=seed)
    y_pred = ks.fit_predict(data)
    # Store the labels under a column named after the cluster count.
    output[cluster_number] = y_pred


# In[ ]:

# Best-effort reload of previously saved cluster assignments; when the
# file cannot be read the in-memory `output` computed above is kept.
try:
    output = pd.read_csv(
        "/vha/home/61072380/seca_hh_clusters_20190930_200800.csv")
    # Drop the first column of the loaded file (presumably the index
    # column written when the CSV was saved -- confirm).
    output = output.drop(output.columns[0], axis=1)
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate and the run can be aborted.
    print("No scaled data")

# Summary frame indexed by the tested cluster counts.
cluster_counts = pd.DataFrame()
cluster_counts['index'] = cluster_numbers
cluster_counts = cluster_counts.set_index('index', drop=True)

cluster_medians = pd.DataFrame()