def k_init(self, v=True):
    """Initialise the KShape instance with the current parameters.

    Parameters:
        * v: boolean
            Verbose; print information about the partitioning.

    Returns:
        None
    """
    self.km = KShape(n_clusters=self.n, verbose=v, random_state=self.seed)
def cal_k_shape(self, data, num_cluster):
    """Cluster a time series dataset with k-Shape.

    :param data: time series dataset
    :param num_cluster: number of clusters
    :return: cluster label for each series
    """
    ks = KShape(n_clusters=num_cluster, n_init=5, verbose=True,
                random_state=self.seed)
    y_pred = ks.fit_predict(data)
    return y_pred
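# A minimal, self-contained sketch of the call pattern used by cal_k_shape
# above. k-Shape compares shapes via normalized cross-correlation, so the
# series are z-normalized first; the data and parameter values here are
# illustrative only.
import numpy as np
from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

rng = np.random.RandomState(0)
data = TimeSeriesScalerMeanVariance().fit_transform(rng.randn(20, 60, 1))
ks = KShape(n_clusters=3, n_init=5, verbose=False, random_state=0)
labels = ks.fit_predict(data)
print(np.bincount(labels))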
def __init__( self, time_span=1, batch=60, data=None, ): self.time_span = time_span * 6 self.data = data self.batch = batch self.km = KShape(n_clusters=2, max_iter=50, verbose=True, random_state=0)
def test_kshape(): n, sz, d = 15, 10, 3 rng = np.random.RandomState(0) time_series = rng.randn(n, sz, d) time_series = TimeSeriesScalerMeanVariance().fit_transform(time_series) ks = KShape(n_clusters=3, n_init=1, verbose=False, random_state=rng).fit(time_series) dists = ks._cross_dists(time_series) np.testing.assert_allclose(ks.labels_, dists.argmin(axis=1)) np.testing.assert_allclose(ks.labels_, ks.predict(time_series)) assert KShape(n_clusters=101, verbose=False, random_state=rng).fit(time_series)._X_fit is None
def do_kshape(days, km_size):
    """
    From a time series (as a list of df called days), creates km_size clusters
    using the k-Shape algorithm.
    """
    # Arrange data for our lib
    unq = days["n_day_"].unique()
    values = [days[days["n_day_"] == l]["val_"].values for l in unq]
    formatted_dataset = to_time_series_dataset(values)
    # Configure our k-Shape model
    kshape = KShape(n_clusters=km_size, random_state=42, verbose=False)
    y_pred = kshape.fit_predict(formatted_dataset)
    return kshape, y_pred
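# Hypothetical input for do_kshape: one row per reading, with the day index
# in "n_day_" and the measured value in "val_" (column names taken from the
# function above; the data itself is made up).
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
days = pd.DataFrame({
    "n_day_": np.repeat(np.arange(10), 24),  # 10 days, 24 samples each
    "val_": rng.randn(240),
})
model, labels = do_kshape(days, km_size=3)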
def clustering_Kshape(tsdata, n_clusters, random_state, n_init, max_iter=100):
    np.random.seed(random_state)
    # k-Shape relies on cross-correlation, so tsdata should be z-normalized
    # beforehand, e.g.:
    # tsdata = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(tsdata)
    # Instantiate the KShape class
    ks = KShape(n_clusters=n_clusters, n_init=n_init, verbose=True,
                random_state=random_state, max_iter=max_iter)
    y_pred = ks.fit_predict(tsdata)
    return y_pred
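# Sketch of applying the normalization step mentioned above before the call
# (random data purely for illustration):
import numpy as np
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

tsdata = np.random.RandomState(0).randn(30, 50, 1)
tsdata = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(tsdata)
labels = clustering_Kshape(tsdata, n_clusters=3, random_state=0, n_init=2)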
def _create_model(self, model): """ A messy function until the latest tslearn is out with a fix for array-based hyper-parameters """ # Convert the lists back to arrays for k in model['model_params'].keys(): param = model['model_params'][k] if type(param) is list: arr = np.array(param) if arr.dtype == object: # Then maybe it was rather a list of arrays # This is very hacky... arr = [np.array(p) for p in param] model['model_params'][k] = arr for k in model['hyper_params'].keys(): param = model['hyper_params'][k] if type(param) is list: arr = np.array(param) if arr.dtype == object: # Then maybe it was rather a list of arrays # This is very hacky... arr = [np.array(p) for p in param] model['hyper_params'][k] = arr hyper_params = model['hyper_params'] model_params = model['model_params'] inst = KShape(**hyper_params) for p in model_params.keys(): setattr(inst, p, model_params[p]) return inst
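# Sketch of the dict shape _create_model expects (inferred from the body
# above, not taken from the source): 'hyper_params' feed the KShape
# constructor, 'model_params' are set as fitted attributes, and any arrays
# arrive JSON-style as (possibly nested) lists.
model = {
    "hyper_params": {"n_clusters": 2, "max_iter": 10, "random_state": 0},
    "model_params": {
        "cluster_centers_": [[[0.1], [0.2]], [[0.3], [0.4]]],  # 2 x 2 x 1
        "labels_": [0, 1, 1, 0],
    },
}
# obj._create_model(model) would return a KShape with attributes restored
# (`obj` stands in for an instance of the enclosing class).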
def plot_elbow(self, data):
    """Draw an elbow plot of inertia over a range of cluster counts.

    :param data: multivariate time series, type np.array
    :return: elbow plot
    """
    distortions = []
    for i in range(2, 7):
        ks = KShape(n_clusters=i, n_init=5, verbose=True, random_state=self.seed)
        ks.fit(data)
        distortions.append(ks.inertia_)
    plt.plot(range(2, 7), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.show()
def single_clustering(self, data_raw, data_new, centroid_num, model):
    """Run a single clustering pass with the chosen model.

    :return: inertia, dict mapping cluster id to raw samples, centers
    """
    seed = 0
    np.random.seed(seed)
    labels = []
    inertia = []
    centers = []
    if model == 'K-Means':
        kmeans = KMeans(n_clusters=centroid_num).fit(data_new)
        labels = kmeans.labels_
        centers = kmeans.cluster_centers_
        # Sum of distances of each point to its cluster centroid;
        # smaller is better
        inertia = kmeans.inertia_
    elif model == 'DTW':
        sdtw_km = TimeSeriesKMeans(n_clusters=centroid_num,
                                   metric='softdtw',
                                   max_iter=2,
                                   max_iter_barycenter=2,
                                   metric_params={"gamma": 1.0},
                                   random_state=0,
                                   verbose=True).fit(data_new)
        labels = sdtw_km.labels_
        centers = sdtw_km.cluster_centers_
        inertia = sdtw_km.inertia_
    elif model == "K-Shape":
        ks = KShape(n_clusters=centroid_num, verbose=True,
                    random_state=seed).fit(data_new)
        labels = ks.labels_
        centers = ks.cluster_centers_
        inertia = ks.inertia_
    elif model == "Kernel-KMeans":
        data_new = data_new[:100]
        data_raw = data_raw[:100]
        kk = KernelKMeans(n_clusters=centroid_num,
                          kernel="gak",
                          kernel_params={"sigma": "auto"},
                          max_iter=2,
                          tol=1e-4,
                          verbose=True).fit(data_new)
        labels = kk.labels_
        inertia = kk.inertia_
    D = {}
    for i in range(centroid_num):
        D[i] = []
    for i in range(len(data_raw)):
        D[labels[i]].append(data_raw[i])
    return inertia, D, centers
def kshape(container: DataFrameContainer, data_column: str, n_clusters: int,
           max_iter: int = 300, tol: float = 1e-6, n_init: int = 1,
           verbose: bool = True, random_state: Union[int, None] = None,
           init: Union[str, np.ndarray] = 'random',
           centroid_seeds: np.ndarray = None):
    """
    :param container: DataFrameContainer holding the data
    :param data_column: name of the dataframe column with the series values
    :param n_clusters: number of clusters to form
    :param max_iter: maximum number of iterations for a single run
    :param tol: convergence tolerance
    :param n_init: number of runs with different centroid seeds
    :param verbose: verbosity mode
    :param random_state: seed for centroid initialization
    :param init: 'random' or an array of initial centers
    :param centroid_seeds: arrays of shape [n_clusters, ts_size]; overrides init
    :return: container with a 'KSHAPE_CLUSTER' column added
    """
    if centroid_seeds is not None:
        init = np.swapaxes(np.array([centroid_seeds]).T, 0, 1)

    ks = KShape(n_clusters=n_clusters, max_iter=max_iter, tol=tol,
                n_init=n_init, verbose=verbose, random_state=random_state,
                init=init)

    X = np.vstack(container.dataframe[data_column].values)
    y = ks.fit_predict(X)

    container.dataframe['KSHAPE_CLUSTER'] = y
    return container
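# What the centroid_seeds reshaping above does: KShape expects init of shape
# (n_clusters, ts_size, d). For 1-D seeds of shape (n_clusters, ts_size), the
# swapaxes trick appends the trailing channel axis:
import numpy as np

seeds = np.random.RandomState(0).randn(3, 40)   # (n_clusters, ts_size)
init = np.swapaxes(np.array([seeds]).T, 0, 1)
print(init.shape)                               # (3, 40, 1)
# seeds[:, :, np.newaxis] is an equivalent, arguably clearer spelling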
def kshape_grid_iter(X_partitioned: List[np.ndarray],
                     kshape_kwargs: dict) -> Tuple[KShape, int]:
    # np.random.randint's upper bound is exclusive, so use X.shape[0] to keep
    # every series eligible as a seed
    seed_ixs = [np.random.randint(0, X.shape[0]) for X in X_partitioned]
    centroid_seeds = np.array(
        [X_partitioned[i][seed] for i, seed in enumerate(seed_ixs)])
    init = np.swapaxes(np.array([centroid_seeds]).T, 0, 1)

    kshape = KShape(n_clusters=len(seed_ixs), init=init, verbose=True,
                    random_state=None, **kshape_kwargs)

    X = np.vstack(X_partitioned)
    print('** Fitting ks model **')
    kshape.fit(X)
    print('** Predicting **')
    n_clusters_out = np.unique(kshape.predict(X)).size

    # until the tslearn hyper-param json issue is released in the latest
    # pypi version
    kshape.init = kshape.init.tolist()

    return kshape, n_clusters_out
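# Sketch of driving kshape_grid_iter: each partition contributes one random
# seed series, so the number of partitions fixes n_clusters. Partitions are
# 2-D (n_series, ts_size) arrays, matching the vstack/seeding logic above;
# the data and kwargs here are illustrative.
import numpy as np

rng = np.random.RandomState(0)
X_partitioned = [rng.randn(20, 40) for _ in range(3)]   # 3 partitions
ks, n_found = kshape_grid_iter(X_partitioned, {"max_iter": 20, "n_init": 1})
print(n_found)   # number of clusters actually populated after prediction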
def plot_best_shape(self, data, num_cluster):
    """Plot each time series cluster together with its centroid.

    :param data: time series dataset
    :param num_cluster: number of clusters
    :return: cluster plot
    """
    ks = KShape(n_clusters=num_cluster, n_init=5, verbose=True,
                random_state=self.seed)
    y_pred = ks.fit_predict(data)
    for yi in range(num_cluster):
        for xx in data[y_pred == yi]:
            plt.plot(xx.ravel(), "k-", alpha=.3)
        plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
        plt.text(0.55, 0.85, 'Cluster %d' % (yi + 1),
                 transform=plt.gca().transAxes)
    plt.tight_layout()
    plt.show()
def run_single(X, train, params, workdir, out):
    kwargs = params
    ks = KShape(**kwargs)
    ks.fit(train)
    print('**** Predicting ****')
    y_pred = ks.predict(X)

    ks_path = os.path.join(workdir, 'ks.pickle')
    with open(ks_path, 'wb') as f:
        pickle.dump(ks, f)

    y_pred_path = os.path.join(workdir, 'y_pred.npy')
    np.save(y_pred_path, y_pred)

    train_path = os.path.join(workdir, 'train.npy')
    np.save(train_path, train)

    with open(out, 'w') as f:
        f.write('1')

    print('* Done! *')
def run(data_path: str, params_path: str):
    X = np.load(data_path)
    with open(params_path, 'rb') as f:
        params = pickle.load(f)

    workdir = params['workdir']
    out = os.path.join(workdir, 'out')
    with open(out, 'w') as f:
        f.write('0')

    print(f'Using work dir: {workdir}')
    print('** Fitting training data **')

    n_train = int((params['kwargs'].pop('train_percent') / 100) * X.shape[0])
    train = X[np.random.choice(X.shape[0], size=n_train, replace=False)]

    kwargs = params['kwargs']
    ks = KShape(**kwargs)
    ks.fit(train)

    print('**** Predicting ****')
    y_pred = ks.predict(X)

    ks_path = os.path.join(workdir, 'ks.pickle')
    with open(ks_path, 'wb') as f:
        pickle.dump(ks, f)

    y_pred_path = os.path.join(workdir, 'y_pred.npy')
    np.save(y_pred_path, y_pred)

    train_path = os.path.join(workdir, 'train.npy')
    np.save(train_path, train)

    with open(out, 'w') as f:
        f.write('1')

    print('* Done! *')
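# Hypothetical driver for run(): the params pickle holds a work directory and
# the KShape kwargs, with a 'train_percent' entry that run() pops off to size
# the training subsample (structure inferred from the function body above;
# the paths and values are made up).
import os
import pickle
import numpy as np

workdir = "/tmp/kshape_run"
os.makedirs(workdir, exist_ok=True)
np.save(os.path.join(workdir, "X.npy"), np.random.randn(50, 30, 1))
params = {
    "workdir": workdir,
    "kwargs": {"n_clusters": 3, "train_percent": 80, "random_state": 0},
}
with open(os.path.join(workdir, "params.pickle"), "wb") as f:
    pickle.dump(params, f)
# run(os.path.join(workdir, "X.npy"), os.path.join(workdir, "params.pickle"))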
def test_serialize_kshape(): n, sz, d = 15, 10, 3 rng = numpy.random.RandomState(0) time_series = rng.randn(n, sz, d) X = TimeSeriesScalerMeanVariance().fit_transform(time_series) ks = KShape(n_clusters=3, verbose=True) _check_not_fitted(ks) ks.fit(X) _check_params_predict(ks, X, ['predict']) seed_ixs = [numpy.random.randint(0, X.shape[0] - 1) for i in range(3)] seeds = numpy.array([X[i] for i in seed_ixs]) ks_seeded = KShape(n_clusters=3, verbose=True, init=seeds) _check_not_fitted(ks_seeded) ks_seeded.fit(X) _check_params_predict(ks_seeded, X, ['predict'])
def __init__( self, num_clusters: int, clustering_method: str, kmeans_metric: str, max_iter: int, ): super().__init__() self.num_clusters = num_clusters self.kmeans_metric = kmeans_metric self.clustering_method = clustering_method self.max_iter = max_iter self.test_metric = purity self.final_labels = None if self.clustering_method == "kshape": self.cluster_model = KShape(n_clusters=self.num_clusters, max_iter=self.max_iter) elif self.clustering_method == "kmeans": self.cluster_model = TimeSeriesKMeans( n_clusters=self.num_clusters, metric=self.kmeans_metric, max_iter=self.max_iter, )
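# `purity` is referenced above but not defined in this snippet; a common
# definition, offered here as an assumption rather than the authors' code,
# scores each predicted cluster by its majority true class:
import numpy as np

def purity(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Fraction of samples assigned to the majority true class of their cluster."""
    total = 0
    for c in np.unique(y_pred):
        members = y_true[y_pred == c]   # true labels inside cluster c
        if members.size:
            total += np.bincount(members).max()
    return total / y_true.size

print(purity(np.array([0, 0, 1, 1]), np.array([0, 0, 1, 0])))  # 0.75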
def k_shape(X_train, n_clusters, verbose=True, seed=0):
    # k-Shape clustering
    ks = KShape(n_clusters=n_clusters, verbose=verbose, random_state=seed)
    return ks, ks.fit_predict(X_train)
def train(self, X_train, save_memory=False):
    """The training phase of the EUDTR model.

    Args:
        X_train: Train dataset
        save_memory: If True, back-propagate each loss term separately to
            reduce GPU memory usage
    """
    train_torch_dataset = IndexedDatase(
        X_train, numpy.array(list(range(X_train.shape[0]))))
    train_generator = torch.utils.data.DataLoader(
        train_torch_dataset, batch_size=self.batch_size, shuffle=True)

    # Initial pseudo-labels from k-Shape (tslearn expects channels last)
    X_train = X_train.swapaxes(1, 2)
    ks = KShape(n_clusters=self.n_clusters).fit(X_train)
    labels = ks.labels_
    X_train = X_train.swapaxes(1, 2)

    sc_score = -1
    label2index = list2dict(labels)

    for epoch in range(self.epochs):
        epoch_start = time.time()
        self.encoder = self.encoder.train()
        for batch_num, batch in enumerate(train_generator):
            loss = 0
            indices, data = batch
            pos_samples, neg_samples = pos_neg_sampling(
                X_train, labels, label2index, indices, data,
                self.nb_random_samples)
            pos_samples = torch.from_numpy(pos_samples)
            neg_samples = torch.from_numpy(neg_samples).permute(1, 0, 2, 3)
            data = data.to(self.device)
            pos_samples = pos_samples.to(self.device)
            neg_samples = neg_samples.to(self.device)

            self.encoder_optimizer.zero_grad()
            self.decoder_optimizer.zero_grad()

            ref_embedding = self.encoder(data)
            pos_i_embedding = self.encoder(pos_samples)

            # Calculate the PN-Triplet loss and backward
            loss = -torch.mean(torch.nn.functional.logsigmoid(torch.bmm(
                ref_embedding.view(data.shape[0], 1, self.out_channels),
                pos_i_embedding.view(data.shape[0], self.out_channels, 1)
            )))
            if save_memory:
                loss.backward(retain_graph=True)
                loss = 0
                del pos_i_embedding
                torch.cuda.empty_cache()

            multiplicative_ratio = self.negative_penalty / self.nb_random_samples
            for i in range(self.nb_random_samples):
                neg_i_embedding = self.encoder(neg_samples[i])
                loss += multiplicative_ratio * -torch.mean(
                    torch.nn.functional.logsigmoid(-torch.bmm(
                        ref_embedding.view(data.shape[0], 1, self.out_channels),
                        neg_i_embedding.view(data.shape[0], self.out_channels, 1)
                    )))
                if save_memory:
                    loss.backward(retain_graph=True)
                    loss = 0
                    del neg_i_embedding
                    torch.cuda.empty_cache()

            if not save_memory:
                # back-propagate the accumulated PN-Triplet loss in one pass
                # (the original accumulated it without ever calling backward)
                loss.backward()

            # Calculate the MI loss and backward
            self.mi_loss(data, neg_samples, self.encoder, self.decoder,
                         save_memory)

            self.encoder_optimizer.step()
            self.decoder_optimizer.step()

        epoch_end = time.time()
        print('Train--Epoch: ', epoch + 1, " time: ", epoch_end - epoch_start)

        features = self.encode(X_train, self.batch_size)
        # To speed up convergence of the silhouette score, stop refreshing the
        # pseudo-labels once the score fails to improve 3 consecutive times.
        consecutive_failures = 0
        while True:
            km = KMeans(n_clusters=self.n_clusters).fit(
                features.reshape(features.shape[0], -1))
            temp_score = silhouette_score(
                features.reshape(features.shape[0], -1), km.labels_)
            if temp_score > sc_score:
                consecutive_failures = 0
                sc_score = temp_score
                print('sc_score changed:', sc_score)
                labels = km.labels_
                label2index = list2dict(labels)
            else:
                consecutive_failures = consecutive_failures + 1
                if consecutive_failures == 3:
                    break
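# `list2dict` is used above to group sample indices by cluster label; a
# plausible implementation, offered as an assumption rather than the
# authors' code:
def list2dict(labels):
    """Map each cluster label to the list of sample indices carrying it."""
    label2index = {}
    for idx, lab in enumerate(labels):
        label2index.setdefault(lab, []).append(idx)
    return label2index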
import numpy
import matplotlib.pyplot as plt

from tslearn.clustering import KShape
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

seed = 0
numpy.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
X_train = TimeSeriesScalerMeanVariance().fit_transform(
    X_train[:50])  # Keep only 50 time series
sz = X_train.shape[1]

# k-Shape clustering
ks = KShape(n_clusters=3, verbose=True, random_state=seed)
y_pred = ks.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 1, 1 + yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()
plt.show()
                (c + 1)] = input_waves[
                    i, (snipLen * fs + 1) * c:(snipLen * fs + 1) * (c + 1)] / max(
                        np.abs(input_waves[i, (snipLen * fs + 1) * c:
                                           (snipLen * fs + 1) * (c + 1)]))
else:
    for i in range(len(input_waves)):
        input_waves[i, :] = input_waves[i, :] / max(np.abs(input_waves[i, :]))

# save result
if skipClustering == 0:
    # run clustering
    print("Clustering...")
    ks = KShape(n_clusters=numCluster, n_init=1, random_state=0)
    pred = ks.fit_predict(input_waves)  # fit once and predict in a single call
    clustFile = h5py.File(
        templatePath + str(numCluster) + "/" + str(numCluster) +
        "_cluster_predictions_" + str(prefiltFreq[0]) + "-" +
        str(prefiltFreq[1]) + "Hz.h5", "w")
    clustFile.create_dataset("cluster_index", data=pred)
    clustFile.create_dataset("centroids", data=ks.cluster_centers_)
    clustFile.create_dataset("inertia", data=ks.inertia_)
    clustFile.close()

# load some variables
centroids = ks.cluster_centers_
if skipClustering:
        y_pred = km.fit_predict(X)
        print(y_pred)
    elif args.kmeans_algo == 2:
        k_title = "Soft-DTW k-means"
        f_title = "soft_DTW"
        # tslearn renamed the "gamma_sdtw" metric parameter to "gamma"
        km = TimeSeriesKMeans(n_clusters=num, metric="softdtw",
                              metric_params={"gamma": .01},
                              verbose=True, random_state=seed)
        y_pred = km.fit_predict(X)
        print(y_pred)
    else:
        k_title = "KShape"
        f_title = "KShape"
        km = KShape(n_clusters=num, verbose=True, random_state=seed)
        y_pred = km.fit_predict(X)
        print(y_pred)

    plt.figure()
    for yi in range(num):
        plt.subplot(num, 1, yi + 1)
        for xx in X[y_pred == yi]:
            plt.plot(xx.ravel(), "k-", alpha=.2)
        plt.plot(km.cluster_centers_[yi].ravel(), "r-")
        plt.xlim(0, sz)
        if y_min < 100 and y_max > -100:
            plt.ylim(y_min - 1, y_max + 1)
        if yi == 0:
            plt.title(k_title)
    if args.anon:
def main(argv):
    # define global timer to obtain global execution time
    start_global = timer()

    # define global variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data

    #############################################################################################
    # Input arguments parsing
    #############################################################################################

    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling) \n' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        '  -c / --clusters <number_clusters>  # set number of clusters (default 3) \n\n' \
        '  -i / --ifile <input_file>  # set input filename \n' \
        '  -n / --normalise  # normalise input data \n' \
        '  -s / --standardise  # standardise input data \n\n' \
        '  -a / --all  # perform all 5 implemented methods of clustering: \n' \
        '              euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        '  -E / --euclidean  # perform euclidean k-means clustering \n' \
        '  -D / --dtw  # perform dtw k-means clustering \n' \
        '  -S / --soft-dtw  # perform soft-dtw k-means clustering \n' \
        '  -K / --k-shape  # perform k-shape clustering \n' \
        '  -G / --gak  # perform GAK k-means clustering \n'

    # Create new object to save arguments
    i_args = Arguments()

    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3

    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help", "clusters=", "ifile=", "normalise", "standardise",
                "all", "euclidean", "dtw", "soft-dtw", "k-shape", "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            # getopt yields strings, so cast the cluster count to int
            i_args.number_clusters = int(arg)
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True

    # cap the number of subplot levels
    n_rows_plot = 8 if n_rows_plot > 8 else n_rows_plot

    #############################################################################################
    # Raw data processing stage
    #############################################################################################

    # set matplotlib plot style
    mpl.style.use('seaborn')

    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)

    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())

    # convert raw data to the format which can be used by tslearn
    # (3-d dimensional array)
    # built-in functionality: adjust all time series to one size
    # (NaN values are appended to the shorter ones)
formatted_data = to_time_series_dataset(raw_data) # print shape of new array print(formatted_data.shape) # obtain number of measuring n_measuring = formatted_data.shape[1] # define figure, grid_spec to create layout of the plot fig = plt.figure(constrained_layout=True) grid_spec = fig.add_gridspec( n_rows_plot, i_args.number_clusters ) # set A4 size to figure fig.set_size_inches(8.5, 11.75) # setup count of layers of subplots count_layer = 3 # setup first subplot and draw raw time series f_ax_raw_data = fig.add_subplot(grid_spec[:2, :]) for xx in formatted_data: f_ax_raw_data.plot(xx.ravel(), alpha=.2) formatted_data_min = formatted_data.min() formatted_data_max = formatted_data.max() # draw title for chart with min and max values f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)' %(formatted_data_min, formatted_data_max)) # obtain and print executing time of data processing stage to console, timer_tick = get_time_tick(start_global) plt.ion() plt.show() print("Raw data processing time: %s" % timer_tick) ############################################################################################# # Data preprocessing stage ############################################################################################# start = timer() # Convert NaNs to value predicted by interpolation # linearly interpolate for NaN/NaNs n_nan_changes = 0 for ind in range(formatted_data.shape[0]): mask = numpy.isnan(formatted_data[ind]) n_nan_changes += mask.sum() formatted_data[ind][mask] = numpy.interp( numpy.flatnonzero(mask), numpy.flatnonzero(~mask), formatted_data[ind][~mask] ) print("%d NaN values was/were interpolated" % n_nan_changes) # Scaling # to know should we use normalization or standardization, we need to see # the distribution of values. # take random 3 measuring for each case to draw histograms random_indexes = numpy.random.choice(n_measuring, i_args.number_clusters, replace=False) # create new arrays with values of randomly chosen measurements histogram_data = formatted_data[:, random_indexes] # draw histograms for i_histogram in range(i_args.number_clusters): f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram]) f_ax_histogram.hist( histogram_data[:, i_histogram], bins=25, density=True ) f_ax_histogram.text(0.55, 0.98, 'Measurement #%d' % random_indexes[i_histogram], transform=plt.gca().transAxes, color="navy" ) if i_histogram == 1: preprocessing = '' if i_args.normalise_data: preprocessing += "normalised" if i_args.standardise_data: preprocessing += " and standardised" elif i_args.standardise_data: preprocessing += "standardised" preprocessing = '' if preprocessing == '' else "(data will be %s)" % preprocessing f_ax_histogram.set_title( "Distributions histograms %s" % preprocessing, color='navy', y=1, pad=14 ) # if no processing data option chosen continue with raw data processed_data = formatted_data # since for this concrete challenge data the distributions are more/less # Gaussian/Normal we can use standardization # normalize data: Min-Max scaling ranging between 0 and 1 if i_args.normalise_data: processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data) print("Data was normalised") # standardize data: scaling technique where the values are centered around # the mean with a unit standard deviation if i_args.standardise_data: processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data) print("Data was standardised") # obtain max value of data (to be used in visualization subplots) max_data = processed_data.max() * 1.2 min_data = 
processed_data.min() * 1.2 timer_tick = get_time_tick(start) print("#############################################################################################") print("Data processing stage elapsed time: %s" % timer_tick) ############################################################################################# # Implementing Euclidean k-means clustering algorithm ############################################################################################# if i_args.euclidean_clustering: start = timer() print("Euclidean k-means") # define parameters of the model of the algorithm k_means_euclidean = TimeSeriesKMeans( n_clusters=i_args.number_clusters, verbose=True, random_state=seed, n_jobs=4 ) # calculate cluster's label array euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data) # draw subplots with attributed clusters of time series as well as # cluster centers' lines for i_cluster in range(i_args.number_clusters): f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer, i_cluster, n_measuring, min_data, max_data, processed_data, euclidean_clustered_data, 'tab:blue') f_ax_euclidean.plot( k_means_euclidean.cluster_centers_[i_cluster].ravel(), "tab:green" ) if i_cluster == 1: middle_axis = f_ax_euclidean # increment count of filled layer of subplots count_layer += 1 # obtain processing time, print it to console and # add it to the title of the series of subplots timer_tick = get_time_tick(start) middle_axis.set_title( "Euclidean $k$-means (%s)" % timer_tick, color='tab:green', y=1, pad=14 ) print("#############################################################################################") print("Euclidean k-means time processing: %s" % timer_tick) ############################################################################################# # Implementing DTW k-means clustering algorithm # use dtw (Dynamic Time Warping Distance) metric to calculate # distance between means ############################################################################################# if i_args.dtw_clustering: start = timer() print("DTW k-means") k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters, n_init=3, metric="dtw", verbose=True, max_iter_barycenter=10, random_state=seed, n_jobs=6 ) dtw_clustered_data = k_means_DTW.fit_predict(processed_data) for i_cluster in range(i_args.number_clusters): f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster, n_measuring, min_data, max_data, processed_data, dtw_clustered_data, 'tab:blue') f_ax_dtw.plot( k_means_DTW.cluster_centers_[i_cluster].ravel(), "tab:red" ) if i_cluster == 1: middle_axis = f_ax_dtw # increment count of filled layer of subplots count_layer += 1 timer_tick = get_time_tick(start) middle_axis.set_title( "DTW $k$-means (%s)" % timer_tick, color='tab:red', y=1, pad=14 ) print("#############################################################################################") print("DTW k-means time processing: %s" % timer_tick) ############################################################################################# # Implementing soft DTW k-means clustering algorithm # use soft dtw (Dynamic Time Warping Distance) metric to calculate # distance between means ############################################################################################# if i_args.soft_dtw_clustering: start = timer() print("Soft-DTW k-means") k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters, metric="softdtw", metric_params={"gamma": .025}, verbose=True, random_state=seed, n_jobs=6 ) 
soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data) for i_cluster in range(i_args.number_clusters): f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster, n_measuring, min_data, max_data, processed_data, soft_dtw_clustered_data, 'tab:blue') f_ax_soft_dtw.plot( k_means_soft_DTW.cluster_centers_[i_cluster].ravel(), "tab:purple" ) if i_cluster == 1: middle_axis = f_ax_soft_dtw # increment count of filled layer of subplots count_layer += 1 timer_tick = get_time_tick(start) middle_axis.set_title( "Soft-DTW $k$-means (%s)" % timer_tick, color='tab:purple', y=1, pad=14 ) print("#############################################################################################") print("Soft-DTW k-means time processing: %s" % timer_tick) ############################################################################################# # Implementing k-Shape clustering algorithm ############################################################################################# if i_args.k_shape_clustering: start = timer() print("K-Shape") k_shape = KShape(n_clusters=i_args.number_clusters, verbose=True, random_state=seed ) k_shape_clustered_data = k_shape.fit_predict(processed_data) for i_cluster in range(i_args.number_clusters): min_axe_value = min(min_data, k_shape.cluster_centers_[i_cluster].ravel().min()) max_axe_value = max(max_data, k_shape.cluster_centers_[i_cluster].ravel().max()) f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer, i_cluster, n_measuring, min_axe_value, max_axe_value, processed_data, k_shape_clustered_data, 'tab:blue') f_ax_k_shape.plot( k_shape.cluster_centers_[i_cluster].ravel(), "tab:orange" ) if i_cluster == 1: middle_axis = f_ax_k_shape # increment count of filled layer of subplots count_layer += 1 timer_tick = get_time_tick(start) middle_axis.set_title( "$K$-Shape (%s)" % timer_tick, color='tab:orange', y=1, pad=14 ) print("#############################################################################################") print("K-Shape time processing: %s" % timer_tick) ############################################################################################# # Implementing Global Alignment kernel k-means clustering algorithm # since kernel is used, there is no centroid of the cluster ############################################################################################# if i_args.gak_clustering: start = timer() print("GAK-k-means") gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters, kernel="gak", kernel_params={"sigma": "auto"}, n_init=10, verbose=True, random_state=seed, n_jobs=6 ) gak_clustered_data = gak_k_means.fit_predict(processed_data) for i_cluster in range(i_args.number_clusters): f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer, i_cluster, n_measuring, min_data, max_data, processed_data, gak_clustered_data, 'tab:blue') if i_cluster == 1: middle_axis = f_ax_gak_k_means # increment count of filled layer of subplots count_layer += 1 timer_tick = get_time_tick(start) middle_axis.set_title( "Global Alignment kernel $k$-means (%s)" % timer_tick, color='tab:cyan', y=1, pad=14) print("#############################################################################################") print("GAK k-means time processing: %s" % timer_tick) ############################################################################################# # return string with current datetime now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S") # define the name of the directory to be created path = "./out/%s" % now 
print("#############################################################################################") try: os.mkdir(path) except OSError: print("Creation of the directory %s failed" % path) else: print("Successfully created the directory %s " % path) try: # save figure as pdf to out folder fig.savefig("./out/%s/visual_result.pdf" % now) # save clustering results if i_args.euclidean_clustering: numpy.savetxt( "./out/%s/euclidean_clustering_result.csv" % now, euclidean_clustered_data, delimiter="," ) if i_args.dtw_clustering: numpy.savetxt( "./out/%s/dtw_clustering_result.csv" % now, dtw_clustered_data, delimiter="," ) if i_args.soft_dtw_clustering: numpy.savetxt( "./out/%s/soft_dtw_clustering_result.csv" % now, soft_dtw_clustered_data, delimiter="," ) if i_args.k_shape_clustering: numpy.savetxt( "./out/%s/k_shape_clustering_result.csv" % now, k_shape_clustered_data, delimiter="," ) if i_args.gak_clustering: numpy.savetxt( "./out/%s/gak_clustering_result.csv" % now, gak_clustered_data, delimiter="," ) except RuntimeError: print("Saving results failed") else: print("Successfully saved results in the path %s " % path) ############################################################################################# # obtain and print global executing time timer_tick = get_time_tick(start_global) print("#############################################################################################") print("All algorithms elapsed time: % s" % timer_tick) ############################################################################################# # render and show plot # plt.show() plt.draw() plt.pause(0.001) input("Press [enter] to finish.") print("#############################################################################################")
random_state=2019)

X_train = to_time_series_dataset(data_train[:, 1:])
# np.int was removed from NumPy; the builtin int is the drop-in replacement
y_train = data_train[:, 0].astype(int)
X_test = to_time_series_dataset(data_test[:, 1:])
y_test = data_test[:, 0].astype(int)

file_name = "教師なし教科書\\13章-時系列クラスタリング\\4_ECG5000_k_shape\\result\\"

# Prepare the data - Scale
X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_train)
X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_test)

# Train using k-Shape
ks = KShape(n_clusters=5, max_iter=100, n_init=10, verbose=1, random_state=2019)
ks.fit(X_train)

with open(file_name + 'result.txt', 'w') as f:
    # Predict on train set and calculate adjusted Rand index
    preds = ks.predict(X_train)
    ars = adjusted_rand_score(data_train[:, 0], preds)
    print("Adjusted Rand Index on Training Set:", ars, file=f)

    preds_test = ks.predict(X_test)
    ars = adjusted_rand_score(data_test[:, 0], preds_test)
    print("Adjusted Rand Index on Test Set:", ars, file=f)

# Evaluate goodness of the clusters
preds_test = preds_test.reshape(1000, 1)
y_train = data_train[:, 0].astype(int)

data_test = np.loadtxt(current_path + file + "ECGFiveDays\\ECGFiveDays_TEST.tsv")
X_test = to_time_series_dataset(data_test[:, 1:])
y_test = data_test[:, 0].astype(int)

file = "教師なし教科書\\13章-時系列クラスタリング\\3_ECGFiveDays_k_shape\\result\\"

# Prepare the data - Scale
X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_train)
X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_test)

# k-Shape Algorithm
# Train using k-Shape
ks = KShape(n_clusters=2, max_iter=100, n_init=100, verbose=0)
ks.fit(X_train)

# Make predictions on train set and calculate adjusted Rand index
preds = ks.predict(X_train)
ars = adjusted_rand_score(data_train[:, 0], preds)
print("Adjusted Rand Index:", ars)

# Make predictions on test set and calculate adjusted Rand index
preds_test = ks.predict(X_test)
ars = adjusted_rand_score(data_test[:, 0], preds_test)
print("Adjusted Rand Index on Test Set:", ars)

# Results are poor because the training set is tiny: train 23, test 861
# Adjusted Rand Index: 0.668041237113402
# Adjusted Rand Index on Test Set: 0.012338817789874643
# scale mean around zero input_waves = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(waves) # run clustering or skip and load results if desired if skipClustering: clustFile = h5py.File( templatePath + str(numCluster) + "/" + str(numCluster) + "_cluster_predictions_" + str(prefiltFreq[0]) + "-" + str(prefiltFreq[1]) + "Hz.h5", "r") pred = np.array(list(clustFile["cluster_index"])) centroids = list(clustFile["centroids"]) clustFile.close() else: print("Clustering...") ks = KShape(n_clusters=numCluster, n_init=1, random_state=0) pred = ks.fit_predict(input_waves) clustFile = h5py.File( templatePath + str(numCluster) + "/" + str(numCluster) + "_cluster_predictions_" + str(prefiltFreq[0]) + "-" + str(prefiltFreq[1]) + "Hz.h5", "w") clustFile.create_dataset("cluster_index", data=pred) clustFile.create_dataset("centroids", data=ks.cluster_centers_) clustFile.create_dataset("inertia", data=ks.inertia_) clustFile.close() modelFile = templatePath + str(numCluster) + "/" + str( numCluster) + "_cluster_model_" + str(prefiltFreq[0]) + "-" + str( prefiltFreq[1]) + "Hz.h5" ks.to_hdf5(modelFile)
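# Sketch of reading back the artifacts written above: the predictions and
# centroids from the results file, and the fitted model via tslearn's
# from_hdf5 counterpart of to_hdf5 (file names are illustrative, following
# the pattern above with numCluster=5 and prefiltFreq=[1.0, 10.0]).
import h5py
import numpy as np
from tslearn.clustering import KShape

with h5py.File("5/5_cluster_predictions_1.0-10.0Hz.h5", "r") as f:
    pred = np.array(f["cluster_index"])
    centroids = np.array(f["centroids"])
ks = KShape.from_hdf5("5/5_cluster_model_1.0-10.0Hz.h5")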
class TimeSeriesKShapes(BaseClusterer):
    """Wrapper of the tslearn implementation of the k-Shape algorithm.

    Parameters
    ----------
    n_clusters: int, defaults = 8
        The number of clusters to form as well as the number of
        centroids to generate.
    init_algorithm: str or np.ndarray, defaults = 'random'
        Method for initializing cluster centers. Any of the following are
        valid: ['random']. Or a np.ndarray of shape (n_clusters, ts_size, d)
        that gives the initial centers.
    n_init: int, defaults = 10
        Number of times the k-Shape algorithm will be run with different
        centroid seeds. The final result will be the best output of n_init
        consecutive runs in terms of inertia.
    max_iter: int, defaults = 300
        Maximum number of iterations of the k-Shape algorithm for a
        single run.
    tol: float, defaults = 1e-4
        Relative tolerance with regards to Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence.
    verbose: bool, defaults = False
        Verbosity mode.
    random_state: int or np.random.RandomState instance or None, defaults = None
        Determines random number generation for centroid initialization.

    Attributes
    ----------
    labels_: np.ndarray (1d array of shape (n_instance,))
        Labels, i.e. the index of the cluster each time series belongs to.
    inertia_: float
        Sum of squared distances of samples to their closest cluster center,
        weighted by the sample weights if provided.
    n_iter_: int
        Number of iterations run.
    """

    _tags = {
        "capability:multivariate": True,
    }

    def __init__(
        self,
        n_clusters: int = 8,
        init_algorithm: Union[str, np.ndarray] = "random",
        n_init: int = 10,
        max_iter: int = 300,
        tol: float = 1e-4,
        verbose: bool = False,
        random_state: Union[int, RandomState] = None,
    ):
        _check_soft_dependencies("tslearn", severity="error", object=self)

        self.init_algorithm = init_algorithm
        self.n_init = n_init
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0

        self._tslearn_k_shapes = None

        super(TimeSeriesKShapes, self).__init__(n_clusters=n_clusters)

    def _fit(self, X: TimeSeriesInstances, y=None) -> "TimeSeriesKShapes":
        """Fit time series clusterer to training data.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or
            shape (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        self:
            Fitted estimator.
        """
        from tslearn.clustering import KShape

        if self._tslearn_k_shapes is None:
            self._tslearn_k_shapes = KShape(
                n_clusters=self.n_clusters,
                max_iter=self.max_iter,
                tol=self.tol,
                random_state=self.random_state,
                n_init=self.n_init,
                verbose=self.verbose,
                init=self.init_algorithm,
            )

        self._tslearn_k_shapes.fit(X)
        self.cluster_centers_ = self._tslearn_k_shapes.cluster_centers_
        self.labels_ = self._tslearn_k_shapes.labels_
        self.inertia_ = self._tslearn_k_shapes.inertia_
        self.n_iter_ = self._tslearn_k_shapes.n_iter_
        return self

    def _predict(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or
            shape (n_instances, n_dimensions, series_length))
            Time series instances to predict their cluster indexes.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        np.ndarray (1d array of shape (n_instances,))
            Index of the cluster each time series in X belongs to.
""" return self._tslearn_k_shapes.predict(X) @classmethod def get_test_params(cls, parameter_set="default"): """Return testing parameter settings for the estimator. Parameters ---------- parameter_set : str, default="default" Name of the set of test parameters to return, for use in tests. If no special parameters are defined for a value, will return `"default"` set. Returns ------- params : dict or list of dict, default = {} Parameters to create testing instances of the class Each dict are parameters to construct an "interesting" test instance, i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance. `create_test_instance` uses the first (or only) dictionary in `params` """ params = { "n_clusters": 2, "init_algorithm": "random", "n_init": 1, "max_iter": 1, "tol": 1e-4, "verbose": False, "random_state": 1, } return params def _score(self, X, y=None): return np.abs(self.inertia_)
def mass_upload(startDate, endDate, id_unit_usaha): print(id_unit_usaha) login = "" password = "" # engine = sqlalchemy.create_engine('mysql+pymysql://energy:energy2x5=10@localhost:3306/pgn') engine = sqlalchemy.create_engine( 'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server') sql = " SELECT a.IDREFPELANGGAN, a.ID_UNIT_USAHA, 1 AS FSTREAMID, DATEPART(dw, a.FDATETIME) as FDAYOFWEEK, a.FHOUR, AVG(a.FDVC) as AVG_FDVC\ FROM(SELECT IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR, SUM(FDVC) as FDVC\ FROM amr_bridge\ WHERE FDATETIME >= '" + startDate + "'\ and FDATETIME < '" + endDate + "'\ GROUP BY IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR) a\ GROUP BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR\ ORDER BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR" df = pd.read_sql_query(sql, engine) totaldf = len(df) totaldf = str(totaldf) print('total Data: ' + totaldf) # rslt_df = df.loc[df['ID_UNIT_USAHA'] == '014'] # print(startDate) # print('\nResult dataframe :\n', rslt_df) # df.to_csv('pgn_customer_cluster_v1_{}.csv'.format(id_unit_usaha), index=False) # df.to_hdf("amr_bridge_22122020.hdf", key='hdf5') # df = pd.read_hdf("amr_bridge_22122020.hdf") def select_data(id_unit): query = "ID_UNIT_USAHA == '{}'".format(id_unit_usaha) columns = ['FDAYOFWEEK', 'FHOUR', 'IDREFPELANGGAN', 'AVG_FDVC'] # df = df.set_index('FDATETIME') df_selected = df.query(query, engine='python')[columns] return df_selected def pivot_data(df): # df_pivoted = df.pivot(index='FDATETIME', columns='IDREFPELANGGAN', values='FDVC') df_pivoted = df.pivot(index=['FDAYOFWEEK', 'FHOUR'], columns='IDREFPELANGGAN', values='AVG_FDVC') return df_pivoted def remove_zerocolumns(df): # Get all columns which have all zero values cols = df.columns[df.mean() == 0] # Drop columns which has all zero values df = df.drop(cols, axis=1) return df df_week1 = select_data(id_unit_usaha) df_week1.fillna(0.0, inplace=True) df_pivoted1 = pivot_data(df_week1) df_pivoted1.fillna(0.0, inplace=True) df_pivoted1 = remove_zerocolumns(df_pivoted1) cols = list(df_pivoted1.columns) df_pivoted1.head() # Function to plot cluster # def plot_clusters(ds, y_pred, n_clusters, ks, filename): # plt.figure(figsize=(12, 40)) # for yi in range(n_clusters): # plt.subplot(n_clusters, 1, 1 + yi) # for xx in ds[y_pred == yi]: # plt.plot(xx.ravel(), "k-", alpha=.2) # plt.plot(ks.cluster_centers_[yi].ravel(), "r-") # plt.xlim(0, sz) # plt.ylim(-7, 7) # plt.title("Cluster %d" % (yi)) # plt.tight_layout() # plt.savefig(filename, format='jpg', dpi=300, quality=95) # plt.show() def create_cluster_info(y_pred, cols): df_cluster = pd.DataFrame(y_pred.copy(), index=cols.copy(), columns=['cluster']) df_cluster.reset_index(inplace=True) df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True) unique_cluster = df_cluster['cluster'].unique() # Get ID ref based on cluster idrefs_list = [] for i, x in enumerate(unique_cluster): idref_list = df_cluster.query( "cluster == {}".format(x))['idrefpelanggan'].values.tolist() # idrefs_list[x] = idref_list # Create dictionary idref_cluster_dict = {'cluster': x, 'idrefpelanggan': idref_list} idrefs_list.append(idref_cluster_dict) idrefs_cluster = pd.DataFrame(idrefs_list) return idrefs_cluster # def run_once(startime, totalData, _has_run=[]): # if _has_run: # return # # print("run_once doing stuff") # print(startime) # endtime = time.time_ns() # print(endtime) # invTime = endtime-startime # estTime = invTime * totalData # _has_run.append(1) # print(totalData) # print(estTime) # 
return estTime seed = 0 np.random.seed(seed) # Convert data frame to list of series pivoted_series = [] pivoted_columns = [] for i, y in enumerate(cols): length = len(df_pivoted1[y]) cst = df_pivoted1[y].values pivoted_series.append(cst) pivoted_columns.append(y) # Convert data set to standar time series format formatted_dataset = to_time_series_dataset(pivoted_series) print("Data shape: {}".format(formatted_dataset.shape)) formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform( formatted_dataset) sz = formatted_norm_dataset.shape[1] print("Data shape: {}".format(sz)) formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform( formatted_dataset) clusters = 5 totalColumn = formatted_norm_dataset.shape[0] totalRow = formatted_norm_dataset.shape[1] totalData = totalRow * totalColumn + totalRow * clusters ks = KShape(n_clusters=clusters, verbose=True, random_state=seed) y_pred_ks = ks.fit_predict(formatted_norm_dataset) formatted_norm_dataset.shape data = formatted_norm_dataset data.shape formatted_norm_dataset_2d = formatted_norm_dataset[:, :, 0] formatted_norm_dataset_2d.shape # pd.DataFrame(A.T.reshape(2, -1), columns=cols) df_normalized = pd.DataFrame(formatted_norm_dataset_2d) df_normalized # df_normalized = df_normalized.pivot() # formatted_norm_dataset[0] df_cluster = pd.DataFrame(y_pred_ks, index=pivoted_columns, columns=['cluster']) df_cluster.reset_index(inplace=True) df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True) df_cluster.sort_values(['cluster']) df_normalized_detail = pd.DataFrame.join(df_normalized, df_cluster) df_normalized_detail # df_cluster.to_csv('pgn_customer_cluster_{}.csv'.format( # id_unit_usaha), index=False) # Create data frame for customer and its cluster create_cluster_info(y_pred_ks, cols) # plot_clusters(formatted_norm_dataset, y_pred_ks, clusters, ks, # 'pgn_customer_cluster_{}.jpg'.format(id_unit_usaha)) # engine2 = sqlalchemy.create_engine( # 'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server') # Session = sessionmaker(bind=engine2) # session = Session() # Base = declarative_base() # class PL_CUSTOMER_CLUSTER(Base): # __tablename__ = 'PL_CUSTOMER_CLUSTER' # ID = Column(Integer, primary_key=True) # DATE_STAMP = Column(DateTime) # IDREFPELANGGAN = Column(String(30)) # HOUR_NUM = Column(Integer) # CLUSTER_NUM = Column(Integer) # HOUR_NUM = Column(Integer) # FDVC_NORMALIZED = Column(Float) # AREA_ID = Column(String(5)) # startime = time.time_ns() # for i in range(totalColumn): # idref = df_normalized_detail.iloc[i, totalRow] # cluster = int(df_normalized_detail.iloc[i, totalRow+1]) # print("idref = " + idref) # cluster_num = df_normalized_detail.iloc[i, totalRow-1] # for j in range(totalRow): # hour_num = df_normalized_detail.columns[j] # fdvc = df_normalized_detail.iloc[i, j] # sql = "" # # insert into table # item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate, IDREFPELANGGAN=idref, # HOUR_NUM=hour_num, CLUSTER_NUM=cluster, FDVC_NORMALIZED=fdvc, AREA_ID=id_unit_usaha) # session.add(item) # # commit per id ref pelanngan # session.commit() engine2 = sqlalchemy.create_engine( 'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server') Session = sessionmaker(bind=engine2) session = Session() Base = declarative_base() class PL_CUSTOMER_CLUSTER(Base): __tablename__ = 'PL_CUSTOMER_CLUSTER' ID = Column(Integer, primary_key=True) DATE_STAMP = Column(DateTime) IDREFPELANGGAN = Column(String(30)) HOUR_NUM = Column(Integer) CLUSTER_NUM = Column(Integer) HOUR_NUM = Column(Integer) FDVC_NORMALIZED = Column(Float) AREA_ID = 
Column(String(5)) df_normalized_detail for i in range(5): print("cluster: " + str(i)) CLUSTER_NAME = "CENTROID_ID" + str(i) cluster = i for j in range(totalRow): fdvc_norm = ks.cluster_centers_[i][j][0] hour_num = j sql = "" item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate, IDREFPELANGGAN=CLUSTER_NAME, HOUR_NUM=hour_num, CLUSTER_NUM=cluster, FDVC_NORMALIZED=fdvc_norm, AREA_ID=id_unit_usaha) session.add(item) print("fdvc:" + str(fdvc_norm) + "Hour:" + str(hour_num)) # commit per id ref pelanngan session.commit() print(str(j) + ", " + str(fdvc_norm)) return totalData