Example #1
    def _fit(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Fit time series clusterer to training data.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        self:
            Fitted estimator.
        """
        from tslearn.clustering import KShape

        if self._tslearn_k_shapes is None:
            self._tslearn_k_shapes = KShape(
                n_clusters=self.n_clusters,
                max_iter=self.max_iter,
                tol=self.tol,
                random_state=self.random_state,
                n_init=self.n_init,
                verbose=self.verbose,
                init=self.init_algorithm,
            )

        self._tslearn_k_shapes.fit(X)
        self._cluster_centers = self._tslearn_k_shapes.cluster_centers_
        self.labels_ = self._tslearn_k_shapes.labels_
        self.inertia_ = self._tslearn_k_shapes.inertia_
        self.n_iter_ = self._tslearn_k_shapes.n_iter_
    def k_init(self, v=True):
        """
        Initialise the algorithm instance with the current parameters.

        Parameters:
            * v: boolean
                Verbose; print information about the clustering.

        Returns:
            NA
        """
        self.km = KShape(n_clusters=self.n, verbose=v, random_state=self.seed)
def test_serialize_kshape():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    X = TimeSeriesScalerMeanVariance().fit_transform(time_series)

    ks = KShape(n_clusters=3, verbose=True)

    _check_not_fitted(ks)

    ks.fit(X)

    _check_params_predict(ks, X, ['predict'])
Example #4
 def cal_k_shape(self, data, num_cluster):
     """
     Cluster the time series with k-Shape, keeping the best of n_init runs.
     :param data: time series dataset
     :param num_cluster: number of clusters to form
     :return: cluster label for each series
     """
     ks = KShape(n_clusters=num_cluster,
                 n_init=5,
                 verbose=True,
                 random_state=self.seed)
     y_pred = ks.fit_predict(data)
     return y_pred
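
For reference, a minimal self-contained sketch of the same fit_predict pattern using only the public tslearn API (the toy data and parameter values are illustrative only):

import numpy as np
from tslearn.clustering import KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Hypothetical toy data: 20 univariate series of length 40
rng = np.random.RandomState(0)
X = TimeSeriesScalerMeanVariance().fit_transform(rng.randn(20, 40))

# k-Shape expects z-normalised series; fit_predict returns one label per series
ks = KShape(n_clusters=3, n_init=2, random_state=0)
labels = ks.fit_predict(X)
print(labels)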
Example #5
 def __init__(
     self,
     time_span=1,
     batch=60,
     data=None,
 ):
     self.time_span = time_span * 6
     self.data = data
     self.batch = batch
     self.km = KShape(n_clusters=2,
                      max_iter=50,
                      verbose=True,
                      random_state=0)
def test_kshape():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    time_series = TimeSeriesScalerMeanVariance().fit_transform(time_series)

    ks = KShape(n_clusters=3, n_init=1, verbose=False,
                random_state=rng).fit(time_series)
    dists = ks._cross_dists(time_series)
    np.testing.assert_allclose(ks.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(ks.labels_, ks.predict(time_series))

    assert KShape(n_clusters=101, verbose=False,
                  random_state=rng).fit(time_series)._X_fit is None
    def do_kshape(days, km_size):
        """
        From a long-format time series DataFrame `days`, creates km_size
        clusters using the k-Shape algorithm.
        """
        # Arrange data for our lib
        unq = days["n_day_"].unique()
        values = [days[days["n_day_"] == l]["val_"].values for l in unq]
        formatted_dataset = to_time_series_dataset(values)

        # Configure our k-Shape model
        kshape = KShape(n_clusters=km_size, random_state=42, verbose=False)

        y_pred = kshape.fit_predict(formatted_dataset)

        return kshape, y_pred
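
A hedged usage sketch for do_kshape, assuming it is available at module level and that pandas and tslearn's to_time_series_dataset and KShape are imported as in the snippet; the long-format frame below is invented for illustration:

import numpy as np
import pandas as pd

# Hypothetical input: 10 days of 24 hourly values each, in long format
rng = np.random.RandomState(0)
days = pd.DataFrame({
    "n_day_": np.repeat(np.arange(10), 24),
    "val_": rng.randn(240),
})

model, labels = do_kshape(days, km_size=3)
print(labels)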
Example #8
def clustering_Kshape(tsdata, n_clusters, random_state, n_init, max_iter=100):
	np.random.seed(random_state)
	# Input must be z-normalized to compute the cross-correlations
	# tsdata = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(tsdata)

	# Instantiate the KShape class
	ks = KShape(
		n_clusters=n_clusters,
		n_init=n_init,
		verbose=True,
		random_state=random_state,
		max_iter=max_iter
	)
	y_pred = ks.fit_predict(tsdata)

	return y_pred
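
A minimal usage sketch, z-normalising a toy dataset first as the comment above recommends (the data and cluster count are illustrative only):

import numpy as np
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Hypothetical toy data: 30 series of length 80, scaled to zero mean / unit variance
rng = np.random.RandomState(0)
tsdata = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(rng.randn(30, 80))

labels = clustering_Kshape(tsdata, n_clusters=3, random_state=0, n_init=2)
print(np.bincount(labels))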
Example #9
    def _create_model(self, model):
        """
        A messy function until the latest tslearn is out
        with a fix for array-based hyper-parameters
        """
        # Convert the lists back to arrays
        for k in model['model_params'].keys():
            param = model['model_params'][k]
            if type(param) is list:
                arr = np.array(param)
                if arr.dtype == object:
                    # Then maybe it was rather a list of arrays
                    # This is very hacky...
                    arr = [np.array(p) for p in param]
                model['model_params'][k] = arr

        for k in model['hyper_params'].keys():
            param = model['hyper_params'][k]
            if type(param) is list:
                arr = np.array(param)
                if arr.dtype == object:
                    # Then maybe it was rather a list of arrays
                    # This is very hacky...
                    arr = [np.array(p) for p in param]
                model['hyper_params'][k] = arr
        hyper_params = model['hyper_params']
        model_params = model['model_params']

        inst = KShape(**hyper_params)

        for p in model_params.keys():
            setattr(inst, p, model_params[p])

        return inst
Example #10
    def plot_elbow(self, data):
        """

        :param df:multi time series  type is np.array
        :return: elbow plot
        """
        distortions = []
        for i in range(2, 7):
            ks = KShape(n_clusters=i,
                        n_init=5,
                        verbose=True,
                        random_state=self.seed)
            ks.fit(data)
            distortions.append(ks.inertia_)
        plt.plot(range(2, 7), distortions, marker='o')
        plt.xlabel('Number of clusters')
        plt.ylabel('Distortion Line')
        plt.show()
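
As a follow-up, a small hypothetical helper (not part of the original class) that reads the same distortion curve numerically: it returns the first k after which adding another cluster improves the inertia by less than a relative threshold.

import numpy as np

def pick_elbow(ks_range, distortions, threshold=0.1):
    """Rough numerical reading of the elbow plot above."""
    drops = -np.diff(distortions) / np.asarray(distortions[:-1])
    for k, drop in zip(ks_range[1:], drops):
        if drop < threshold:
            return k - 1
    return ks_range[-1]

# e.g. pick_elbow(list(range(2, 7)), distortions)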
Example #11
 def single_clustering(self, data_raw, data_new, centroid_num, model):
     """
     Run a single clustering pass with the selected model.
     :return: inertia, dict mapping cluster index to its raw samples, cluster centers
     """
     seed = 0
     np.random.seed(seed)
     labels = []
     inertia = []
     centers = []
     if model == 'K-Means':
         kmeans = KMeans(n_clusters=centroid_num).fit(data_new)
         labels = kmeans.labels_
         centers = kmeans.cluster_centers_
         # Sum of distances from each point to its cluster centroid; smaller is better
         inertia = kmeans.inertia_
     elif model == 'DTW':
         sdtw_km = TimeSeriesKMeans(n_clusters=centroid_num,
                                    metric='softdtw',
                                    max_iter=2,
                                    max_iter_barycenter=2,
                                    metric_params={
                                        "gamma": 1.0
                                    },
                                    random_state=0,
                                    verbose=True).fit(data_new)
         labels = sdtw_km.labels_
         centers = sdtw_km.cluster_centers_
         inertia = sdtw_km.inertia_
     elif model == "K-Shape":
         ks = KShape(n_clusters=centroid_num,
                     verbose=True,
                     random_state=seed).fit(data_new)
         labels = ks.labels_
         centers = ks.cluster_centers_
         inertia = ks.inertia_
     elif model == "Kernel-KMeans":
         data_new = data_new[:100]
         data_raw = data_raw[:100]
         kk = KernelKMeans(n_clusters=centroid_num,
                           kernel="gak",
                           kernel_params={
                               "sigma": "auto"
                           },
                           max_iter=2,
                           tol=1e-4,
                           verbose=True).fit(data_new)
         labels = kk.labels_
         inertia = kk.inertia_
     D = {}
     for i in range(centroid_num):
         D[i] = []
     for i in range(len(data_raw)):
         D[labels[i]].append(data_raw[i])
     return inertia, D, centers
def kshape(container: DataFrameContainer,
           data_column: str,
           n_clusters: int,
           max_iter: int = 300,
           tol: float = 1e-6,
           n_init: int = 1,
           verbose: bool = True,
           random_state: Union[int, None] = None,
           init: np.ndarray = 'random',
           centroid_seeds: np.ndarray = None):
    """

    :param container:
    :param data_column:
    :param n_clusters:
    :param max_iter:
    :param tol:
    :param n_init:
    :param verbose:
    :param random_state:
    :param init:
    :param centroid_seeds: arrays of shape [n_clusters, ts_size]
    :return:
    """
    if centroid_seeds is not None:
        init = np.swapaxes(np.array([centroid_seeds]).T, 0, 1)

    ks = KShape(n_clusters=n_clusters,
                max_iter=max_iter,
                tol=tol,
                n_init=n_init,
                verbose=verbose,
                random_state=random_state,
                init=init)

    X = np.vstack(container.dataframe[data_column].values)

    y = ks.fit_predict(X)

    container.dataframe['KSHAPE_CLUSTER'] = y

    return container
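
A hedged usage sketch: DataFrameContainer is project-specific, so a minimal stand-in exposing a .dataframe attribute is assumed here purely for illustration.

import numpy as np
import pandas as pd

class FakeContainer:
    """Stand-in for the project's DataFrameContainer (hypothetical)."""
    def __init__(self, dataframe):
        self.dataframe = dataframe

# 30 hypothetical series of length 50, stored one per row in a column
rng = np.random.RandomState(0)
df = pd.DataFrame({"curve": [rng.randn(50) for _ in range(30)]})

result = kshape(FakeContainer(df), data_column="curve", n_clusters=3, random_state=0)
print(result.dataframe["KSHAPE_CLUSTER"].value_counts())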
def kshape_grid_iter(X_partitioned: List[np.array],
                     kshape_kwargs: dict) -> Tuple[KShape, int]:
    seed_ixs = [np.random.randint(0, X.shape[0] - 1) for X in X_partitioned]
    centroid_seeds = np.array(
        [X_partitioned[i][seed] for i, seed in enumerate(seed_ixs)])
    init = np.swapaxes(np.array([centroid_seeds]).T, 0, 1)

    kshape = KShape(n_clusters=len(seed_ixs),
                    init=init,
                    verbose=True,
                    random_state=None,
                    **kshape_kwargs)

    X = np.vstack(X_partitioned)

    print('** Fitting ks model **')
    kshape.fit(X)

    print('** Predicting **')
    n_clusters_out = np.unique(kshape.predict(X)).size

    # until the tslearn hyper-param json issue is released in the latest pypi version
    kshape.init = kshape.init.tolist()

    return kshape, n_clusters_out
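
A hedged usage sketch of kshape_grid_iter with three hypothetical pre-partitioned groups of univariate series; one seed series is drawn from each partition, so the model is given len(partitions) clusters, and kshape_kwargs supplies the remaining KShape options:

import numpy as np

rng = np.random.RandomState(0)
partitions = [rng.randn(10, 60), rng.randn(12, 60), rng.randn(8, 60)]

ks_model, n_clusters_found = kshape_grid_iter(
    partitions, kshape_kwargs={"n_init": 1, "max_iter": 50})
print(n_clusters_found)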
Example #14
 def plot_best_shape(self, data, num_cluster):
     """
     time series cluster plot
     :param df:
     :param num_cluster:
     :return:
     """
     ks = KShape(n_clusters=num_cluster,
                 n_init=5,
                 verbose=True,
                 random_state=self.seed)
     y_pred = ks.fit_predict(data)
     for yi in range(num_cluster):
         for xx in data[y_pred == yi]:
             plt.plot(xx.ravel(), "k-", alpha=.3)
         plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
         plt.text(0.55,
                  0.85,
                  'Cluster %d' % (yi + 1),
                  transform=plt.gca().transAxes)
         plt.tight_layout()
         plt.show()
def run_single(X, train, params, workdir, out):
    kwargs = params
    ks = KShape(**kwargs)

    ks.fit(train)

    print('**** Predicting ****')
    y_pred = ks.predict(X)

    ks_path = os.path.join(workdir, 'ks.pickle')
    pickle.dump(ks, open(ks_path, 'wb'))

    y_pred_path = os.path.join(workdir, 'y_pred.npy')
    np.save(y_pred_path, y_pred)

    train_path = os.path.join(workdir, 'train.npy')
    np.save(train_path, train)

    with open(out, 'w') as f:
        f.write('1')

    print('* Done! *')
Example #16
def run(data_path: str, params_path: str):
    X = np.load(data_path)

    params = pickle.load(open(params_path, 'rb'))
    workdir = params['workdir']

    out = os.path.join(workdir, 'out')
    with open(out, 'w') as f:
        f.write('0')

    print(f'Using work dir: {workdir}')

    print('** Fitting training data **')
    n_train = int((params['kwargs'].pop('train_percent') / 100) * X.shape[0])
    train = X[np.random.choice(X.shape[0], size=n_train, replace=False)]

    kwargs = params['kwargs']
    ks = KShape(**kwargs)

    ks.fit(train)

    print('**** Predicting ****')
    y_pred = ks.predict(X)

    ks_path = os.path.join(workdir, 'ks.pickle')
    pickle.dump(ks, open(ks_path, 'wb'))

    y_pred_path = os.path.join(workdir, 'y_pred.npy')
    np.save(y_pred_path, y_pred)

    train_path = os.path.join(workdir, 'train.npy')
    np.save(train_path, train)

    with open(out, 'w') as f:
        f.write('1')

    print('* Done! *')
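
A matching hedged sketch for loading the artefacts this script writes back into memory (the workdir value below is hypothetical; the file names are the ones used above):

import os
import pickle

import numpy as np

workdir = '/path/to/workdir'  # hypothetical, same value as params['workdir']

with open(os.path.join(workdir, 'ks.pickle'), 'rb') as f:
    ks = pickle.load(f)

y_pred = np.load(os.path.join(workdir, 'y_pred.npy'))
train = np.load(os.path.join(workdir, 'train.npy'))
print(y_pred.shape, train.shape)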
Example #17
def test_serialize_kshape():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    X = TimeSeriesScalerMeanVariance().fit_transform(time_series)

    ks = KShape(n_clusters=3, verbose=True)

    _check_not_fitted(ks)

    ks.fit(X)

    _check_params_predict(ks, X, ['predict'])

    seed_ixs = [numpy.random.randint(0, X.shape[0] - 1) for i in range(3)]
    seeds = numpy.array([X[i] for i in seed_ixs])

    ks_seeded = KShape(n_clusters=3, verbose=True, init=seeds)

    _check_not_fitted(ks_seeded)

    ks_seeded.fit(X)

    _check_params_predict(ks_seeded, X, ['predict'])
Example #18
 def __init__(
     self,
     num_clusters: int,
     clustering_method: str,
     kmeans_metric: str,
     max_iter: int,
 ):
     super().__init__()
     self.num_clusters = num_clusters
     self.kmeans_metric = kmeans_metric
     self.clustering_method = clustering_method
     self.max_iter = max_iter
     self.test_metric = purity
     self.final_labels = None
     if self.clustering_method == "kshape":
         self.cluster_model = KShape(n_clusters=self.num_clusters,
                                     max_iter=self.max_iter)
     elif self.clustering_method == "kmeans":
         self.cluster_model = TimeSeriesKMeans(
             n_clusters=self.num_clusters,
             metric=self.kmeans_metric,
             max_iter=self.max_iter,
         )
formatted_dataset = to_time_series_dataset(pivoted_series)
print("Data shape: {}".format(formatted_dataset.shape))

formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
    formatted_dataset)
sz = formatted_norm_dataset.shape[1]
print("Data shape: {}".format(sz))

formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
    formatted_dataset)

totalColumn = formatted_norm_dataset.shape[0]
totalRow = formatted_norm_dataset.shape[1]

clusters = 5
ks = KShape(n_clusters=clusters, verbose=True, random_state=seed)
y_pred_ks = ks.fit_predict(formatted_norm_dataset)
formatted_norm_dataset.shape
data = formatted_norm_dataset
data.shape

formatted_norm_dataset_2d = formatted_norm_dataset[:, :, 0]
formatted_norm_dataset_2d.shape
#pd.DataFrame(A.T.reshape(2, -1), columns=cols)

df_normalized = pd.DataFrame(formatted_norm_dataset_2d)
df_normalized
#df_normalized = df_normalized.pivot()
# formatted_norm_dataset[0]

df_cluster = pd.DataFrame(y_pred_ks,
def k_shape(X_train, n_clusters, verbose=True, seed=0):
    # k-Shape clustering
    ks = KShape(n_clusters=n_clusters, verbose=verbose, random_state=seed)

    return ks, ks.fit_predict(X_train)
Example #21
    def train(self, X_train, save_memory=False):
        """
        The training phase of the EUDTR model.

        Args:
            X_train: Training dataset
            save_memory: If True, back-propagate the loss term by term to reduce GPU memory usage
        """

        train_torch_dataset = IndexedDatase(X_train, numpy.array(list(range(X_train.shape[0]))))
        train_generator = torch.utils.data.DataLoader(train_torch_dataset, batch_size=self.batch_size, shuffle=True)

        X_train = X_train.swapaxes(1,2)
        ks = KShape(n_clusters=self.n_clusters).fit(X_train)
        labels = ks.labels_
        X_train = X_train.swapaxes(1,2)

        sc_score = -1

        label2index = list2dict(labels)

        for epoch in range(self.epochs):

            epoch_start = time.time()
            self.encoder = self.encoder.train()
            self.encoder = self.encoder.train()

            for batch_num, batch in enumerate(train_generator):

                loss = 0
                indices, data = batch
                pos_samples, neg_samples = pos_neg_sampling(X_train, labels, label2index, indices, data, self.nb_random_samples)
                pos_samples = torch.from_numpy(pos_samples)
                neg_samples = torch.from_numpy(neg_samples).permute(1, 0, 2, 3)

                data = data.to(self.device)
                pos_samples = pos_samples.to(self.device)
                neg_samples = neg_samples.to(self.device)

                self.encoder_optimizer.zero_grad()
                self.decoder_optimizer.zero_grad()

                ref_embedding = self.encoder(data)
                pos_i_embedding = self.encoder(pos_samples)

                # Calculate the PN-Triplet loss and backward
                loss = -torch.mean(torch.nn.functional.logsigmoid(torch.bmm(
                        ref_embedding.view(data.shape[0], 1, self.out_channels),
                        pos_i_embedding.view(data.shape[0], self.out_channels, 1)
                        )))

                if save_memory:
                    loss.backward(retain_graph=True)
                    loss = 0
                    del pos_i_embedding
                    torch.cuda.empty_cache()

                multiplicative_ratio = self.negative_penalty / self.nb_random_samples

                for i in range(self.nb_random_samples):
                    neg_i_embedding = self.encoder(neg_samples[i])
                    loss += multiplicative_ratio * -torch.mean(torch.nn.functional.logsigmoid(-torch.bmm(
                        ref_embedding.view(data.shape[0], 1, self.out_channels),
                        neg_i_embedding.view(data.shape[0], self.out_channels, 1)
                        )))

                    if save_memory:
                        loss.backward(retain_graph=True)
                        loss = 0
                        del neg_i_embedding
                        torch.cuda.empty_cache()

                # Calculate the MI loss and backward
                self.mi_loss(data, neg_samples, self.encoder, self.decoder, save_memory)
        
                self.encoder_optimizer.step()
                self.decoder_optimizer.step()
         
            epoch_end = time.time()
            print('Train--Epoch: ', epoch + 1, " time: ", epoch_end - epoch_start)

            features = self.encode(X_train, self.batch_size)

            '''
            To speed up convergence, stop refining the labels once the silhouette
            score has failed to improve for 3 consecutive KMeans runs.
            '''
            consecutive_failures = 0
            while True:
                km = KMeans(n_clusters=self.n_clusters).fit(features.reshape(features.shape[0], -1))
                temp_score = silhouette_score(features.reshape(features.shape[0], -1), km.labels_)
                if temp_score > sc_score:
                    consecutive_failures = 0
                    sc_score = temp_score
                    print('sc_score changed:',sc_score)
                    labels = km.labels_
                    label2index = list2dict(labels)
                else:
                    consecutive_failures = consecutive_failures + 1
                    if consecutive_failures == 3:
                        break
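
The silhouette-based relabelling at the end of each epoch can be isolated into a small sketch (toy features, scikit-learn only) to illustrate the "stop after three non-improvements" pattern used above:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

rng = np.random.RandomState(0)
features = rng.randn(100, 16)  # hypothetical embeddings

best_score, best_labels, failures = -1.0, None, 0
while True:
    km = KMeans(n_clusters=4, n_init=10).fit(features)
    score = silhouette_score(features, km.labels_)
    if score > best_score:
        best_score, best_labels, failures = score, km.labels_, 0
    else:
        failures += 1
        if failures == 3:
            break
print(best_score)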
from tslearn.clustering import KShape
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

seed = 0
numpy.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
X_train = TimeSeriesScalerMeanVariance().fit_transform(
    X_train[:50])  # Keep only 50 time series
sz = X_train.shape[1]

# k-Shape clustering
ks = KShape(n_clusters=3, verbose=True, random_state=seed)
y_pred = ks.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 1, 1 + yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (yi + 1))

plt.tight_layout()
plt.show()
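
The fitted model can also assign unseen series to the learned shape centroids; a short follow-on sketch reusing ks and the Trace test split loaded above:

# Scale the held-out series the same way, then assign them to the learned clusters
X_test = TimeSeriesScalerMeanVariance().fit_transform(X_test)
print(ks.predict(X_test)[:10])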
Example #23
                        (c + 1)] = input_waves[
                            i, (snipLen * fs + 1) * c:(snipLen * fs + 1) *
                            (c + 1)] / max(
                                np.abs(input_waves[i, (snipLen * fs + 1) *
                                                   c:(snipLen * fs + 1) *
                                                   (c + 1)]))
else:
    for i in range(len(input_waves)):
        input_waves[i, :] = input_waves[i, :] / max(np.abs(input_waves[i, :]))

# save result
if skipClustering == 0:

    # run clustering
    print("Clustering...")
    ks = KShape(n_clusters=numCluster, n_init=1, random_state=0)
    pred = ks.fit_predict(input_waves)

    clustFile = h5py.File(
        templatePath + str(numCluster) + "/" + str(numCluster) +
        "_cluster_predictions_" + str(prefiltFreq[0]) + "-" +
        str(prefiltFreq[1]) + "Hz.h5", "w")
    clustFile.create_dataset("cluster_index", data=pred)
    clustFile.create_dataset("centroids", data=ks.cluster_centers_)
    clustFile.create_dataset("inertia", data=ks.inertia_)
    clustFile.close()

    # load some variables
    centroids = ks.cluster_centers_

if skipClustering:
Example #24
    y_pred = km.fit_predict(X)
    print(y_pred)
elif args.kmeans_algo == 2:
    k_title = "Soft-DTW k-means"
    f_title = "soft_DTW"
    km = TimeSeriesKMeans(n_clusters=num,
                          metric="softdtw",
                          metric_params={"gamma_sdtw": .01},
                          verbose=True,
                          random_state=seed)
    y_pred = km.fit_predict(X)
    print(y_pred)
else:
    k_title = "KShape"
    f_title = "KShape"
    km = KShape(n_clusters=num, verbose=True, random_state=seed)
    y_pred = km.fit_predict(X)
    print(y_pred)

plt.figure()
for yi in range(num):
    plt.subplot(num, 1, yi + 1)
    for xx in X[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    if y_min < 100 and y_max > -100:
        plt.ylim(y_min - 1, y_max + 1)
    if yi == 0:
        plt.title(k_title)
    if args.anon:
Example #25
def main(argv):
    # define global timer to obtain global execution time
    start_global = timer()
    
    # define globals variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data
    
    #############################################################################################
    # Input arguments parsing
    #############################################################################################
    
    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling) ' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        '  -c / --clusters <number_clusters>  # set number of clusters (default 3) \n\n' \
        '  -i / --ifile <input_file>          # set input filename \n' \
        '  -n / --normalise                   # normalise input data \n' \
        '  -s / --standardise                 # standardise input data \n\n' \
        '  -a / --all                         # perform all 5 implemented methods of clustering: \n' \
        '                                       euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        '  -E / --euclidean                   # perform euclidean k-means clustering \n' \
        '  -D / --dtw                         # perform dtw k-means clustering \n' \
        '  -S / --soft-dtw                    # perform soft-dtw k-means clustering \n' \
        '  -K / --k-shape                     # perform k-shape clustering \n' \
        '  -G / --gak                         # perform GAK k-means clustering \n'
    
    # Create new object to save arguments
    i_args = Arguments()
    
    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3
    
    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help",
                "clusters=",
                "ifile=",
                "normalise",
                "standardise",
                "all",
                "euclidean",
                "dtw",
                "soft-dtw",
                "k-shape",
                "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)
    
    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            i_args.number_clusters = arg
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True
    
    # normalise maximum number of subplots levels
    n_rows_plot = 8 if n_rows_plot > 8 else n_rows_plot
    
    #############################################################################################
    # Raw data processing stage
    #############################################################################################
    
    # set style to matplotlib plot
    mpl.style.use('seaborn')
    
    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)
    
    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())
    
    # convert raw data to the format which can be used by tslearn
    # (3-d dimensional array)
    # BUILT functionality: adjust all time series to one size
    # (NaN values are appended to the shorter ones)
    formatted_data = to_time_series_dataset(raw_data)
    
    # print shape of new array
    print(formatted_data.shape)
    
    # obtain number of measuring
    n_measuring = formatted_data.shape[1]
    
    # define figure, grid_spec to create layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(
        n_rows_plot,
        i_args.number_clusters
    )
    
    # set A4 size to figure
    fig.set_size_inches(8.5, 11.75)
    
    # setup count of layers of subplots
    count_layer = 3
    # setup first subplot and draw raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])
    
    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)
    
    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()
    # draw title for chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)' % (formatted_data_min, formatted_data_max))

    # obtain and print executing time of data processing stage to console,
    timer_tick = get_time_tick(start_global)
    plt.ion()
    plt.show()
    
    print("Raw data processing time: %s" % timer_tick)
    
    #############################################################################################
    # Data preprocessing stage
    #############################################################################################
    
    start = timer()
    
    # Convert NaNs to value predicted by interpolation
    # linearly interpolate for NaN/NaNs
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)
    
    # Scaling
    # to know should we use normalization or standardization, we need to see
    # the distribution of values.
    
    # take random 3 measuring for each case to draw histograms
    random_indexes = numpy.random.choice(n_measuring, i_args.number_clusters, replace=False)
    
    # create new arrays with values of randomly chosen measurements
    histogram_data = formatted_data[:, random_indexes]
    
    # draw histograms
    for i_histogram in range(i_args.number_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram],
            bins=25, density=True
        )
        
        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy"
                            )
        if i_histogram == 1:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"

            preprocessing = '' if preprocessing == '' else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy', y=1, pad=14
            )
    
    # if no processing data option chosen continue with raw data
    processed_data = formatted_data
    
    # since for this concrete challenge data the distributions are more/less
    # Gaussian/Normal we can use standardization
    
    # normalize data: Min-Max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")
    
    # standardize data: scaling technique where the values are centered around
    # the mean with a unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")
    
    # obtain max value of data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2
    
    timer_tick = get_time_tick(start)
    print("#############################################################################################")
    print("Data processing stage elapsed time: %s" % timer_tick)
    
    #############################################################################################
    # Implementing Euclidean k-means clustering algorithm
    #############################################################################################
    
    if i_args.euclidean_clustering:
        
        start = timer()
        print("Euclidean k-means")
        
        # define parameters of the model of the algorithm
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=i_args.number_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )
        
        # calculate cluster's label array
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)
        
        # draw subplots with attributed clusters of time series as well as
        # cluster centers' lines
        for i_cluster in range(i_args.number_clusters):
            f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                n_measuring, min_data, max_data,
                                                processed_data, euclidean_clustered_data, 'tab:blue')
            
            f_ax_euclidean.plot(
                k_means_euclidean.cluster_centers_[i_cluster].ravel(),
                "tab:green"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_euclidean
        
        # increment count of filled layer of subplots
        count_layer += 1
        
        # obtain processing time, print it to console and
        # add it to the title of the series of subplots
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Euclidean $k$-means (%s)" % timer_tick,
            color='tab:green', y=1, pad=14
        )
        print("#############################################################################################")
        print("Euclidean k-means time processing: %s" % timer_tick)
        
    #############################################################################################
    # Implementing DTW k-means clustering algorithm
    # use dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################
    
    if i_args.dtw_clustering:
        
        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6
                                       )
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                          n_measuring, min_data, max_data,
                                          processed_data, dtw_clustered_data, 'tab:blue')
            
            f_ax_dtw.plot(
                k_means_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:red"
            )
            if i_cluster == 1:
                middle_axis = f_ax_dtw

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "DTW $k$-means (%s)" % timer_tick,
            color='tab:red', y=1, pad=14
        )
        print("#############################################################################################")
        print("DTW k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing soft DTW k-means clustering algorithm
    # use soft dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################
    
    if i_args.soft_dtw_clustering:
        
        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6
                                            )
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                               n_measuring, min_data, max_data,
                                               processed_data, soft_dtw_clustered_data, 'tab:blue')
            
            f_ax_soft_dtw.plot(
                k_means_soft_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:purple"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_soft_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Soft-DTW $k$-means (%s)" % timer_tick,
            color='tab:purple', y=1, pad=14
        )
        print("#############################################################################################")
        print("Soft-DTW k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing k-Shape clustering algorithm
    #############################################################################################
    
    if i_args.k_shape_clustering:
        
        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=i_args.number_clusters,
                         verbose=True,
                         random_state=seed
                         )
        k_shape_clustered_data = k_shape.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            
            min_axe_value = min(min_data, k_shape.cluster_centers_[i_cluster].ravel().min())
            max_axe_value = max(max_data, k_shape.cluster_centers_[i_cluster].ravel().max())
            
            f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                              n_measuring, min_axe_value, max_axe_value,
                                              processed_data, k_shape_clustered_data, 'tab:blue')
            
            f_ax_k_shape.plot(
                k_shape.cluster_centers_[i_cluster].ravel(),
                "tab:orange"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_k_shape

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "$K$-Shape (%s)" % timer_tick,
            color='tab:orange', y=1, pad=14
        )
        print("#############################################################################################")
        print("K-Shape time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing Global Alignment kernel k-means clustering algorithm
    # since kernel is used, there is no centroid of the cluster
    #############################################################################################
    
    if i_args.gak_clustering:
        
        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6
                                   )
        
        gak_clustered_data = gak_k_means.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                  n_measuring, min_data, max_data,
                                                  processed_data, gak_clustered_data, 'tab:blue')
            
            if i_cluster == 1:
                middle_axis = f_ax_gak_k_means

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Global Alignment kernel $k$-means (%s)" % timer_tick,
            color='tab:cyan', y=1, pad=14)
        print("#############################################################################################")
        print("GAK k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    
    # return string with current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    # define the name of the directory to be created
    path = "./out/%s" % now

    print("#############################################################################################")
    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)
    
    try:
        # save figure as pdf to out folder
        fig.savefig("./out/%s/visual_result.pdf" % now)
    
        # save clustering results
        if i_args.euclidean_clustering:
            numpy.savetxt(
                "./out/%s/euclidean_clustering_result.csv" % now,
                euclidean_clustered_data,
                delimiter=","
            )
        if i_args.dtw_clustering:
            numpy.savetxt(
                "./out/%s/dtw_clustering_result.csv" % now,
                dtw_clustered_data,
                delimiter=","
            )
        if i_args.soft_dtw_clustering:
            numpy.savetxt(
                "./out/%s/soft_dtw_clustering_result.csv" % now,
                soft_dtw_clustered_data,
                delimiter=","
            )
        if i_args.k_shape_clustering:
            numpy.savetxt(
                "./out/%s/k_shape_clustering_result.csv" % now,
                k_shape_clustered_data,
                delimiter=","
            )
        if i_args.gak_clustering:
            numpy.savetxt(
                "./out/%s/gak_clustering_result.csv" % now,
                gak_clustered_data,
                delimiter=","
            )
    except RuntimeError:
        print("Saving results failed")
    else:
        print("Successfully saved results in the path %s " % path)

    #############################################################################################
    
    # obtain and print global executing time
    timer_tick = get_time_tick(start_global)
    print("#############################################################################################")
    print("All algorithms elapsed time: % s" % timer_tick)
    
    #############################################################################################

    # render and show plot
    # plt.show()
    plt.draw()
    plt.pause(0.001)
    input("Press [enter] to finish.")
    print("#############################################################################################")
                                         random_state=2019)

X_train = to_time_series_dataset(data_train[:, 1:])
y_train = data_train[:, 0].astype(int)
X_test = to_time_series_dataset(data_test[:, 1:])
y_test = data_test[:, 0].astype(int)
file_name = "教師なし教科書\\13章-時系列クラスタリング\\4_ECG5000_k_shape\\result\\"

# Prepare the data - Scale
X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_train)
X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_test)

# Train using k-Shape
ks = KShape(n_clusters=5,
            max_iter=100,
            n_init=10,
            verbose=1,
            random_state=2019)
ks.fit(X_train)
with open(file_name + 'result.txt', 'w') as f:
    # Predict on train set and calculate adjusted Rand index
    preds = ks.predict(X_train)
    ars = adjusted_rand_score(data_train[:, 0], preds)
    print("Adjusted Rand Index on Training Set:", ars, file=f)

    preds_test = ks.predict(X_test)
    ars = adjusted_rand_score(data_test[:, 0], preds_test)
    print("Adjusted Rand Index on Test Set:", ars, file=f)

    # Evaluate goodness of the clusters
    preds_test = preds_test.reshape(1000, 1)
y_train = data_train[:, 0].astype(int)

data_test = np.loadtxt(current_path + file +
                       "ECGFiveDays\\ECGFiveDays_TEST.tsv")
X_test = to_time_series_dataset(data_test[:, 1:])
y_test = data_test[:, 0].astype(int)
file = "教師なし教科書\\13章-時系列クラスタリング\\3_ECGFiveDays_k_shape\\result\\"

# Prepare the data - Scale
X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_train)
X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_test)


# k-Shape Algorithm
# Train using k-Shape
ks = KShape(n_clusters=2, max_iter=100, n_init=100, verbose=0)
ks.fit(X_train)

# Make predictions on train set and calculate adjusted Rand index
preds = ks.predict(X_train)
ars = adjusted_rand_score(data_train[:, 0], preds)
print("Adjusted Rand Index:", ars)

# Make predictions on test set and calculate adjusted Rand index
preds_test = ks.predict(X_test)
ars = adjusted_rand_score(data_test[:, 0], preds_test)
print("Adjusted Rand Index on Test Set:", ars)

# Results are poor because the training set is small: train 23, test 861
# Adjusted Rand Index: 0.668041237113402
# Adjusted Rand Index on Test Set: 0.012338817789874643
Example #28
# scale mean around zero
input_waves = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(waves)

# run clustering or skip and load results if desired
if skipClustering:
    clustFile = h5py.File(
        templatePath + str(numCluster) + "/" + str(numCluster) +
        "_cluster_predictions_" + str(prefiltFreq[0]) + "-" +
        str(prefiltFreq[1]) + "Hz.h5", "r")
    pred = np.array(list(clustFile["cluster_index"]))
    centroids = list(clustFile["centroids"])
    clustFile.close()
else:
    print("Clustering...")
    ks = KShape(n_clusters=numCluster, n_init=1, random_state=0)
    pred = ks.fit_predict(input_waves)

    clustFile = h5py.File(
        templatePath + str(numCluster) + "/" + str(numCluster) +
        "_cluster_predictions_" + str(prefiltFreq[0]) + "-" +
        str(prefiltFreq[1]) + "Hz.h5", "w")
    clustFile.create_dataset("cluster_index", data=pred)
    clustFile.create_dataset("centroids", data=ks.cluster_centers_)
    clustFile.create_dataset("inertia", data=ks.inertia_)
    clustFile.close()

    modelFile = templatePath + str(numCluster) + "/" + str(
        numCluster) + "_cluster_model_" + str(prefiltFreq[0]) + "-" + str(
            prefiltFreq[1]) + "Hz.h5"
    ks.to_hdf5(modelFile)
Example #29
class TimeSeriesKShapes(BaseClusterer):
    """Kshape algorithm wrapper tslearns implementation.

    Parameters
    ----------
    n_clusters: int, defaults = 8
        The number of clusters to form as well as the number of
        centroids to generate.
    init_algorithm: str or np.ndarray, defaults = 'random'
        Method for initializing cluster centers. Any of the following are valid:
        ['random']. Alternatively, an np.ndarray of shape (n_clusters, ts_size, d)
        giving the initial centers.
    n_init: int, defaults = 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of n_init
        consecutive runs in terms of inertia.
    max_iter: int, defaults = 30
        Maximum number of iterations of the k-means algorithm for a single
        run.
    tol: float, defaults = 1e-4
        Relative tolerance with regards to Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence.
    verbose: bool, defaults = False
        Verbosity mode.
    random_state: int or np.random.RandomState instance or None, defaults = None
        Determines random number generation for centroid initialization.

    Attributes
    ----------
    labels_: np.ndarray (1d array of shape (n_instances,))
        Index of the cluster each time series belongs to.
    inertia_: float
        Sum of squared distances of samples to their closest cluster center, weighted by
        the sample weights if provided.
    n_iter_: int
        Number of iterations run.
    """

    _tags = {
        "capability:multivariate": True,
    }

    def __init__(
        self,
        n_clusters: int = 8,
        init_algorithm: Union[str, np.ndarray] = "random",
        n_init: int = 10,
        max_iter: int = 300,
        tol: float = 1e-4,
        verbose: bool = False,
        random_state: Union[int, RandomState] = None,
    ):
        _check_soft_dependencies("tslearn", severity="error", object=self)

        self.init_algorithm = init_algorithm
        self.n_init = n_init
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0

        self._tslearn_k_shapes = None

        super(TimeSeriesKShapes, self).__init__(n_clusters=n_clusters)

    def _fit(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Fit time series clusterer to training data.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        self:
            Fitted estimator.
        """
        from tslearn.clustering import KShape

        if self._tslearn_k_shapes is None:
            self._tslearn_k_shapes = KShape(
                n_clusters=self.n_clusters,
                max_iter=self.max_iter,
                tol=self.tol,
                random_state=self.random_state,
                n_init=self.n_init,
                verbose=self.verbose,
                init=self.init_algorithm,
            )

        self._tslearn_k_shapes.fit(X)
        self._cluster_centers = self._tslearn_k_shapes.cluster_centers_
        self.labels_ = self._tslearn_k_shapes.labels_
        self.inertia_ = self._tslearn_k_shapes.inertia_
        self.n_iter_ = self._tslearn_k_shapes.n_iter_

    def _predict(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Time series instances to predict their cluster indexes.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        np.ndarray (1d array of shape (n_instances,))
            Index of the cluster each time series in X belongs to.
        """
        return self._tslearn_k_shapes.predict(X)

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.


        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict contains parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`
        """
        params = {
            "n_clusters": 2,
            "init_algorithm": "random",
            "n_init": 1,
            "max_iter": 1,
            "tol": 1e-4,
            "verbose": False,
            "random_state": 1,
        }
        return params

    def _score(self, X, y=None):
        return np.abs(self.inertia_)
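
A hedged usage sketch of this wrapper on a small toy panel; the import path is assumed (in sktime it typically lives under sktime.clustering.k_shapes) and the data is random:

import numpy as np
# from sktime.clustering.k_shapes import TimeSeriesKShapes  # assumed import path

# Hypothetical panel: 20 univariate series of length 50,
# shaped (n_instances, n_dimensions, series_length) as the docstring describes
rng = np.random.RandomState(0)
X = rng.randn(20, 1, 50)

clusterer = TimeSeriesKShapes(n_clusters=3, n_init=2, max_iter=50, random_state=0)
clusterer.fit(X)
print(clusterer.labels_)
print(clusterer.predict(X))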
Example #30
def mass_upload(startDate, endDate, id_unit_usaha):
    print(id_unit_usaha)
    login = ""
    password = ""
    # engine = sqlalchemy.create_engine('mysql+pymysql://energy:energy2x5=10@localhost:3306/pgn')
    engine = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    sql = " SELECT a.IDREFPELANGGAN, a.ID_UNIT_USAHA, 1 AS FSTREAMID, DATEPART(dw, a.FDATETIME) as FDAYOFWEEK, a.FHOUR, AVG(a.FDVC) as AVG_FDVC\
            FROM(SELECT IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR, SUM(FDVC) as FDVC\
                FROM amr_bridge\
                WHERE FDATETIME >= '" + startDate + "'\
                and FDATETIME < '" + endDate + "'\
                GROUP BY IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR) a\
            GROUP BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR\
            ORDER BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR"

    df = pd.read_sql_query(sql, engine)
    totaldf = len(df)
    totaldf = str(totaldf)
    print('total Data: ' + totaldf)

    # rslt_df = df.loc[df['ID_UNIT_USAHA'] == '014']

    # print(startDate)
    # print('\nResult dataframe :\n', rslt_df)

    # df.to_csv('pgn_customer_cluster_v1_{}.csv'.format(id_unit_usaha), index=False)

    # df.to_hdf("amr_bridge_22122020.hdf", key='hdf5')

    # df = pd.read_hdf("amr_bridge_22122020.hdf")

    def select_data(id_unit):
        query = "ID_UNIT_USAHA == '{}'".format(id_unit_usaha)
        columns = ['FDAYOFWEEK', 'FHOUR', 'IDREFPELANGGAN', 'AVG_FDVC']

        # df = df.set_index('FDATETIME')
        df_selected = df.query(query, engine='python')[columns]
        return df_selected

    def pivot_data(df):
        # df_pivoted = df.pivot(index='FDATETIME', columns='IDREFPELANGGAN', values='FDVC')
        df_pivoted = df.pivot(index=['FDAYOFWEEK', 'FHOUR'],
                              columns='IDREFPELANGGAN',
                              values='AVG_FDVC')
        return df_pivoted

    def remove_zerocolumns(df):
        # Get all columns which have all zero values
        cols = df.columns[df.mean() == 0]
        # Drop columns which has all zero values
        df = df.drop(cols, axis=1)
        return df

    df_week1 = select_data(id_unit_usaha)
    df_week1.fillna(0.0, inplace=True)

    df_pivoted1 = pivot_data(df_week1)
    df_pivoted1.fillna(0.0, inplace=True)

    df_pivoted1 = remove_zerocolumns(df_pivoted1)
    cols = list(df_pivoted1.columns)
    df_pivoted1.head()

    # Function to plot cluster

    # def plot_clusters(ds, y_pred, n_clusters, ks, filename):
    #     plt.figure(figsize=(12, 40))
    #     for yi in range(n_clusters):
    #         plt.subplot(n_clusters, 1, 1 + yi)
    #         for xx in ds[y_pred == yi]:
    #             plt.plot(xx.ravel(), "k-", alpha=.2)
    #         plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    #         plt.xlim(0, sz)
    #         plt.ylim(-7, 7)
    #         plt.title("Cluster %d" % (yi))

    #     plt.tight_layout()
    #     plt.savefig(filename, format='jpg', dpi=300, quality=95)
    #     plt.show()

    def create_cluster_info(y_pred, cols):

        df_cluster = pd.DataFrame(y_pred.copy(),
                                  index=cols.copy(),
                                  columns=['cluster'])
        df_cluster.reset_index(inplace=True)
        df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True)

        unique_cluster = df_cluster['cluster'].unique()

        # Get ID ref based on cluster
        idrefs_list = []
        for i, x in enumerate(unique_cluster):
            idref_list = df_cluster.query(
                "cluster == {}".format(x))['idrefpelanggan'].values.tolist()
            # idrefs_list[x] = idref_list

            # Create dictionary
            idref_cluster_dict = {'cluster': x, 'idrefpelanggan': idref_list}
            idrefs_list.append(idref_cluster_dict)

        idrefs_cluster = pd.DataFrame(idrefs_list)
        return idrefs_cluster

    # def run_once(startime, totalData, _has_run=[]):
    #     if _has_run:
    #         return
    #     # print("run_once doing stuff")
    #     print(startime)
    #     endtime = time.time_ns()
    #     print(endtime)
    #     invTime = endtime-startime

    #     estTime = invTime * totalData
    #     _has_run.append(1)

    #     print(totalData)
    #     print(estTime)
    #     return estTime

    seed = 0
    np.random.seed(seed)

    # Convert data frame to list of series
    pivoted_series = []
    pivoted_columns = []
    for i, y in enumerate(cols):
        length = len(df_pivoted1[y])
        cst = df_pivoted1[y].values
        pivoted_series.append(cst)
        pivoted_columns.append(y)

    # Convert the data set to the standard time series format
    formatted_dataset = to_time_series_dataset(pivoted_series)
    print("Data shape: {}".format(formatted_dataset.shape))

    formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
        formatted_dataset)
    sz = formatted_norm_dataset.shape[1]
    print("Data shape: {}".format(sz))

    formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
        formatted_dataset)
    clusters = 5
    totalColumn = formatted_norm_dataset.shape[0]
    totalRow = formatted_norm_dataset.shape[1]
    totalData = totalRow * totalColumn + totalRow * clusters

    ks = KShape(n_clusters=clusters, verbose=True, random_state=seed)
    y_pred_ks = ks.fit_predict(formatted_norm_dataset)
    formatted_norm_dataset.shape
    data = formatted_norm_dataset
    data.shape

    formatted_norm_dataset_2d = formatted_norm_dataset[:, :, 0]
    formatted_norm_dataset_2d.shape
    # pd.DataFrame(A.T.reshape(2, -1), columns=cols)

    df_normalized = pd.DataFrame(formatted_norm_dataset_2d)
    df_normalized
    # df_normalized = df_normalized.pivot()
    # formatted_norm_dataset[0]

    df_cluster = pd.DataFrame(y_pred_ks,
                              index=pivoted_columns,
                              columns=['cluster'])
    df_cluster.reset_index(inplace=True)
    df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True)
    df_cluster.sort_values(['cluster'])

    df_normalized_detail = pd.DataFrame.join(df_normalized, df_cluster)
    df_normalized_detail

    # df_cluster.to_csv('pgn_customer_cluster_{}.csv'.format(
    #     id_unit_usaha), index=False)

    # Create data frame for customer and its cluster
    create_cluster_info(y_pred_ks, cols)

    # plot_clusters(formatted_norm_dataset, y_pred_ks, clusters, ks,
    #               'pgn_customer_cluster_{}.jpg'.format(id_unit_usaha))

    # engine2 = sqlalchemy.create_engine(
    #     'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    # Session = sessionmaker(bind=engine2)
    # session = Session()

    # Base = declarative_base()

    # class PL_CUSTOMER_CLUSTER(Base):

    #     __tablename__ = 'PL_CUSTOMER_CLUSTER'

    #     ID = Column(Integer, primary_key=True)
    #     DATE_STAMP = Column(DateTime)
    #     IDREFPELANGGAN = Column(String(30))
    #     HOUR_NUM = Column(Integer)
    #     CLUSTER_NUM = Column(Integer)
    #     HOUR_NUM = Column(Integer)
    #     FDVC_NORMALIZED = Column(Float)
    #     AREA_ID = Column(String(5))
    # startime = time.time_ns()
    # for i in range(totalColumn):

    #     idref = df_normalized_detail.iloc[i, totalRow]
    #     cluster = int(df_normalized_detail.iloc[i, totalRow+1])
    #     print("idref = " + idref)
    #     cluster_num = df_normalized_detail.iloc[i, totalRow-1]
    #     for j in range(totalRow):

    #         hour_num = df_normalized_detail.columns[j]
    #         fdvc = df_normalized_detail.iloc[i, j]

    #         sql = ""

    #         # insert into table
    #         item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate, IDREFPELANGGAN=idref,
    #                                    HOUR_NUM=hour_num, CLUSTER_NUM=cluster, FDVC_NORMALIZED=fdvc, AREA_ID=id_unit_usaha)
    #         session.add(item)

    #     # commit per id ref pelanngan
    #     session.commit()

    engine2 = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    Session = sessionmaker(bind=engine2)
    session = Session()

    Base = declarative_base()

    class PL_CUSTOMER_CLUSTER(Base):
        __tablename__ = 'PL_CUSTOMER_CLUSTER'

        ID = Column(Integer, primary_key=True)
        DATE_STAMP = Column(DateTime)
        IDREFPELANGGAN = Column(String(30))
        HOUR_NUM = Column(Integer)
        CLUSTER_NUM = Column(Integer)
        FDVC_NORMALIZED = Column(Float)
        AREA_ID = Column(String(5))

    df_normalized_detail

    for i in range(5):
        print("cluster: " + str(i))
        CLUSTER_NAME = "CENTROID_ID" + str(i)
        cluster = i
        for j in range(totalRow):
            fdvc_norm = ks.cluster_centers_[i][j][0]
            hour_num = j

            sql = ""
            item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate,
                                       IDREFPELANGGAN=CLUSTER_NAME,
                                       HOUR_NUM=hour_num,
                                       CLUSTER_NUM=cluster,
                                       FDVC_NORMALIZED=fdvc_norm,
                                       AREA_ID=id_unit_usaha)
            session.add(item)
            print("fdvc:" + str(fdvc_norm) + "Hour:" + str(hour_num))
        # commit per customer ID ref (idrefpelanggan)
        session.commit()
        print(str(j) + ", " + str(fdvc_norm))

    return totalData