Example #1
def elbowmethod(df: pd.DataFrame):
    """
    Finds the best number of clusters and plots the silhouette score
    against the number of clusters.
    Args:
        df (pd.DataFrame): columns are (country (index), year_week, value)
    Returns:
        int: best number of clusters
    """
    import numpy as np
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    import matplotlib.pyplot as plt

    scores = []
    X = df.values[..., np.newaxis]  # shape (n_series, n_timestamps, 1)

    K = range(2, df.shape[0])
    for k in K:
        # Build and fit the model
        model = TimeSeriesKMeans(n_clusters=k, metric="softdtw", max_iter=50)
        model.fit(X)
        scores.append(silhouette_score(X, model.labels_, metric="softdtw"))

    plt.plot(K, scores, 'bx-')
    plt.xlabel('Values of K')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette score per number of clusters')
    plt.show()

    best_num_cluster = np.argmax(scores) + 2  # K starts at 2
    return best_num_cluster
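A minimal usage sketch (hypothetical, with synthetic data; elbowmethod expects one series per row of a wide-format frame):

import numpy as np
import pandas as pd

# Synthetic wide-format data: 8 countries x 20 weekly values (illustration only)
rng = np.random.RandomState(0)
df = pd.DataFrame(rng.rand(8, 20),
                  index=['country_%d' % i for i in range(8)])

k = elbowmethod(df)
print('Best number of clusters:', k)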
Example #2
def test_serialize_timeserieskmeans():
    n, sz, d = 15, 10, 3
    rng = numpy.random.RandomState(0)
    X = rng.randn(n, sz, d)

    dba_km = TimeSeriesKMeans(n_clusters=3,
                              n_init=2,
                              metric="dtw",
                              verbose=True,
                              max_iter_barycenter=10)

    _check_not_fitted(dba_km)

    dba_km.fit(X)

    _check_params_predict(dba_km, X, ['predict'])

    sdtw_km = TimeSeriesKMeans(n_clusters=3,
                               metric="softdtw",
                               metric_params={"gamma": .01},
                               verbose=True)

    _check_not_fitted(sdtw_km)

    sdtw_km.fit(X)

    _check_params_predict(sdtw_km, X, ['predict'])
Example #3
    def extract_clusters(self, new_features):
        print("Extracting clusters ...")

        km = TimeSeriesKMeans(n_clusters=2, random_state=42)
        km.fit(new_features)
        y_label = km.labels_
        new_features['km_clusters'] = y_label

        print("Extracting clusters ... DONE.")
        return new_features
Example #4
def cluster_annotation_dimension(data, n_clusters=3):
    data = np.array([list(moving_average(d, 2)) for d in data])

    clustering = TimeSeriesKMeans(n_clusters=n_clusters, metric="euclidean")
    clustering.fit(data)

    clusters = []
    for cluster_ix in range(n_clusters):
        cl = np.where(clustering.labels_ == cluster_ix)[0]
        clusters.append(data[cl])

    return clusters
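The snippet assumes a moving_average helper defined elsewhere; a minimal stand-in plus a hypothetical call might look like:

import numpy as np
from tslearn.clustering import TimeSeriesKMeans

def moving_average(x, w):
    # simple stand-in for the helper the snippet assumes
    return np.convolve(x, np.ones(w) / w, mode='valid')

rng = np.random.RandomState(0)
data = rng.rand(12, 50)  # 12 annotation curves of length 50
clusters = cluster_annotation_dimension(data, n_clusters=3)
print([c.shape for c in clusters])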
Example #5
def init(X, l, k):
    # A good initial start improves the convergence speed.
    # Note: `psi` is a free variable, assumed to be defined at module level.
    seed = 0
    sdtw_km = TimeSeriesKMeans(n_clusters=k,
                               metric="euclidean",
                               max_iter=10,
                               random_state=seed)
    sdtw_km.fit(X)
    # G_init: one-hot cluster memberships, perturbed with uniform noise
    G_init = np.zeros((sdtw_km.labels_.size, sdtw_km.labels_.max() + 1))
    G_init[np.arange(sdtw_km.labels_.size), sdtw_km.labels_] = 1
    G_init = G_init + np.random.rand(G_init.shape[0], G_init.shape[1])
    # F_init: cluster centroids, perturbed with noise scaled by 2**psi
    F_init = sdtw_km.cluster_centers_[:, :, 0] + 2**psi * np.random.rand(k, l)
    return F_init, G_init
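The function builds an NMF-style initialization: G_init holds noisy one-hot memberships, F_init noisy centroids. A hypothetical call, assuming psi is set in the enclosing module (the value below is illustrative):

import numpy as np

psi = -3  # assumed module-level noise-scale exponent
rng = np.random.RandomState(0)
X = rng.rand(20, 12)  # 20 series of length 12
F_init, G_init = init(X, l=12, k=3)
print(F_init.shape, G_init.shape)  # (3, 12), (20, 3) when no cluster is empty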
Example #6
def get_fitted_model(dataset, clusters = 3, timepoints = 0, verbose = False,
                     return_dataset = True):
    if timepoints!=0:
        dataset = dataset[:,:timepoints]
    if verbose: print(f'Segmenting data into {clusters} clusters...')
    model = TimeSeriesKMeans(
        n_clusters = clusters,
        n_init = 10,
        metric = 'dtw',         #Dynamic time warping
        verbose = verbose,
        n_jobs = -1             #Use all cores
        )
    model.fit(dataset)
    return model
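A hypothetical call on synthetic data (DTW on random series is purely illustrative; the return_dataset flag is accepted but unused by the snippet):

import numpy as np

rng = np.random.RandomState(42)
dataset = rng.rand(10, 40)  # 10 series, 40 time points each
model = get_fitted_model(dataset, clusters=3, timepoints=20, verbose=True)
print(model.labels_)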
Example #7
def main():
    X1 = to_time_series_dataset(mock_dataset_muscle1)
    y1 = mock_labels
    X_train1 = X1[:-2]
    y_train1 = y1[:-2]
    X_test1 = X1[-2:]
    y_test1 = y1[-2:]
    # clf1 = KNeighborsTimeSeriesClassifier(n_neighbors=5, metric="dtw")
    # NOTE: k-means ignores y during fit and predicts cluster indices,
    # not class labels, so the comparison below is only meaningful if
    # clusters happen to align with classes.
    clf1 = TimeSeriesKMeans(metric="dtw")
    clf1.fit(X_train1, y_train1)
    pred_train1 = clf1.predict(X_train1)
    pred_test1 = clf1.predict(X_test1)
    print("TEST SET 1")
    print("Prediction: " + str(pred_test1))
    print("Actual: " + str(y_test1))

    print("\n")

    X2 = to_time_series_dataset(mock_dataset_muscle2)
    y2 = mock_labels
    X_train2 = X2[:-2]
    y_train2 = y2[:-2]
    X_test2 = X2[-2:]
    y_test2 = y2[-2:]
    clf2 = TimeSeriesKMeans(metric="dtw")
    # clf2 = KNeighborsTimeSeriesClassifier(n_neighbors=5, metric="dtw")
    clf2.fit(X_train2, y_train2)
    pred_train2 = clf2.predict(X_train2)
    pred_test2 = clf2.predict(X_test2)
    print("TRAINING SET 2")
    print("Prediction: " + str(pred_test2))
    print("Actual: " + str(y_test2))

    print("\n")

    times_train = mock_times[:-2]
    times_test = mock_times[-2:]
    X_train = np.stack((pred_train1, pred_train2, times_train)).transpose()
    X_test = np.stack((pred_test1, pred_test2, times_test)).transpose()
    y_train = np.array(mock_labels[:-2]).reshape((len(X_train), ))
    y_test = mock_labels[-2:]
    sgd = SGDClassifier()
    sgd.fit(X_train, y_train)
    pred = sgd.predict(X_test)
    print("ENSEMBLE")
    print("Prediction: " + str(pred))
    print("Actual: " + str(y_test))
    print("Score: " + str(sgd.score(X_test, y_test)))
Example #8
def run():
    parser = cli_parser()
    args = parser.parse_args()

    nii = image.index_img(args.input, slice(0, 30))
    masker = input_data.NiftiMasker()
    data = masker.fit_transform(nii)
    ds = to_time_series_dataset(data.T[::80, :])

    model = TimeSeriesKMeans(n_clusters=2, metric="dtw", max_iter=15)
    model.fit(ds)

    all_series = to_time_series_dataset(data.T)

    mask = model.predict(all_series)
    mask_nii = masker.inverse_transform(mask)
    mask_nii.to_filename(args.output)
Example #9
def get_fitted_model(dataset,
                     clusters=3,
                     start=0,
                     end=0,
                     verbose=False,
                     return_dataset=True):
    if end != 0 or start != 0:
        dataset = dataset[:, start:end]
    if verbose: print(f'Segmenting data into {clusters} clusters...')
    metric_params = {'global_constraint': 'sakoe_chiba'}
    model = TimeSeriesKMeans(
        n_clusters=clusters,
        n_init=1,
        metric='dtw',  #Dynamic time warping
        verbose=verbose,
        n_jobs=-1,  #Use all cores
        metric_params=metric_params)
    model.fit(dataset)
    return model
Example #10
def multi_plot(row, col, fs_tuple, sy_bool, sx_bool, X, num_cluster, lineW, ts=False, labels=None):

    if ts:
        # fit a Euclidean k-means model and use its cluster labels
        model = TimeSeriesKMeans(n_clusters=num_cluster, tol=1e-05, metric='euclidean')
        fitted_model = model.fit(X)
        labels = fitted_model.predict(X)

    f, axes = plt.subplots(row, col, figsize=fs_tuple, sharey=sy_bool, sharex=sx_bool)

    labelsize = 10
    fontsize = 10
    xticks = [0, 16, 32, 48, 64, 80]
    xticklabels = [str(300 + 80 * i) for i in np.arange(6)]

    cluster_pool = np.unique(labels)
    for index, i_cluster in enumerate(cluster_pool):
        sub_mat = X[labels == i_cluster, :]
        # map the flat cluster index onto the subplot grid
        figrow, figcol = np.unravel_index(index, (row, col))
        # plot
        if row > 1 and col > 1:
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figrow, figcol].plot(sub_mat[i_curve, :], 'r', linewidth=lineW)
            # after plotting, tidy up the axes
            for i_col in range(col):
                axes[-1, i_col].set_xticks(xticks)
                axes[-1, i_col].set_xticklabels(xticklabels)
                axes[-1, i_col].set_xlabel('Wavelength [nm]', fontsize=fontsize)
                axes[-1, i_col].tick_params(axis='x', labelsize=labelsize)
            for i_row in range(row):
                axes[i_row, 0].set_yticks([-1, 0, 1])
                axes[i_row, 0].tick_params(axis='y', labelsize=labelsize)
        elif row > 1 and col == 1:
            # axes is one-dimensional when there is a single column
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figrow].plot(sub_mat[i_curve, :], 'r', linewidth=lineW)
            axes[-1].set_xticks(xticks)
            axes[-1].set_xticklabels(xticklabels)
            axes[-1].set_xlabel('Wavelength [nm]', fontsize=fontsize)
            axes[-1].tick_params(axis='x', labelsize=labelsize)
            for i_row in range(row):
                axes[i_row].set_yticks([-1, 0, 1])
                axes[i_row].tick_params(axis='y', labelsize=labelsize)
        elif row == 1 and col > 1:
            # axes is one-dimensional when there is a single row
            for i_curve in range(np.shape(sub_mat)[0]):
                axes[figcol].plot(sub_mat[i_curve, :], 'r', linewidth=lineW)
            for i_col in range(col):
                axes[i_col].set_xticks(xticks)
                axes[i_col].set_xticklabels(xticklabels)
                axes[i_col].set_xlabel('Wavelength [nm]', fontsize=fontsize)
                axes[i_col].tick_params(axis='x', labelsize=labelsize)
            axes[0].set_yticks([-1, 0, 1])
            axes[0].tick_params(axis='y', labelsize=labelsize)
    return (f, axes)
Example #11
def calc_kmeans(df_scaled, metric, n_clusters, name):
    file_name = 'models/ts_{}_{}.pickle'.format(name, n_clusters)
    if not path.exists(file_name):
        ts_kmeans = TimeSeriesKMeans(n_clusters=n_clusters,
                                     metric=metric,
                                     n_jobs=20,
                                     max_iter=10)
        ts_kmeans.fit(df_scaled)
        with open(file_name, 'wb') as f:
            pickle.dump(ts_kmeans, f)
    else:
        with open(file_name, 'rb') as f:
            ts_kmeans = pickle.load(f)

    for cluster_number in range(n_clusters):
        plt.plot(ts_kmeans.cluster_centers_[cluster_number, :, 0].T,
                 label=cluster_number)
    plt.title("Cluster centroids")
    plt.legend()
    plt.show()
    return ts_kmeans
Example #12
def find_kmeans(df_scaled, metric, clusters):
    distortions = []
    silhouette = []
    daviesbouldin = []
    K = range(1, clusters)
    for k in tqdm(K):
        kmeanModel = TimeSeriesKMeans(n_clusters=k,
                                      metric=metric,
                                      n_jobs=20,
                                      max_iter=10)
        #kmeanModel = TimeSeriesKMeans(n_clusters=k, metric="euclidean", n_jobs=6, max_iter=10)
        kmeanModel.fit(df_scaled)
        distortions.append(kmeanModel.inertia_)
        if k > 1:
            silhouette.append(silhouette_score(df_scaled, kmeanModel.labels_))
            daviesbouldin.append(
                davies_bouldin_score(df_scaled, kmeanModel.labels_))

    plt.figure(figsize=(10, 4))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('Elbow Method')
    plt.show()

    plt.figure(figsize=(10, 4))
    plt.plot(K[1:], silhouette, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Silhouette score')
    plt.title('Silhouette')
    plt.show()

    plt.figure(figsize=(10, 4))
    plt.plot(K[1:], daviesbouldin, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Davies-Bouldin score')
    plt.title('Davies-Bouldin')
    plt.show()
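A hypothetical call on synthetic data, with the module-level imports the snippet relies on (tqdm, matplotlib, and the sklearn metrics) made explicit:

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from tslearn.clustering import TimeSeriesKMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

rng = np.random.RandomState(0)
df_scaled = rng.rand(30, 24)  # 30 scaled series of length 24
find_kmeans(df_scaled, metric="euclidean", clusters=8)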
Example #13
def tsKMeans_num_cluster(X, n_trials, max_n_cluster):
    min_n_cluster = 2
    v_clusters = np.arange(min_n_cluster, max_n_cluster)
    n_seeds = n_trials
    # recorder: rows are cluster counts, columns are seeds
    sc_recorder = np.zeros((len(v_clusters), n_seeds))
    for i_seed in range(n_seeds):
        for num_cluster in v_clusters:
            model = TimeSeriesKMeans(n_clusters=num_cluster, tol=1e-05,
                                     metric='euclidean', random_state=i_seed)
            fitted_model = model.fit(X)
            y_pred = fitted_model.predict(X)
            s_sc = sklearn.metrics.silhouette_score(X, y_pred, metric='euclidean')
            sc_recorder[num_cluster - min_n_cluster, i_seed] = s_sc
    return sc_recorder
Example #14
def construct_model(best_num_cluster: int, df: pd.DataFrame):
    """
    Constructs the model, checks the stability of the clustering and produces
    cluster plots using the model.
    Args:
        best_num_cluster (int): best number of clusters
        df (pd.DataFrame): columns are (country (index), year_week, value)
    Returns:
        model
    """
    import numpy as np
    from tslearn.clustering import TimeSeriesKMeans
    from sklearn.metrics import normalized_mutual_info_score
    import matplotlib.pyplot as plt

    model = TimeSeriesKMeans(n_clusters=best_num_cluster,
                             metric="softdtw",
                             max_iter=50)

    # Test whether the clusters are stable by fitting twice and comparing labels
    model1 = TimeSeriesKMeans(n_clusters=best_num_cluster,
                              metric="softdtw",
                              max_iter=50)
    model1.fit(df.values[...,
                         np.newaxis])  # we need the output to be visualized

    model2 = TimeSeriesKMeans(n_clusters=best_num_cluster,
                              metric="softdtw",
                              max_iter=50)
    model2.fit(df.values[...,
                         np.newaxis])  # we need the output to be visualized
    print(
        normalized_mutual_info_score(model1.labels_, model2.labels_)
    )  # scale the results between 0 (no mutual information) and 1 (perfect correlation)

    ###### Plotting the model ######

    model.fit(df.values[..., np.newaxis])
    plt.figure()
    sz = df.shape[1]
    ylim = df.values.max()
    for yi in range(best_num_cluster):
        plt.subplot(best_num_cluster, best_num_cluster, yi + 1)
        #for xx in X_train[y_pred == yi]:
        #plt.plot(xx.ravel(), "k-", alpha=.2)
        plt.plot(model.cluster_centers_[yi].ravel(), "r-")
        plt.xlim(0, sz)
        plt.ylim(0, ylim)
        plt.text(0.55,
                 0.85,
                 'Cluster %d' % (yi + 1),
                 transform=plt.gca().transAxes)
        if yi == 1:
            plt.title("DTW $k$-means")

    return model
Example #15
        def kmeans(n_shapelets, shp_len, n_draw=1000):
            """Sample subseries from the timeseries and apply K-Means on them"""
            # Sample `n_draw` subseries of length `shp_len`
            n_ts, sz = len(X), self._min_length
            indices_ts = np.random.choice(n_ts, size=n_draw, replace=True)
            start_idx = np.random.choice(sz - shp_len + 1,
                                         size=n_draw,
                                         replace=True)
            end_idx = start_idx + shp_len

            subseries = np.zeros((n_draw, shp_len))
            for i in range(n_draw):
                subseries[i] = X[indices_ts[i]][start_idx[i]:end_idx[i]]

            tskm = TimeSeriesKMeans(n_clusters=n_shapelets,
                                    metric="euclidean",
                                    verbose=False)
            return tskm.fit(subseries).cluster_centers_
Example #16
def kmeans(X, n_shapelets, min_len, max_len, n_draw=None):
    """Sample subseries from the timeseries and apply K-Means on them"""
    # Sample `n_draw` subseries of length `shp_len`
    if n_shapelets == 1:
        return random_shapelet(X, n_shapelets, min_len, max_len)
    if n_draw is None:
        n_draw = max(n_shapelets, int(np.sqrt(len(X))))
    shp_len = np.random.randint(4, min(min_len, max_len))
    indices_ts = np.random.choice(len(X), size=n_draw, replace=True)
    start_idx = np.random.choice(min_len - shp_len, size=n_draw, replace=True)
    end_idx = start_idx + shp_len

    subseries = np.zeros((n_draw, shp_len))
    for i in range(n_draw):
        subseries[i] = X[indices_ts[i]][start_idx[i]:end_idx[i]]

    tskm = TimeSeriesKMeans(n_clusters=n_shapelets,
                            metric="euclidean",
                            verbose=False)
    return tskm.fit(subseries).cluster_centers_
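A hypothetical call with n_shapelets > 1 (so the random_shapelet branch is never taken) on equal-length synthetic series:

import numpy as np
from tslearn.clustering import TimeSeriesKMeans

rng = np.random.RandomState(0)
X = rng.rand(20, 30)  # 20 series of length 30, so min_len == max_len == 30
shapelets = kmeans(X, n_shapelets=2, min_len=30, max_len=30)
print(shapelets.shape)  # (2, shp_len, 1)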
Example #17
def subsequence_clustering(sequence, changepoints, y_label='y', norm=False):
    """
    Clusters subsequences of a time series, split at the given changepoints.
    Uses the silhouette score to determine the number of clusters.
    :param sequence: np array of the time series
    :param changepoints: detected changepoints at which subsequences are built
    :param y_label: name of the y-label in the plot
    :param norm: normalise the data using MinMaxScaler
    :return: dict mapping cluster labels to their subsequence ids
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    sub_ids = []
    x_index = []
    X = []
    i = 0
    end_p = [len(sequence) - 1]
    for cp in changepoints + end_p:
        X.append(sequence[i:cp])
        index = 'sub_' + str(i) + '_' + str(cp)
        sub_ids.append(index)
        x_index.append([x_id for x_id in range(i, cp + 1)])
        i = cp

    # Normalize the data (y = (x - min) / (max - min))
    if norm:
        X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)
    #  Find optimal # clusters by
    #  looping through different configurations for # of clusters and store the respective values for silhouette:
    sil_scores = {}
    for n in range(2, len(changepoints)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X,
                                          model_tst.predict(X),
                                          metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    print('Number of Clusters in subsequence clustering: ' + str(opt_k))
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(sub_ids, x_index, model.labels_)),
                              columns=['metric', 'x_index', 'cluster'])
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()

    print('Plotting Clusters')
    #  plot changepoints as vertical lines
    for cp in changepoints:
        plt.axvline(x=cp, ls=':', lw=2, c='0.65')
    #  preprocessing for plotting cluster based
    x_scat = []
    y_scat = []
    cluster = []
    for index, row in df_cluster.iterrows():
        x_seq = row['x_index']
        x_scat.extend(x_seq)
        y_seq = sequence[x_seq[0]:x_seq[-1] + 1]
        y_scat.extend(y_seq)
        label_seq = [row['cluster']]
        cluster.extend(label_seq * len(x_seq))
        # plt.scatter(x_seq, y_seq, label=label_seq)
    # plotting cluster based
    x_scat = np.array(x_scat)
    y_scat = np.array(y_scat)
    cluster = np.array(cluster)  # needed so `cluster == c` is elementwise
    for c in np.unique(cluster):
        i = np.where(cluster == c)
        plt.scatter(x_scat[i], y_scat[i], label=c)
    plt.legend()
    plt.title('Subsequence k-means Clustering')
    plt.xlabel('Time index')
    plt.ylabel(y_label)
    plt.show()

    return cluster_metrics_dict
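A hypothetical call on a synthetic series with three assumed changepoints (at least three are needed so the silhouette loop runs):

import numpy as np

t = np.arange(300)
sequence = np.sin(t / 10.) + np.where(t > 150, 2.0, 0.0)
clusters = subsequence_clustering(sequence, changepoints=[75, 150, 225])
print(clusters)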
Example #18
model = TimeSeriesKMeans(n_clusters=2, metric='dtw', verbose=1)

fig, axes = plt.subplots(1, 3)
for axis in axes:
    axis.set_xlabel('Time')
    axis.set_ylabel('Signal')
    axis.set_ylim((-1.1, 1.1))

for datum in data[0:5]:
    axes[0].plot(datum, color='green')

for datum in data[300:305]:
    axes[1].plot(datum, color='red')
axes[0].set_title('Examples of training data')
axes[1].set_title('Examples of training data')
model.fit(data)
sines_cluster_1 = 0
tris_cluster_1 = 0
sines_cluster_2 = 0
tris_cluster_2 = 0

# this comparator is broken!! -- k-means cluster indices are arbitrary,
# so "label truthy => sine" may be inverted from run to run
for label in model.labels_[:300]:
    if label:
        sines_cluster_1 += 1
    else:
        tris_cluster_1 += 1
for label in model.labels_[300:]:
    if label:
        sines_cluster_2 += 1
    else:
        tris_cluster_2 += 1
Example #19
class SKU_Clusterer:
    def __init__(self, *args, **kwargs):
#clustering params
        self.classifier=None
        self.clusters_indices = {}
        self.n_clusters = int(kwargs['n_clusters'])
        self.use_kmeans = self.n_clusters > 0
        self.kmeans_iterations = int(kwargs['k_means_n_iterations'])
        self.k_means_metric = kwargs.get('k_means_metric', 'euclidean')
        if self.k_means_metric not in ['dtw', 'euclidean', 'softdtw']:
            print('invalid k_means metric, setting to `euclidean`', verbosity=1)
            self.k_means_metric = 'euclidean'
#RNN params
        self.n_epochs = [int(p) for p in (kwargs['rnn_epochs'].split(';'))]
        self.n_steps = [int(p) for p in (kwargs['n_steps']).split(';')][0]
        self.encoder_output_units = [int(p) for p in kwargs['encoder_output_units'].split(';')]
        self.decoder_output_units = [int(p) for p in kwargs['decoder_output_units'].split(';')]
        self.batch_size = [int(p) for p in kwargs['batch_size'].split(';')]
        self.early_stopping = [kwargs['early_stopping'].split(';')]
        self.discriminative_cols = kwargs.get('discriminative_columns', None)
        if self.discriminative_cols: self.discriminative_cols = self.discriminative_cols.strip().split(';')
#paths
        self.sku_path = kwargs['sku_path']
        self.autoencoder_path = './models/autoencoder.pkl'
        self.encoder_path = './models/encoder.pkl'
        self.decoder_path = './models/decoder.pkl'
        self.classifier_path = './models/classifier.pkl'
        self.kmeans_path = './models/kmeans_model.pkl'
        self.embedder_path = './models/embedder.pkl'
        self.config_path = './models/clusterer_config.pkl'
#other params
        self.full_dataset = kwargs.get('full_dataset', False)
        self.cold_start = True if kwargs['cold_start'] == 'True' else False
        self.encoding = kwargs.get('encoding', 'utf8')
        self._load_datasets = self._load_datasets_full if self.full_dataset == 'True' else self._load_datasets_partial

        if not self.cold_start:
            self.load_configuration()

    def filter_dataset(self, df):
        chosen_cols = []
        for c in self.discriminative_cols:
            if c not in df.columns:
                print(f'invalid column name: `{c}`, omitting...', verbosity=1)
            else:
                chosen_cols.append(c)
        self.discriminative_cols = chosen_cols
        if self.discriminative_cols != []:
            print(f'RUNNING FILTERING on columns:{", ".join(self.discriminative_cols)}')
            df = df.filter(items=self.discriminative_cols)
        else:
            print('No discriminative columns passed, running algorithm on all columns')
        return df
        
    def _load_datasets_partial(self):
        datasets = []
        for file in os.listdir(self.sku_path):
            df = pd.read_csv(os.path.join(self.sku_path, file),
                                               encoding=self.encoding,
                                               sep=';')
            df = self.filter_dataset(df)
            n_splits = df.shape[0] // self.n_steps
            trim = df.shape[0] % self.n_steps
            df = df[trim:]
            for split_idx in range(n_splits):
                chunk = df[split_idx * self.n_steps : (split_idx + 1) * self.n_steps]
                datasets.append(chunk.values)
        return np.array(datasets, dtype=np.float64)
    
    def _load_datasets_full(self):
        datasets = []
        for file in os.listdir(self.sku_path):
            df = pd.read_csv(os.path.join(self.sku_path, file),
                                               encoding=self.encoding,
                                               sep=';')
            df = self.filter_dataset(df)
            for offset in range(df.shape[0] - self.n_steps):
                chunk = df[offset : offset + self.n_steps]
                datasets.append(chunk.values)
        return np.array(datasets, dtype=np.float64)
        
    def load_configuration(self):
        if not os.path.exists(self.config_path):
            print('Config file not found...', verbosity=1)
            return
        config = open(self.config_path, "rb")
        self.clusters_indices = load(config)
        self.n_clusters = load(config)
        self.use_kmeans = load(config)
        self.train_dataset = load(config)
        config.close()
        
    def save_configuration(self):
        config = open(self.config_path, "wb")
        dump(self.clusters_indices, config)
        dump(self.n_clusters, config)
        dump(self.use_kmeans, config)
        dump(self.train_dataset, config)
        config.close()


    def load_models(self, cold_start=False):
        models_exists = os.path.isfile(self.autoencoder_path) \
                        and os.path.isfile(self.encoder_path) \
                        and os.path.isfile(self.decoder_path)
        classifier_exists = os.path.isfile(self.classifier_path)
        kmeans_exists = os.path.isfile(self.kmeans_path)
        embedder_exists = os.path.isfile(self.embedder_path)
        if not (models_exists and classifier_exists):
            print('NO MODELS FOUND, COLD START REQUIRED...', verbosity=1)
        if not cold_start and models_exists:
            print('AUTOENCODER MODELS EXISTS, LOADING...')
            self.autoenc = load_model(self.autoencoder_path)
            self.enc = load_model(self.encoder_path)
            self.dec = load_model(self.decoder_path)
        if not cold_start and classifier_exists:
            print('CLASSIFIER MODEL EXISTS, LOADING...')
            with open(self.classifier_path, 'rb') as model_file:
                self.classifier = load(model_file)
        if not cold_start and kmeans_exists:
            print('K_MEANS MODEL EXISTS, LOADING...')
            with open(self.kmeans_path, 'rb') as model_file:
                self.k_means_classifier = load(model_file)
        if not cold_start and embedder_exists:
            with open(self.embedder_path, 'rb') as model_file:
                self.embedder = load(model_file)
        return models_exists and classifier_exists and embedder_exists and not cold_start

    def train(self, dataset=None):
        if dataset is None:
            dataset = self._load_datasets()
        n_features = dataset.shape[-1]
        if not self.load_models(self.cold_start):
            #Talos scan
            params = {
                        'n_steps':[self.n_steps],
                        'n_features':[n_features],
                        'epochs':self.n_epochs,
                        'enc_units':self.encoder_output_units,
                        'dec_units':self.decoder_output_units,
                        'batch_size':self.batch_size,
                        'early_stopping':self.early_stopping,
                        'scan':[True]
                    }
            results = talos.Scan(dataset, np.zeros_like(dataset), params=params, model=create_autoencoder_models)
            best_params = results.data.sort_values(by=['val_loss'], ascending=True).iloc[0].to_dict()
            best_params['scan'] = False
            print('\n', '='*30,
                  '\nBEST AUTOENCODER HYPERPARAMETERS:\n', 
                  '\n'.join([f'{key} = {value}' for key,value in best_params.items()]),
                  '\n',
                  '='*30)
            self.autoenc, self.enc, self.dec = create_autoencoder_models(dataset, np.zeros_like(dataset), params=best_params)
            hist = self.autoenc.history.history
            loss = hist['loss']
            val_loss = hist['val_loss']
            plt.figure(figsize=(10, 7))
            plt.plot(loss, label='training_loss')
            plt.plot(val_loss, label='validation_loss')
            plt.legend()
            plt.title('Autoencoder loss')
            plt.savefig('./loss/autoencoder_loss.png')
            self.train_dataset = dataset
            classifier_inputs = self.enc.predict(dataset)
            self.embedder = TSNE(n_components=2, perplexity=40, random_state=42)
            embedded = self.embedder.fit_transform(classifier_inputs)
            
            if not self.use_kmeans:
                print('CLUSTER COUNT NOT SPECIFIED, CALCULATING CLUSTER NUMBER...', verbosity=1)
                self.u_classifier = DBSCAN(eps=3, n_jobs=-1)
                classes = self.u_classifier.fit_predict(embedded)
                self.n_clusters = len(set(classes)) 
                self.use_kmeans = True
            self.k_means_classifier = TimeSeriesKMeans(n_clusters=self.n_clusters, 
                                           metric=self.k_means_metric, 
                                           n_init=self.kmeans_iterations,
                                           verbose=True,
                                           max_iter=1000)
            self.k_means_classifier.fit(embedded)
            self.k_means_classifier.transform = self.k_means_classifier.predict #hotfix
            self.clusters_indices = self.k_means_classifier.fit_predict(embedded)
            
            self.classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
            self.classifier.fit(embedded, self.clusters_indices)
            
            with open(self.classifier_path, 'wb') as model_file:
                dump(self.classifier, model_file)
            with open(self.embedder_path, 'wb') as model_file:
                dump(self.embedder, model_file)
            with open(self.kmeans_path, 'wb') as model_file:
                dump(self.k_means_classifier, model_file)
            
            self.save_configuration()

# =============================================================================
# Cluster visualisation
# =============================================================================
            clusters = self.k_means_classifier.transform(embedded)
            unique_clusters = set(clusters)
            plt.figure()
            for clas in unique_clusters:
                c = generate_color()
                mask = clusters == clas
                filtered = embedded[mask]
                plt.scatter(filtered[:, 0], filtered[:, 1], c=c, label=f'cluster {clas + 1}')
            plt.legend()
            plt.savefig('./clusters/clusters.png')


    def embed(self, dataset):
        flattened = self.enc.predict(dataset)
        embedded = self.embedder.fit_transform(flattened)
        return embedded
        
    def predict(self, sample):
        result = self.enc.predict(sample)
        return result
    
    def predict_class(self, sample, plot_cluster=False):
        extended_dataset = np.vstack(( self.train_dataset, sample.reshape(-1, *sample.shape) ))
        embedded_space = self.embed(extended_dataset)
        sample_coords = embedded_space[-1]
        nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(embedded_space[:-1])
        distances, indices = nbrs.kneighbors(sample_coords.reshape(1, -1))    
        n_classes, classes_counts = np.unique(self.clusters_indices[indices], return_counts=True)
        cls = n_classes[np.argmax(classes_counts)]  # majority class among neighbours
        print(distances)
        print(indices)
        print(self.clusters_indices[indices])
        print(cls)
        if plot_cluster:
            plt.figure()
            plt.scatter(embedded_space[:,0], embedded_space[:,1])
            plt.scatter(sample_coords[0], sample_coords[1], marker='x', c='red')
        return cls, distances, indices
    
    def compress_dataset(self, dataset):
        return self.enc.predict(dataset)
    
    def cluster(self, dataset, sample=None, plot_clusters=False):
        if sample is not None:
            dataset = np.vstack((sample, dataset))
        compressed_dataset = self.compress_dataset(dataset)
        embedded_dataset = self.embedder.fit_transform(compressed_dataset)
        classes = self.k_means_classifier.fit_predict(embedded_dataset)
        
        if plot_clusters:
            plt.figure()
            unique_clusters = set(classes)
            for clas in unique_clusters:
                c = generate_color()
                mask = classes == clas
                filtered = embedded_dataset[mask]
                plt.scatter(filtered[:, 0], filtered[:, 1], c=c, label=f'cluster {clas + 1}')
            if sample is not None:
                plt.scatter(embedded_dataset[0, 0], embedded_dataset[0, 1], c='red', marker='x')
            plt.legend()
            
        return dataset, classes
Example #20

no_clust = 10
t_series = to_time_series(my_array)
kmeans = TimeSeriesKMeans(n_clusters=no_clust,
                          metric="euclidean",
                          max_iter=8,
                          random_state=0)
kmeans.fit(t_series)
print("The cluster centers are:", kmeans.cluster_centers_)
print("Each time series belongs to:", kmeans.labels_)
labels = kmeans.labels_

y_kmeans = kmeans.predict(t_series)
plt.scatter(t_series[:, 0, 1], [2 for _ in range(length)],
            c=y_kmeans,
            s=30,
            cmap='viridis')
plt.scatter(t_series[:, 182, 1], [1.5 for _ in range(length)],
            c=y_kmeans,
            s=30,
            cmap='viridis')
plt.scatter(t_series[:, 364, 1], [1 for _ in range(length)],
            c=y_kmeans,
            s=30,
            cmap='viridis')
Example #21
def k_means_clustering(sd_log):
    """
    k_means clustering of all features using dtw for multivariate time series
    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as key and features as values
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    tmp = sd_log.waiting_time
    data.drop(columns=[sd_log.waiting_time], inplace=True)
    X = []
    # Get data as numpy array
    for col in data.columns:
        X.append(sd_log.get_points(col))

    # Normalize the data (y = (x - min) / (max - min))
    data_norm = data.copy()
    for column in data_norm.columns:
        data_norm[column] = (data_norm[column] - data_norm[column].min()) / (
            data_norm[column].max() - data_norm[column].min())

    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    #  Find optimal # clusters by
    #  looping through different configurations for # of clusters and store the respective values for silhouette:
    sil_scores = {}
    for n in range(2, len(data.columns)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X,
                                          model_tst.predict(X),
                                          metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])

    # make some helper dictionaries and lists
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_dropped = [
        cluster for cluster in cluster_len_dict
        if cluster_len_dict[cluster] == 1
    ]
    clusters_final = [
        cluster for cluster in cluster_len_dict
        if cluster_len_dict[cluster] > 1
    ]

    print('Plotting Clusters')

    fig, axs = plt.subplots(opt_k)  # , figsize=(10, 5))
    # fig.suptitle('Clusters')
    row_i = 0
    # column_j = 0
    # For each label there is,
    # plots every series with that label
    for cluster in cluster_metrics_dict:
        for feat in cluster_metrics_dict[cluster]:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(cluster_metrics_dict[cluster]) > 100:
            # TODO draw mean in red if more than one cluster
            tmp = np.nanmean(np.vstack(cluster), axis=1)
            axs[row_i].plot(tmp, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))
        row_i += 1
        # column_j += 1
        # if column_j % k == 0:
        #    row_i += 1
        #    column_j = 0
    plt.show()

    # return dict {cluster_id: features}
    return cluster_metrics_dict
Example #22
        D1, D2, A2 = DWT_db2(data)
        DWTed_test.append(D1); DWTed_test.append(D2); DWTed_test.append(A2)
        test.append(data[0:1024].to_numpy().reshape(-1, 1))
        test.append(data[1024:2048].to_numpy().reshape(-1, 1))
        test.append(data[2048:3072].to_numpy().reshape(-1, 1))
        test.append(data[3072:4096].to_numpy().reshape(-1, 1))
#DWTed_test = random.sample(DWTed_test, len(DWTed_test))
#test = random.sample(test, len(test))

"""EEG signals classification using the K-means clustering and a multilayer
perceptron neural network model (Umut Orhan 2011)
"""

#K-means clustering:
model = TimeSeriesKMeans(n_clusters=2, metric="softdtw", max_iter = 5)
model.fit(np.array(train))

pred = model.predict(np.array(test))
pred

a = np.zeros((320,), dtype=int)
b = np.ones((80,), dtype=int)
true = np.concatenate([a, b])

confusion_matrix(true, pred)

centers = model.cluster_centers_
centers = np.array([centers[0].flatten(), centers[1].flatten()])
centers

plt.plot(centers[0], color = 'red')
Example #23
        y_train = data_train[:, 0].astype(int)
        X_test = to_time_series_dataset(data_test[:, 1:])
        y_test = data_test[:, 0].astype(int)

        X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.) \
            .fit_transform(X_train)
        X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.) \
            .fit_transform(X_test)
        classes = len(np.unique(data_train[:, 0]))
        km = TimeSeriesKMeans(n_clusters=5,
                              max_iter=10,
                              n_init=10,
                              metric="euclidean",
                              verbose=0,
                              random_state=2019)
        km.fit(X_train)

        print(i, file=f)
        preds = km.predict(X_train)
        ars = adjusted_rand_score(data_train[:, 0], preds)
        print("Adjusted Rand Index on Training Set:", ars, file=f)
        kMeansDF.loc[i, "Train ARS"] = ars

        preds_test = km.predict(X_test)
        ars = adjusted_rand_score(data_test[:, 0], preds_test)
        print("Adjusted Rand Index on Test Set:", ars, file=f)
        kMeansDF.loc[i, "Test ARS"] = ars
        print(file=f)
    kMeansTime = timer.elapsed_time()

    print("Time to Run k-Means Experiment in Minutes:",
Example #24
def time_clustering(state_df,
                    day_0,
                    days_before=30,
                    date_col='date',
                    region_col='county',
                    target_col='cases'):
    """
    input:
    
    state_df: pandas dataframe that contains the data
    day_0: int first day of prediction
    days_before: how many days before day_0, you will use data for time clustering
    
    output:
    
    clusters: list of lists of clusters
    
    """

    cluster_state_df = state_df.copy()
    cluster_state_df['GrowthRate'] = (
        state_df.groupby(region_col)[target_col].shift(0) /
        state_df.groupby(region_col)[target_col].shift(1) - 1)

    cluster_state_df = get_in_date_range(cluster_state_df,
                                         first_date=mod_date(
                                             day_0, -days_before),
                                         last_date=mod_date(day_0, 0),
                                         date_col=date_col)

    cluster_state_df = cluster_state_df.loc[:,
                                            cluster_state_df.columns.
                                            intersection([
                                                region_col, date_col,
                                                'GrowthRate'
                                            ])]

    cluster_state_df = cluster_state_df[
        ~cluster_state_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)].copy(
            deep=True)

    time_series = cluster_state_df.groupby(region_col)['GrowthRate'].apply(
        list)
    time_series_list = to_time_series_dataset([t for t in time_series])

    regions = cluster_state_df[region_col].unique().tolist()
    # number_of_regions = 4

    # inertias = []
    #
    # for k in range(1,number_of_regions,1):
    #     print("k is: ", k)
    #     model = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=100, dtw_inertia=True, n_jobs=-1)
    #     model.fit(time_series_list)
    #     inertias.append(model.inertia_)
    #
    #
    # kn = KneeLocator(range(1,number_of_regions,1), inertias, curve='convex', direction='decreasing')
    #
    # print("Optimal value of clusters is: ", kn.knee)
    #
    # plt.xlabel('number of clusters k')
    # plt.ylabel('Sum of squared distances')
    # plt.plot(range(1,number_of_regions,1), inertias, 'bx-')
    # plt.vlines(kn.knee, plt.ylim()[0], plt.ylim()[1], linestyles='dashed')

    model = TimeSeriesKMeans(n_clusters=2,
                             metric="dtw",
                             max_iter=100,
                             dtw_inertia=True,
                             n_jobs=-1)
    model.fit(time_series_list)

    clusters = [[] for _ in range(0, 2, 1)]
    for i in range(len(model.labels_)):
        clusters[model.labels_[i]].append(regions[i])

    return clusters
Example #25
import numpy as np
from sklearn.metrics import adjusted_rand_score, accuracy_score, adjusted_mutual_info_score

nameDataset = "Coffee"
trainFeatDataset = 0.2
testPath = "./" + nameDataset + "/" + nameDataset + ".tsv"
listOut, series, listOfClass, listForDTW = util.adaptTimeSeries(testPath)
seedTS = util.extractFeature(listOut, series, trainFeatDataset)
print("Class Found: " + str(len(seedTS.keys())))
centroid = util.getCentroid(seedTS)
X_train = util.castTimeSeries(listOut)
centroid = util.castTimeSeries(centroid)

listCentr = []
for clust in centroid:
    listCentr.append(clust)

X = np.array(listCentr, np.float64)
model = TimeSeriesKMeans(n_clusters=len(seedTS.keys()),
                         metric="dtw",
                         max_iter=10,
                         init=X)
model.fit(X_train)
groundTruth = [int(i) for i in list(series)]
print("Labels Discovered")
print(list(model.labels_))
print("Original Labels")
print(groundTruth)
print("Adjusted Rand Index")
print(adjusted_rand_score(model.labels_, groundTruth))
Example #26
def tsKMeans_simple(X, n_cluster, random_state):
    model = TimeSeriesKMeans(n_clusters=n_cluster, tol=1e-05,
                             metric='euclidean', random_state=random_state)
    fitted_model = model.fit(X)
    y_pred = fitted_model.predict(X)
    return y_pred
Example #27
def run_clustering_methods(
    data,
    n_clusters,
    path_fig,
    path_out,
    hist_plot,
    cluster_plot,
):
    "run clustering method on temporal distance files, and output cluster labels and a few diagnostic plots"

    model = TimeSeriesKMeans(n_clusters=n_clusters,
                             metric="dtw",
                             random_state=seed)  # `seed` assumed defined at module level

    model.fit(data)

    os.chdir(path_fig)

    ax = sns.histplot(data=model.labels_, kde=True, discrete=True)

    ax.set(xlabel='DTW K-means clusters={}'.format(str(n_clusters)))

    plt.savefig("hist-" + hist_plot + 'cluster_n-' + str(n_clusters) + ".svg",
                transparent=True,
                dpi=1200)

    plt.close("all")

    plt.figure()
    sz = data.shape[1]
    for cluster_id in range(0, max(model.labels_ + 1)):

        idx = model.labels_ == cluster_id

        data_clustered = data[np.array(idx), ]

        plt.subplot(3, 3, cluster_id + 1)
        for xx in data_clustered:
            plt.plot(xx.ravel(), "k-", alpha=.2)

        plt.plot(savgol_filter(model.cluster_centers_[cluster_id].ravel(), 7,
                               2),
                 "r-",
                 linewidth=2.5)
        plt.xlim(0, sz)
        plt.ylim(0, 1.2)
        plt.text(0.55,
                 0.85,
                 'Cluster %d' % (cluster_id),
                 transform=plt.gca().transAxes)

    plt.tight_layout()

    plt.savefig('nclus' + str(n_clusters) + cluster_plot + '.svg')

    plt.close("all")

    os.chdir(path_out)

    np.save('labels_nclus_' + str(n_clusters), model.labels_)

    return model.labels_