# get Unix epoch time
dt['UnixTime'] = dt.index.astype(np.int64) // 10**9

dt = dt.fillna(0)

evalu = []

for k in range(10):

    km = TimeSeriesKMeans(n_clusters=k + 2,
                          verbose=True,
                          random_state=23,
                          metric="dtw")

    Y = km.fit_predict(dt.T)

    evalu.append(silhouette_score(dt.T, Y, metric="dtw"))

# 6 clusters is best

km = TimeSeriesKMeans(n_clusters=7,
                      verbose=True,
                      random_state=23,
                      metric="dtw")

Y = km.fit_predict(dt.T)

c1 = np.where(Y == 0)[0].tolist()
c2 = np.where(Y == 1)[0].tolist()
c3 = np.where(Y == 2)[0].tolist()
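# A minimal sketch (not in the original): instead of hard-coding the cluster
# count above, it can be read off the silhouette scores collected in `evalu`;
# the loop starts at n_clusters = 2, hence the "+ 2" offset.
best_k = int(np.argmax(evalu)) + 2
print("best number of clusters by silhouette:", best_k)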
Example 2
seed = 0
numpy.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
# Keep only 50 time series
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
# Make time series shorter
X_train = TimeSeriesResampler(sz=40).fit_transform(X_train)
sz = X_train.shape[1]

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=3, verbose=True, random_state=seed)
y_pred = km.fit_predict(X_train)

plt.figure()
for yi in range(3):
    plt.subplot(3, 3, yi + 1)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.text(0.55,
             0.85,
             'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("Euclidean $k$-means")
Example 3
class SKU_Clusterer:
    def __init__(self, *args, **kwargs):
        # clustering params
        self.classifier=None
        self.clusters_indices = {}
        self.n_clusters = int(kwargs['n_clusters'])
        self.use_kmeans = self.n_clusters > 0
        self.kmeans_iterations = int(kwargs['k_means_n_iterations'])
        self.k_means_metric = kwargs.get('k_means_metric', 'euclidean')
        if self.k_means_metric not in ['dtw', 'euclidean', 'softdtw']:
            print('invalid k_means metric, setting to `euclidean`', verbosity=1)
            self.k_means_metric = 'euclidean'
        # RNN params
        self.n_epochs = [int(p) for p in (kwargs['rnn_epochs'].split(';'))]
        self.n_steps = [int(p) for p in (kwargs['n_steps']).split(';')][0]
        self.encoder_output_units = [int(p) for p in kwargs['encoder_output_units'].split(';')]
        self.decoder_output_units = [int(p) for p in kwargs['decoder_output_units'].split(';')]
        self.batch_size = [int(p) for p in kwargs['batch_size'].split(';')]
        self.early_stopping = [kwargs['early_stopping'].split(';')]
        self.discriminative_cols = kwargs.get('discriminative_columns', None)
        if self.discriminative_cols: self.discriminative_cols = self.discriminative_cols.strip().split(';')
        # paths
        self.sku_path = kwargs['sku_path']
        self.autoencoder_path = './models/autoencoder.pkl'
        self.encoder_path = './models/encoder.pkl'
        self.decoder_path = './models/decoder.pkl'
        self.classifier_path = './models/classifier.pkl'
        self.kmeans_path = './models/kmeans_model.pkl'
        self.embedder_path = './models/embedder.pkl'
        self.config_path = './models/clusterer_config.pkl'
        # other params
        self.full_dataset = kwargs.get('full_dataset', False)
        self.cold_start = kwargs['cold_start'] == 'True'
        self.encoding = kwargs.get('encoding', 'utf8')
        self._load_datasets = self._load_datasets_full if self.full_dataset == 'True' else self._load_datasets_partial

        if not self.cold_start:
            self.load_configuration()

    def filter_dataset(self, df):
        if not self.discriminative_cols:
            print('No discriminative columns passed, running algorithm on all columns')
            return df
        chosen_cols = []
        for c in self.discriminative_cols:
            if c not in df.columns:
                print(f'invalid column name: `{c}`, omitting...', verbosity=1)
            else:
                chosen_cols.append(c)
        self.discriminative_cols = chosen_cols
        if self.discriminative_cols:
            print(f'RUNNING FILTERING on columns: {", ".join(self.discriminative_cols)}')
            df = df.filter(items=self.discriminative_cols)
        else:
            print('No valid discriminative columns found, running algorithm on all columns')
        return df
        
    def _load_datasets_partial(self):
        datasets = []
        for file in os.listdir(self.sku_path):
            df = pd.read_csv(os.path.join(self.sku_path, file),
                                               encoding=self.encoding,
                                               sep=';')
            df = self.filter_dataset(df)
            n_splits = df.shape[0] // self.n_steps
            trim = df.shape[0] % self.n_steps
            df = df[trim:]
            for split_idx in range(n_splits):
                chunk = df[split_idx * self.n_steps : (split_idx + 1) * self.n_steps]
                datasets.append(chunk.values)
        return np.array(datasets, dtype=np.float64)
    
    def _load_datasets_full(self):
        datasets = []
        for file in os.listdir(self.sku_path):
            df = pd.read_csv(os.path.join(self.sku_path, file),
                                               encoding=self.encoding,
                                               sep=';')
            df = self.filter_dataset(df)
            for offset in range(df.shape[0] - self.n_steps):
                chunk = df[offset : offset + self.n_steps]
                datasets.append(chunk.values)
        return np.array(datasets, dtype=np.float64)
        
    def load_configuration(self):
        if not os.path.exists(self.config_path):
            print('Config file not found...', verbosity=1)
            return
        config = open(self.config_path, "rb")
        self.clusters_indices = load(config)
        self.n_clusters = load(config)
        self.use_kmeans = load(config)
        self.train_dataset = load(config)
        config.close()
        
    def save_configuration(self):
        config = open(self.config_path, "wb")
        dump(self.clusters_indices, config)
        dump(self.n_clusters, config)
        dump(self.use_kmeans, config)
        dump(self.train_dataset, config)
        config.close()


    def load_models(self, cold_start=False):
        models_exists = os.path.isfile(self.autoencoder_path) \
                        and os.path.isfile(self.encoder_path) \
                        and os.path.isfile(self.decoder_path)
        classifier_exists = os.path.isfile(self.classifier_path)
        kmeans_exists = os.path.isfile(self.kmeans_path)
        embedder_exists = os.path.isfile(self.embedder_path)
        if not (models_exists and classifier_exists):
            print('NO MODELS FOUND, COLD START REQUIRED...', verbosity=1)
        if not cold_start and models_exists:
            print('AUTOENCODER MODELS EXISTS, LOADING...')
            self.autoenc = load_model(self.autoencoder_path)
            self.enc = load_model(self.encoder_path)
            self.dec = load_model(self.decoder_path)
        if not cold_start and classifier_exists:
            print('CLASSIFIER MODEL EXISTS, LOADING...')
            with open(self.classifier_path, 'rb') as model_file:
                self.classifier = load(model_file)
        if not cold_start and kmeans_exists:
            print('K_MEANS MODEL EXISTS, LOADING...')
            with open(self.kmeans_path, 'rb') as model_file:
                self.k_means_classifier = load(model_file)
        if not cold_start and embedder_exists:
            with open(self.embedder_path, 'rb') as model_file:
                self.embedder = load(model_file)
        return models_exists and classifier_exists and embedder_exists and not cold_start

    def train(self, dataset=None):
        if dataset is None:
            dataset = self._load_datasets()
        n_features = dataset.shape[-1]
        if not self.load_models(self.cold_start):
            #Talos scan
            params = {
                        'n_steps':[self.n_steps],
                        'n_features':[n_features],
                        'epochs':self.n_epochs,
                        'enc_units':self.encoder_output_units,
                        'dec_units':self.decoder_output_units,
                        'batch_size':self.batch_size,
                        'early_stopping':self.early_stopping,
                        'scan':[True]
                    }
            results = talos.Scan(dataset, np.zeros_like(dataset), params=params, model=create_autoencoder_models)
            best_params = results.data.sort_values(by=['val_loss'], ascending=True).iloc[0].to_dict()
            best_params['scan'] = False
            print('\n', '='*30,
                  '\nBEST AUTOENCODER HYPERPARAMETERS:\n', 
                  '\n'.join([f'{key} = {value}' for key,value in best_params.items()]),
                  '\n',
                  '='*30)
            self.autoenc, self.enc, self.dec = create_autoencoder_models(dataset, np.zeros_like(dataset), params=best_params)
            hist = self.autoenc.history.history
            loss = hist['loss']
            val_loss = hist['val_loss']
            plt.figure(figsize=(10, 7))
            plt.plot(loss, label='training_loss')
            plt.plot(val_loss, label='validation_loss')
            plt.legend()
            plt.title('Autoencoder loss')
            plt.savefig('./loss/autoencoder_loss.png')
            self.train_dataset = dataset
            classifier_inputs = self.enc.predict(dataset)
            self.embedder = TSNE(n_components=2, perplexity=40, random_state=42)
            embedded = self.embedder.fit_transform(classifier_inputs)
            
            if not self.use_kmeans:
                print('CLUSTER COUNT NOT SPECIFIED, CALCULATING CLUSTER NUMBER...', verbosity=1)
                self.u_classifier = DBSCAN(eps=3, n_jobs=-1)
                classes = self.u_classifier.fit_predict(embedded)
                self.n_clusters = len(set(classes)) 
                self.use_kmeans = True
            self.k_means_classifier = TimeSeriesKMeans(n_clusters=self.n_clusters, 
                                           metric=self.k_means_metric, 
                                           n_init=self.kmeans_iterations,
                                           verbose=True,
                                           max_iter=1000)
            self.k_means_classifier.fit(embedded)
            self.k_means_classifier.transform = self.k_means_classifier.predict #hotfix
            self.clusters_indices = self.k_means_classifier.fit_predict(embedded)
            
            self.classifier = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
            self.classifier.fit(embedded, self.clusters_indices)
            
            with open(self.classifier_path, 'wb') as model_file:
                dump(self.classifier, model_file)
            with open(self.embedder_path, 'wb') as model_file:
                dump(self.embedder, model_file)
            with open(self.kmeans_path, 'wb') as model_file:
                dump(self.k_means_classifier, model_file)
            
            self.save_configuration()

# =============================================================================
# Cluster visualisation
# =============================================================================
            clusters = self.k_means_classifier.transform(embedded)
            unique_clusters = set(clusters)
            plt.figure()
            for clas in unique_clusters:
                c = generate_color()
                mask = clusters == clas
                filtered = embedded[mask]
                plt.scatter(filtered[:, 0], filtered[:, 1], c=c, label=f'cluster {clas + 1}')
            plt.legend()
            plt.savefig('./clusters/clusters.png')
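            # Note (an assumption, not in the original): the hard-coded output
            # folders ('./loss', './clusters', './models') must already exist;
            # creating them up front, e.g. os.makedirs('./clusters', exist_ok=True),
            # avoids a FileNotFoundError on the savefig/dump calls above.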


    def embed(self, dataset):
        flattened = self.enc.predict(dataset)
        embedded = self.embedder.fit_transform(flattened)
        return embedded
        
    def predict(self, sample):
        result = self.enc.predict(sample)
        return result
    
    def predict_class(self, sample, plot_cluster=False):
        extended_dataset = np.vstack(( self.train_dataset, sample.reshape(-1, *sample.shape) ))
        embedded_space = self.embed(extended_dataset)
        sample_coords = embedded_space[-1]
        nbrs = NearestNeighbors(n_neighbors=6, algorithm='ball_tree').fit(embedded_space[:-1])
        distances, indices = nbrs.kneighbors(sample_coords.reshape(1, -1))    
        n_classes, classes_counts = np.unique(self.clusters_indices[indices], return_counts = True)
        cls = n_classes[np.argmax(classes_counts)]  # majority class among the neighbors
        print(distances)
        print(indices)
        print(self.clusters_indices[indices])
        print(cls)
        if plot_cluster:
            plt.figure()
            plt.scatter(embedded_space[:,0], embedded_space[:,1])
            plt.scatter(sample_coords[0], sample_coords[1], marker='x', c='red')
        return cls, distances, indices
    
    def compress_dataset(self, dataset):
        return self.enc.predict(dataset)
    
    def cluster(self, dataset, sample=None, plot_clusters=False):
        if sample is not None:
            dataset = np.vstack((sample, dataset))
        compressed_dataset = self.compress_dataset(dataset)
        embedded_dataset = self.embedder.fit_transform(compressed_dataset)
        classes = self.k_means_classifier.fit_predict(embedded_dataset)
        
        if plot_clusters:
            plt.figure()
            unique_clusters = set(classes)
            for clas in unique_clusters:
                c = generate_color()
                mask = classes == clas
                filtered = embedded_dataset[mask]
                plt.scatter(filtered[:, 0], filtered[:, 1], c=c, label=f'cluster {clas + 1}')
            if sample is not None:
                plt.scatter(embedded_dataset[0, 0], embedded_dataset[0, 1], c='red', marker='x')
            plt.legend()
            
        return dataset, classes
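# A hypothetical usage sketch (not in the original): SKU_Clusterer reads all of
# its settings from **kwargs as strings, so a call could look roughly like the
# following; every value and path below is an assumption for illustration only.
clusterer = SKU_Clusterer(n_clusters='0',            # 0 -> estimate the count with DBSCAN
                          k_means_n_iterations='10',
                          k_means_metric='dtw',
                          rnn_epochs='50',
                          n_steps='30',
                          encoder_output_units='64',
                          decoder_output_units='64',
                          batch_size='32',
                          early_stopping='True',
                          sku_path='./data/sku',
                          cold_start='True')
clusterer.train()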
Example 4

def k_means_clustering(sd_log):
    """
    k_means clustering of all features using dtw for multivariate time series
    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as key and features as values
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    tmp = sd_log.waiting_time
    data.drop(columns=[sd_log.waiting_time], inplace=True)
    X = []
    # Get data as numpy array
    for col in data.columns:
        X.append(sd_log.get_points(col))

    # Normalize the data (y = (x - min) / (max - min))
    data_norm = data.copy()
    for column in data_norm.columns:
        data_norm[column] = (data_norm[column] - data_norm[column].min()) / (
            data_norm[column].max() - data_norm[column].min())

    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    #  Find optimal # clusters by
    #  looping through different configurations for # of clusters and store the respective values for silhouette:
    sil_scores = {}
    for n in range(2, len(data.columns)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X,
                                          model_tst.predict(X),
                                          metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])

    # make some helper dictionaries and lists
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(list).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_dropped = [
        cluster for cluster in cluster_len_dict
        if cluster_len_dict[cluster] == 1
    ]
    clusters_final = [
        cluster for cluster in cluster_len_dict
        if cluster_len_dict[cluster] > 1
    ]

    print('Plotting Clusters')

    fig, axs = plt.subplots(opt_k)  # , figsize=(10, 5))
    # fig.suptitle('Clusters')
    row_i = 0
    # column_j = 0
    # For each label there is,
    # plots every series with that label
    for cluster in cluster_metrics_dict:
        for feat in cluster_metrics_dict[cluster]:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(cluster_metrics_dict[cluster]) > 1:
            # draw the mean of the cluster's normalized series in red
            tmp = np.nanmean(
                np.vstack([data_norm[feat].values
                           for feat in cluster_metrics_dict[cluster]]), axis=0)
            axs[row_i].plot(tmp, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))
        row_i += 1
        # column_j += 1
        # if column_j % k == 0:
        #    row_i += 1
        #    column_j = 0
    plt.show()

    # return dict {cluster_id: features}
    return cluster_metrics_dict
Example 5
df_cluster.sort_values(['cluster'])

# Create data frame for customer and its cluster
create_cluster_info(y_pred_ks, cols)

plot_clusters(formatted_norm_dataset, y_pred_ks, clusters, ks,
              'pgn_customer_cluster_{}.jpg'.format(id_unit_usaha))

# Kmeans clustering with DBA-DTW distance metric
clusters = 5
dba_km = TimeSeriesKMeans(n_clusters=clusters,
                          metric="dtw",
                          max_iter_barycenter=20,
                          verbose=False,
                          random_state=seed)
y_pred_dbakm = dba_km.fit_predict(formatted_norm_dataset)

# Create data frame for customer and its cluster
create_cluster_info(y_pred_dbakm, cols)

# Plot cluster
plot_clusters(formatted_norm_dataset, y_pred_dbakm, clusters, dba_km,
              "./plot_custers_KMean_DBA_DTW.jpg")

# engine2 = sqlalchemy.create_engine(
#     'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

# Session = sessionmaker(bind=engine2)
# session = Session()

# Base = declarative_base()
Example 6
def main(argv):
    # define global timer to obtain global execution time
    start_global = timer()
    
    # define globals variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data
    
    #############################################################################################
    # Input arguments parsing
    #############################################################################################
    
    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling) ' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        '  -c / --clusters <number_clusters>  # set number of clusters (default 3) \n\n' \
        '  -i / --ifile <input_file>          # set input filename \n' \
        '  -n / --normalise                   # normalise input data \n' \
        '  -s / --standardise                 # standardise input data \n\n' \
        '  -a / --all                         # perform all 5 implemented methods of clustering: \n' \
        '                                       euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        '  -E / --euclidean                   # perform euclidean k-means clustering \n' \
        '  -D / --dtw                         # perform dtw k-means clustering \n' \
        '  -S / --soft-dtw                    # perform soft-dtw k-means clustering \n' \
        '  -K / --k-shape                     # perform k-shape clustering \n' \
        '  -G / --gak                         # perform GAK k-means clustering \n'
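    # Example invocation (illustrative only; the input file name is an assumption):
    #   python clustering.py -c 4 -i measurements.csv -n -E -D
    # normalises the data and runs Euclidean and DTW k-means with 4 clusters.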
    
    # Create new object to save arguments
    i_args = Arguments()
    
    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3
    
    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help",
                "clusters=",
                "ifile=",
                "normalise",
                "standardise",
                "all",
                "euclidean",
                "dtw",
                "soft-dtw",
                "k-shape",
                "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)
    
    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            i_args.number_clusters = int(arg)
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True
    
    # normalise maximum number of subplots levels
    n_rows_plot = min(n_rows_plot, 8)
    
    #############################################################################################
    # Raw data processing stage
    #############################################################################################
    
    # set style to matplotlib plot
    mpl.style.use('seaborn')
    
    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)
    
    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())
    
    # convert raw data to the format used by tslearn (a 3-dimensional array);
    # built-in functionality adjusts all time series to one length
    # (NaN values are appended to the shorter ones)
    formatted_data = to_time_series_dataset(raw_data)
    
    # print shape of new array
    print(formatted_data.shape)
    
    # obtain the number of measurements
    n_measuring = formatted_data.shape[1]
    
    # define figure, grid_spec to create layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(
        n_rows_plot,
        i_args.number_clusters
    )
    
    # set A4 size to figure
    fig.set_size_inches(8.5, 11.75)
    
    # setup count of layers of subplots
    count_layer = 3
    # setup first subplot and draw raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])
    
    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)
    
    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()
    # draw title for chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)'
                            % (formatted_data_min, formatted_data_max))

    # obtain and print the execution time of the data processing stage
    timer_tick = get_time_tick(start_global)
    plt.ion()
    plt.show()
    
    print("Raw data processing time: %s" % timer_tick)
    
    #############################################################################################
    # Data preprocessing stage
    #############################################################################################
    
    start = timer()
    
    # Convert NaNs to value predicted by interpolation
    # linearly interpolate for NaN/NaNs
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)
    
    # Scaling
    # to decide between normalization and standardization, we need to look at
    # the distribution of values.
    
    # randomly pick `number_clusters` measurements to draw histograms
    random_indexes = numpy.random.choice(n_measuring, i_args.number_clusters, replace=False)
    
    # create new arrays with values of randomly chosen measurements
    histogram_data = formatted_data[:, random_indexes]
    
    # draw histograms
    for i_histogram in range(i_args.number_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram],
            bins=25, density=True
        )
        
        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy"
                            )
        if i_histogram == 1:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"

            preprocessing = '' if preprocessing == '' else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy', y=1, pad=14
            )
    
    # if no processing data option chosen continue with raw data
    processed_data = formatted_data
    
    # since the distributions for this particular dataset are roughly
    # Gaussian, standardization is a reasonable choice
    
    # normalize data: Min-Max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")
    
    # standardize data: scaling technique where the values are centered around
    # the mean with a unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")
    
    # obtain max value of data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2
    
    timer_tick = get_time_tick(start)
    print("#############################################################################################")
    print("Data processing stage elapsed time: %s" % timer_tick)
    
    #############################################################################################
    # Implementing Euclidean k-means clustering algorithm
    #############################################################################################
    
    if i_args.euclidean_clustering:
        
        start = timer()
        print("Euclidean k-means")
        
        # define parameters of the model of the algorithm
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=i_args.number_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )
        
        # calculate cluster's label array
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)
        
        # draw subplots with attributed clusters of time series as well as
        # cluster centers' lines
        for i_cluster in range(i_args.number_clusters):
            f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                n_measuring, min_data, max_data,
                                                processed_data, euclidean_clustered_data, 'tab:blue')
            
            f_ax_euclidean.plot(
                k_means_euclidean.cluster_centers_[i_cluster].ravel(),
                "tab:green"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_euclidean
        
        # increment count of filled layer of subplots
        count_layer += 1
        
        # obtain processing time, print it to console and
        # add it to the title of the series of subplots
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Euclidean $k$-means (%s)" % timer_tick,
            color='tab:green', y=1, pad=14
        )
        print("#############################################################################################")
        print("Euclidean k-means time processing: %s" % timer_tick)
        
    #############################################################################################
    # Implementing DTW k-means clustering algorithm
    # use dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################
    
    if i_args.dtw_clustering:
        
        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6
                                       )
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                          n_measuring, min_data, max_data,
                                          processed_data, dtw_clustered_data, 'tab:blue')
            
            f_ax_dtw.plot(
                k_means_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:red"
            )
            if i_cluster == 1:
                middle_axis = f_ax_dtw

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "DTW $k$-means (%s)" % timer_tick,
            color='tab:red', y=1, pad=14
        )
        print("#############################################################################################")
        print("DTW k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing soft DTW k-means clustering algorithm
    # use soft dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################
    
    if i_args.soft_dtw_clustering:
        
        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6
                                            )
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                               n_measuring, min_data, max_data,
                                               processed_data, soft_dtw_clustered_data, 'tab:blue')
            
            f_ax_soft_dtw.plot(
                k_means_soft_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:purple"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_soft_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Soft-DTW $k$-means (%s)" % timer_tick,
            color='tab:purple', y=1, pad=14
        )
        print("#############################################################################################")
        print("Soft-DTW k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing k-Shape clustering algorithm
    #############################################################################################
    
    if i_args.k_shape_clustering:
        
        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=i_args.number_clusters,
                         verbose=True,
                         random_state=seed
                         )
        k_shape_clustered_data = k_shape.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            
            min_axe_value = min(min_data, k_shape.cluster_centers_[i_cluster].ravel().min())
            max_axe_value = max(max_data, k_shape.cluster_centers_[i_cluster].ravel().max())
            
            f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                              n_measuring, min_axe_value, max_axe_value,
                                              processed_data, k_shape_clustered_data, 'tab:blue')
            
            f_ax_k_shape.plot(
                k_shape.cluster_centers_[i_cluster].ravel(),
                "tab:orange"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_k_shape

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "$K$-Shape (%s)" % timer_tick,
            color='tab:orange', y=1, pad=14
        )
        print("#############################################################################################")
        print("K-Shape time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing Global Alignment kernel k-means clustering algorithm
    # since kernel is used, there is no centroid of the cluster
    #############################################################################################
    
    if i_args.gak_clustering:
        
        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6
                                   )
        
        gak_clustered_data = gak_k_means.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                  n_measuring, min_data, max_data,
                                                  processed_data, gak_clustered_data, 'tab:blue')
            
            if i_cluster == 1:
                middle_axis = f_ax_gak_k_means

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Global Alignment kernel $k$-means (%s)" % timer_tick,
            color='tab:cyan', y=1, pad=14)
        print("#############################################################################################")
        print("GAK k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    
    # current datetime as a string (used to name the output directory)
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    # define the name of the directory to be created
    path = "./out/%s" % now

    print("#############################################################################################")
    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)
    
    try:
        # save figure as pdf to out folder
        fig.savefig("./out/%s/visual_result.pdf" % now)
    
        # save clustering results
        if i_args.euclidean_clustering:
            numpy.savetxt(
                "./out/%s/euclidean_clustering_result.csv" % now,
                euclidean_clustered_data,
                delimiter=","
            )
        if i_args.dtw_clustering:
            numpy.savetxt(
                "./out/%s/dtw_clustering_result.csv" % now,
                dtw_clustered_data,
                delimiter=","
            )
        if i_args.soft_dtw_clustering:
            numpy.savetxt(
                "./out/%s/soft_dtw_clustering_result.csv" % now,
                soft_dtw_clustered_data,
                delimiter=","
            )
        if i_args.k_shape_clustering:
            numpy.savetxt(
                "./out/%s/k_shape_clustering_result.csv" % now,
                k_shape_clustered_data,
                delimiter=","
            )
        if i_args.gak_clustering:
            numpy.savetxt(
                "./out/%s/gak_clustering_result.csv" % now,
                gak_clustered_data,
                delimiter=","
            )
    except RuntimeError:
        print("Saving results failed")
    else:
        print("Successfully saved results in the path %s " % path)

    #############################################################################################
    
    # obtain and print global executing time
    timer_tick = get_time_tick(start_global)
    print("#############################################################################################")
    print("All algorithms elapsed time: % s" % timer_tick)
    
    #############################################################################################

    # render and show plot
    # plt.show()
    plt.draw()
    plt.pause(0.001)
    input("Press [enter] to finish.")
    print("#############################################################################################")
Example 7
if args.info:
    plt.show(block=args.block)
    sys.exit(1)

# ALSO
# for each of the fgrps, we could take the average or a boxplot of the number of claims
# per month.

# ----

if args.kmeans_algo == 0:
    k_title = "Euclidean $k$-means"
    f_title = "euclidian"
    km = TimeSeriesKMeans(n_clusters=num, verbose=True, random_state=seed)
    y_pred = km.fit_predict(X)
    print(y_pred)
elif args.kmeans_algo == 1:
    k_title = "DBA"
    f_title = "DBA_k_means"
    km = TimeSeriesKMeans(n_clusters=num,
                          n_init=2,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)
    y_pred = km.fit_predict(X)
    print(y_pred)
elif args.kmeans_algo == 2:
    k_title = "Soft-DTW k-means"
    f_title = "soft_DTW"
Example 8
def mass_upload(startDate, endDate, id_unit_usaha):
    print(id_unit_usaha)
    login = ""
    password = ""
    # engine = sqlalchemy.create_engine('mysql+pymysql://energy:energy2x5=10@localhost:3306/pgn')
    engine = sqlalchemy.create_engine(
        'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    sql = " SELECT a.IDREFPELANGGAN, a.ID_UNIT_USAHA, 1 AS FSTREAMID, DATEPART(dw, a.FDATETIME) as FDAYOFWEEK, a.FHOUR, AVG(a.FDVC) as AVG_FDVC\
            FROM(SELECT IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR, SUM(FDVC) as FDVC\
                FROM amr_bridge\
                WHERE FDATETIME >= '" + startDate + "'\
                and FDATETIME < '" + endDate + "'\
                GROUP BY IDREFPELANGGAN, ID_UNIT_USAHA, FDATETIME, FHOUR) a\
            GROUP BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR\
            ORDER BY a.IDREFPELANGGAN, a.ID_UNIT_USAHA, DATEPART(dw, a.FDATETIME), a.FHOUR"

    df = pd.read_sql_query(sql, engine)
    totaldf = len(df)
    totaldf = str(totaldf)
    print('total Data: ' + totaldf)

    # rslt_df = df.loc[df['ID_UNIT_USAHA'] == '014']

    # print(startDate)
    # print('\nResult dataframe :\n', rslt_df)

    # df.to_csv('pgn_customer_cluster_v1_{}.csv'.format(id_unit_usaha), index=False)

    # df.to_hdf("amr_bridge_22122020.hdf", key='hdf5')

    # df = pd.read_hdf("amr_bridge_22122020.hdf")

    def select_data(id_unit):
        query = "ID_UNIT_USAHA == '{}'".format(id_unit_usaha)
        columns = ['FDAYOFWEEK', 'FHOUR', 'IDREFPELANGGAN', 'AVG_FDVC']

        # df = df.set_index('FDATETIME')
        df_selected = df.query(query, engine='python')[columns]
        return df_selected

    def pivot_data(df):
        # df_pivoted = df.pivot(index='FDATETIME', columns='IDREFPELANGGAN', values='FDVC')
        df_pivoted = df.pivot(index=['FDAYOFWEEK', 'FHOUR'],
                              columns='IDREFPELANGGAN',
                              values='AVG_FDVC')
        return df_pivoted

    def remove_zerocolumns(df):
        # Get all columns which have all zero values
        cols = df.columns[df.mean() == 0]
        # Drop columns which has all zero values
        df = df.drop(cols, axis=1)
        return df

    df_week1 = select_data(id_unit_usaha)
    df_week1.fillna(0.0, inplace=True)

    df_pivoted1 = pivot_data(df_week1)
    df_pivoted1.fillna(0.0, inplace=True)

    df_pivoted1 = remove_zerocolumns(df_pivoted1)
    cols = list(df_pivoted1.columns)
    df_pivoted1.head()

    # Function to plot cluster

    # def plot_clusters(ds, y_pred, n_clusters, ks, filename):
    #     plt.figure(figsize=(12, 40))
    #     for yi in range(n_clusters):
    #         plt.subplot(n_clusters, 1, 1 + yi)
    #         for xx in ds[y_pred == yi]:
    #             plt.plot(xx.ravel(), "k-", alpha=.2)
    #         plt.plot(ks.cluster_centers_[yi].ravel(), "r-")
    #         plt.xlim(0, sz)
    #         plt.ylim(-7, 7)
    #         plt.title("Cluster %d" % (yi))

    #     plt.tight_layout()
    #     plt.savefig(filename, format='jpg', dpi=300, quality=95)
    #     plt.show()

    def create_cluster_info(y_pred, cols):

        df_cluster = pd.DataFrame(y_pred.copy(),
                                  index=cols.copy(),
                                  columns=['cluster'])
        df_cluster.reset_index(inplace=True)
        df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True)

        unique_cluster = df_cluster['cluster'].unique()

        # Get ID ref based on cluster
        idrefs_list = []
        for i, x in enumerate(unique_cluster):
            idref_list = df_cluster.query(
                "cluster == {}".format(x))['idrefpelanggan'].values.tolist()
            # idrefs_list[x] = idref_list

            # Create dictionary
            idref_cluster_dict = {'cluster': x, 'idrefpelanggan': idref_list}
            idrefs_list.append(idref_cluster_dict)

        idrefs_cluster = pd.DataFrame(idrefs_list)
        return idrefs_cluster

    # def run_once(startime, totalData, _has_run=[]):
    #     if _has_run:
    #         return
    #     # print("run_once doing stuff")
    #     print(startime)
    #     endtime = time.time_ns()
    #     print(endtime)
    #     invTime = endtime-startime

    #     estTime = invTime * totalData
    #     _has_run.append(1)

    #     print(totalData)
    #     print(estTime)
    #     return estTime

    seed = 0
    np.random.seed(seed)

    # Convert data frame to list of series
    pivoted_series = []
    pivoted_columns = []
    for i, y in enumerate(cols):
        length = len(df_pivoted1[y])
        cst = df_pivoted1[y].values
        pivoted_series.append(cst)
        pivoted_columns.append(y)

    # Convert data set to standard time series format
    formatted_dataset = to_time_series_dataset(pivoted_series)
    print("Data shape: {}".format(formatted_dataset.shape))

    formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
        formatted_dataset)
    sz = formatted_norm_dataset.shape[1]
    print("Data shape: {}".format(sz))

    formatted_norm_dataset = TimeSeriesScalerMeanVariance().fit_transform(
        formatted_dataset)
    clusters = 5
    totalColumn = formatted_norm_dataset.shape[0]
    totalRow = formatted_norm_dataset.shape[1]
    totalData = totalRow * totalColumn + totalRow * clusters

    # ks = KShape(n_clusters=clusters, verbose=True, random_state=seed)
    # y_pred_ks = ks.fit_predict(formatted_norm_dataset)

    dba_km = TimeSeriesKMeans(n_clusters=clusters,
                              metric="dtw",
                              max_iter_barycenter=20,
                              verbose=False,
                              random_state=seed)
    y_pred_dbakm = dba_km.fit_predict(formatted_norm_dataset)

    formatted_norm_dataset.shape
    data = formatted_norm_dataset
    data.shape

    formatted_norm_dataset_2d = formatted_norm_dataset[:, :, 0]
    formatted_norm_dataset_2d.shape
    # pd.DataFrame(A.T.reshape(2, -1), columns=cols)

    df_normalized = pd.DataFrame(formatted_norm_dataset_2d)
    df_normalized
    # df_normalized = df_normalized.pivot()
    # formatted_norm_dataset[0]

    df_cluster = pd.DataFrame(y_pred_dbakm,
                              index=pivoted_columns,
                              columns=['cluster'])
    df_cluster.reset_index(inplace=True)
    df_cluster.rename(columns={'index': 'idrefpelanggan'}, inplace=True)
    df_cluster.sort_values(['cluster'])

    df_normalized_detail = pd.DataFrame.join(df_normalized, df_cluster)
    df_normalized_detail.to_csv("output.csv", index=False)

    # df_cluster.to_csv('pgn_customer_cluster_{}.csv'.format(
    #     id_unit_usaha), index=False)

    # Create data frame for customer and its cluster
    create_cluster_info(y_pred_dbakm, cols)

    # plot_clusters(formatted_norm_dataset, y_pred_ks, clusters, ks,
    #               'pgn_customer_cluster_{}.jpg'.format(id_unit_usaha))

    # engine2 = sqlalchemy.create_engine(
    #     'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    # Session = sessionmaker(bind=engine2)
    # session = Session()

    # Base = declarative_base()

    # class PL_CUSTOMER_CLUSTER(Base):

    #     __tablename__ = 'PL_CUSTOMER_CLUSTER'

    #     ID = Column(Integer, primary_key=True)
    #     DATE_STAMP = Column(DateTime)
    #     IDREFPELANGGAN = Column(String(30))
    #     HOUR_NUM = Column(Integer)
    #     CLUSTER_NUM = Column(Integer)
    #     HOUR_NUM = Column(Integer)
    #     FDVC_NORMALIZED = Column(Float)
    #     AREA_ID = Column(String(5))
    # startime = time.time_ns()
    # for i in range(totalColumn):

    #     idref = df_normalized_detail.iloc[i, totalRow]
    #     cluster = int(df_normalized_detail.iloc[i, totalRow+1])
    #     print("idref = " + idref)
    #     cluster_num = df_normalized_detail.iloc[i, totalRow-1]
    #     for j in range(totalRow):

    #         hour_num = df_normalized_detail.columns[j]
    #         fdvc = df_normalized_detail.iloc[i, j]

    #         sql = ""

    #         # insert into table
    #         item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate, IDREFPELANGGAN=idref,
    #                                    HOUR_NUM=hour_num, CLUSTER_NUM=cluster, FDVC_NORMALIZED=fdvc, AREA_ID=id_unit_usaha)
    #         session.add(item)

    #     # commit per id ref pelanngan
    #     session.commit()

    # engine2 = sqlalchemy.create_engine(
    #     'mssql+pyodbc://sa:[email protected]/SIPG?driver=SQL+Server')

    # Session = sessionmaker(bind=engine2)
    # session = Session()

    # Base = declarative_base()

    # class PL_CUSTOMER_CLUSTER(Base):
    #     __tablename__ = 'PL_CUSTOMER_CLUSTER'

    #     ID = Column(Integer, primary_key=True)
    #     DATE_STAMP = Column(DateTime)
    #     IDREFPELANGGAN = Column(String(30))
    #     HOUR_NUM = Column(Integer)
    #     CLUSTER_NUM = Column(Integer)
    #     HOUR_NUM = Column(Integer)
    #     FDVC_NORMALIZED = Column(Float)
    #     AREA_ID = Column(String(5))

    # df_normalized_detail

    # for i in range(clusters):
    #     print("cluster: " + str(i))
    #     CLUSTER_NAME = "CENTROID_ID" + str(i)
    #     cluster = i
    #     for j in range(totalRow):
    #         fdvc_norm = dba_km.cluster_centers_[i][j][0]
    #         hour_num = j

    #         sql = ""
    #         item = PL_CUSTOMER_CLUSTER(DATE_STAMP=startDate, IDREFPELANGGAN=CLUSTER_NAME,
    #                                    HOUR_NUM=hour_num, CLUSTER_NUM=cluster, FDVC_NORMALIZED=fdvc_norm, AREA_ID=id_unit_usaha)
    #         session.add(item)
    #         print("fdvc:" + str(fdvc_norm) + "Hour:" + str(hour_num))
    #     # commit per id ref pelanngan
    #     session.commit()
    #     print(str(j) + ", " + str(fdvc_norm))

    return totalData
Example 9

begin_values = m.iloc[0]
for i in range(0, len(m.columns)):
    m.iloc[:, i] = m.iloc[:, i] / begin_values[i]

x = m.to_numpy().transpose()

clusters = 10
dba_km = TimeSeriesKMeans(n_clusters=clusters,
                          n_init=2,
                          n_jobs=24,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)

y = dba_km.fit_predict(x)
yy = pd.DataFrame(y.reshape(-1, 1))
yy['name'] = m.columns
yy.set_index('name', inplace=True)
yy.columns.values[0] = 'sector'

for cluster in range(clusters):
    plt.subplot(5, 2, cluster + 1)
    for xx in x[y == cluster]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    #plt.plot(sdtw_km.cluster_centers_[yi].ravel(), "r-")
    #plt.xlim(0, sz)
    #plt.ylim(-4, 4)
    #plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),          transform=plt.gca().transAxes)
    #if yi == 1:
    #    plt.title("Soft-DTW $k$-means")
Example 10
    min_cluster = 2
    max_cluster = 21
    silhouette_score_dict = {}
    sse_dict = {}
    label_dict = {}

    silhouette_score_dict["time-series-k-means"] = []
    sse_dict["time-series-k-means"] = []
    label_dict["time-series-k-means"] = {}
    # silhouette_score_dict["k-shape"] = []
    # silhouette_score_dict["global-alignment-kernel-k-means"] = []
    for i in range(min_cluster, max_cluster):
        print(service + "-cluster:" + str(i))
        km = TimeSeriesKMeans(n_clusters=i, verbose=True)
        label = km.fit_predict(X_train)
        silhouette_score_dict["time-series-k-means"].append(
            silhouette_score(X_train, label, metric="dtw"))
        sse_dict["time-series-k-means"].append(km.inertia_)
        label_dict["time-series-k-means"][i] = label

        # km = GlobalAlignmentKernelKMeans(n_clusters=i, verbose=True)
        # label = km.fit_predict(X_train)
        # silhouette_score_dict["global-alignment-kernel-k-means"].append(silhouette_score(X_train, label, metric="dtw"))

        # km = KShape(n_clusters=i, verbose=True)
        # label = km.fit_predict(X_train)
        # silhouette_score_dict["k-shape"].append(silhouette_score(X_train, label, metric="dtw"))

    s1 = str(silhouette_score_dict)
    s2 = str(sse_dict)
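    # A minimal sketch (not in the original): read the best cluster count off
    # the silhouette scores instead of only stringifying the result dicts.
    scores = silhouette_score_dict["time-series-k-means"]
    best_k = min_cluster + scores.index(max(scores))
    print("best number of clusters by silhouette:", best_k)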
Example 11
dishwasher_omp = result['dishwasher'].RecSignal[0:len(appliance_dict['dishwasher'])]
fridgefreezer_omp = result['fridgefreezer'].RecSignal[0:len(appliance_dict['fridgefreezer'])]
kettle_omp = result['kettle'].RecSignal[0:len(appliance_dict['kettle'])]
microwave_omp = result['microwave'].RecSignal[0:len(appliance_dict['microwave'])]

half_washer = np.zeros((result['washerdryer'].Kcoef.shape[1],result['washerdryer'].Kcoef.shape[0],1))
for i in range(result['washerdryer'].Kcoef.shape[1]):
    half_washer[i,:,0] = result['washerdryer'].Kcoef[:,i]
# half_washer += result['washerdryer'].Kcoef

dba_km = TimeSeriesKMeans(n_clusters=2,
                          n_init=2,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10)
y_pred = dba_km.fit_predict(half_washer)

washer_labels = dba_km.labels_
summed_washer_clusters = {}
for g in np.unique(washer_labels):
    summed_washer_clusters[g] = np.zeros(half_washer.shape[1])
    for idx in np.where(washer_labels == g)[0]:
        summed_washer_clusters[g] += half_washer[idx, :, 0]
# %%
cdict = {0: 'red', 1: 'blue', 2: 'green'}
for i in range(len(half_washer)):
    plt.plot(half_washer[i, :, 0], label=washer_labels[i], color=cdict[washer_labels[i]])
    plt.ylim([0,4000])
    plt.legend()
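# A minimal sketch (not in the original): visualise the summed profile of each
# washer cluster computed in summed_washer_clusters above.
plt.figure()
for g, profile in summed_washer_clusters.items():
    plt.plot(profile, label='cluster %d sum' % g, color=cdict[g])
plt.legend()
plt.show()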
Example 12
class NonMyopicEarlyClassifier(ClassifierMixin, TimeSeriesBaseEstimator):
    """Early Classification modelling for time series using the model
    presented in [1]_.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to form.

    base_classifier : Estimator or None
        Estimator (instance) to be cloned and used for classifications.
        If None, the chosen classifier is a 1NN with Euclidean metric.

    min_t : int
        Earliest time at which a classification can be performed on a time
        series

    lamb : float
        Value of the hyper parameter lambda used during the computation of the
        cost function to evaluate the probability
        that a time series belongs to a cluster given the time series.

    cost_time_parameter : float
        Parameter of the cost function of time. This function is of the form :
        f(time) = time * cost_time_parameter

    random_state: int
        Random state of the base estimator

    Attributes
    --------------------

    classifiers_ : list
        A list containing all the classifiers trained for the model, that is,
        (maximum_time_stamp - min_t) elements.

    pyhatyck_ : array like of shape (maximum_time_stamp - min_t, n_cluster, __n_classes, __n_classes)
        Contains the probabilities of being classified as class y_hat given
        class y and cluster ck for a trained classifier. The penultimate
        dimension of the array is associated to the true
        class of the series and the last dimension to the predicted class.


    pyck_ : array like of shape (__n_classes, n_cluster)
        Contains the probabilities of being of true class y given a cluster ck

    X_fit_dims : tuple of the same shape as the training dataset


    Examples
    --------
    >>> dataset = to_time_series_dataset([[1, 2, 3, 4, 5, 6],
    ...                                   [1, 2, 3, 4, 5, 6],
    ...                                   [1, 2, 3, 4, 5, 6],
    ...                                   [1, 2, 3, 3, 2, 1],
    ...                                   [1, 2, 3, 3, 2, 1],
    ...                                   [1, 2, 3, 3, 2, 1],
    ...                                   [3, 2, 1, 1, 2, 3],
    ...                                   [3, 2, 1, 1, 2, 3]])
    >>> y = [0, 0, 0, 1, 1, 1, 0, 0]
    >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=1000.,
    ...                                  cost_time_parameter=.1,
    ...                                  random_state=0)
    >>> model.fit(dataset, y)  # doctest: +ELLIPSIS
    NonMyopicEarlyClassifier(...)
    >>> print(type(model.classifiers_))
    <class 'dict'>
    >>> print(model.pyck_)
    [[0. 1. 1.]
     [1. 0. 0.]]
    >>> preds, pred_times = model.predict_class_and_earliness(dataset)
    >>> preds
    array([0, 0, 0, 1, 1, 1, 0, 0])
    >>> pred_times
    array([4, 4, 4, 4, 4, 4, 1, 1])
    >>> pred_probas, pred_times = model.predict_proba_and_earliness(dataset)
    >>> pred_probas
    array([[1., 0.],
           [1., 0.],
           [1., 0.],
           [0., 1.],
           [0., 1.],
           [0., 1.],
           [1., 0.],
           [1., 0.]])
    >>> pred_times
    array([4, 4, 4, 4, 4, 4, 1, 1])

    References
    ----------
    .. [1] A. Dachraoui, A. Bondu & A. Cornuejols. Early classification of time
       series as a non myopic sequential decision making problem.
       ECML/PKDD 2015
    """

    def __init__(self, n_clusters=2, base_classifier=None,
                 min_t=1, lamb=1., cost_time_parameter=1., random_state=None):
        super(NonMyopicEarlyClassifier, self).__init__()
        self.base_classifier = base_classifier
        self.n_clusters = n_clusters
        self.min_t = min_t
        self.lamb = lamb
        self.cost_time_parameter = cost_time_parameter
        self.random_state = random_state

    @property
    def classes_(self):
        if hasattr(self, 'classifiers_'):
            return self.classifiers_[self.min_t].classes_
        else:
            return None

    def fit(self, X, y):
        """
        Fit early classifier.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Training data, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        y : array-like of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary

        Returns
        -------
        self : returns an instance of self.
        """

        X = check_array(X, allow_nd=True)
        X = check_dims(X)
        X = to_time_series_dataset(X)
        y_arr = np.array(y)
        label_set = np.unique(y_arr)

        self.cluster_ = TimeSeriesKMeans(n_clusters=self.n_clusters,
                                         random_state=self.random_state)
        if self.base_classifier is not None:
            clf = self.base_classifier
        else:
            clf = KNeighborsTimeSeriesClassifier(n_neighbors=1,
                                                 metric="euclidean")
        self.__n_classes_ = len(label_set)
        self._X_fit_dims = X.shape
        sz = X.shape[1]
        self.classifiers_ = {t: clone(clf)
                             for t in range(self.min_t, sz + 1)}
        self.pyhatyck_ = np.empty((sz - self.min_t + 1,
                                   self.n_clusters,
                                   self.__n_classes_, self.__n_classes_))
        c_k = self.cluster_.fit_predict(X)
        X1, X2, c_k1, c_k2, y1, y2 = train_test_split(
            X, c_k, y_arr,
            test_size=0.5,
            stratify=c_k,
            random_state=self.random_state
        )

        label_to_ind = {lab: ind for ind, lab in enumerate(label_set)}
        y_ = np.array([label_to_ind.get(lab, self.__n_classes_ + 1)
                       for lab in y_arr])

        vector_of_ones = np.ones((X.shape[0], ))
        self.pyck_ = coo_matrix(
            (vector_of_ones, (y_, c_k)),
            shape=(self.__n_classes_, self.n_clusters),
        ).toarray()
        self.pyck_ /= self.pyck_.sum(axis=0, keepdims=True)
        for t in range(self.min_t, sz + 1):
            self.classifiers_[t].fit(X1[:, :t], y1)
            for k in range(0, self.n_clusters):
                index = (c_k2 == k)
                if index.shape[0] != 0:
                    X2_current_cluster = X2[index, :t]
                    y2_current_cluster = y2[index]
                    y2_hat = self.classifiers_[t].predict(
                        X2_current_cluster[:, :t]
                    )
                    conf_matrix = confusion_matrix(y2_current_cluster, y2_hat,
                                                   labels=label_set)
                    # normalize parameter seems to be quite recent in sklearn,
                    # so let's do it ourselves
                    normalizer = conf_matrix.sum(axis=0, keepdims=True)
                    normalizer[normalizer == 0] = 1  # Avoid divide by 0
                    conf_matrix = conf_matrix / normalizer

                    # pyhatyck_ stores
                    # P_{t+\tau}(\hat{y} | y, c_k) \delta_{y \neq \hat{y}}
                    # elements so it should have a null diagonal because of
                    # the \delta_{y \neq \hat{y}} term
                    np.fill_diagonal(conf_matrix, 0)
                    self.pyhatyck_[t - self.min_t, k] = conf_matrix
        return self

    def get_cluster_probas(self, Xi):
        r"""Compute cluster probability :math:`P(c_k | Xi)`.

        This quantity is computed using the following formula:

        .. math::

            P(c_k | Xi) = \frac{s_k(Xi)}{\sum_j s_j(Xi)}

        where

        .. math::

            s_k(Xi) = \frac{1}{1 + \exp{-\lambda \Delta_k(Xi)}}

        with

        .. math::

            \Delta_k(Xi) = \frac{\bar{D} - d(Xi, c_k)}{\bar{D}}

        and :math:`\bar{D}` is the average of the distances between `Xi` and
        the cluster centers.

        Parameters
        ----------
        Xi: numpy array, shape (t, d)
            A time series observed up to time t

        Returns
        -------
        probas : numpy array, shape (n_clusters, )

        Examples
        --------
        >>> from tslearn.utils import to_time_series
        >>> dataset = to_time_series_dataset([[1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [3, 2, 1, 1, 2, 3],
        ...                                   [3, 2, 1, 1, 2, 3]])
        >>> y = [0, 0, 0, 1, 1, 1, 0, 0]
        >>> ts0 = to_time_series([1, 2])
        >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=0.,
        ...                                  random_state=0)
        >>> probas = model.fit(dataset, y).get_cluster_probas(ts0)
        >>> probas.shape
        (3,)
        >>> probas  # doctest: +ELLIPSIS
        array([0.33..., 0.33..., 0.33...])
        >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=10000.,
        ...                                  random_state=0)
        >>> probas = model.fit(dataset, y).get_cluster_probas(ts0)
        >>> probas.shape
        (3,)
        >>> probas
        array([0.5, 0.5, 0. ])
        >>> ts1 = to_time_series([3, 2])
        >>> model.get_cluster_probas(ts1)
        array([0., 0., 1.])
        """
        Xi = check_array(Xi)
        diffs = Xi[np.newaxis, :] - self.cluster_.cluster_centers_[:, :len(Xi)]
        distances_clusters = np.linalg.norm(diffs, axis=(1, 2))
        average_distance = np.mean(distances_clusters)
        delta_k = 1. - distances_clusters / average_distance
        s_k = 1. / (1. + np.exp(-self.lamb * delta_k))
        return s_k / s_k.sum()

    def _expected_costs(self, Xi):
        r"""Compute expected future costs from an incoming time series `Xi`.

        This cost is computed, for a time horizon :math:`\tau`, as:

        .. math::

            \sum_k P(c_k | Xi) \sum_y P(y | c_k)
                \sum_{\hat{y}}
                P_{t+\tau}(\hat{y} | y, c_k) \delta_{y \neq \hat{y}}

        where:

        * :math:`P(c_k | Xi)` is obtained through a call to
        `get_cluster_probas`
        * :math:`P(y | c_k)` is stored in `pyck_`
        * :math:`P_{t+\tau}(\hat{y} | y, c_k) \delta_{y \neq \hat{y}}` is
        stored in `pyhatyck_`

        Parameters
        ----------
        Xi: numpy array, shape (t, d)
            A time series observed up to time t

        Returns
        --------
        cost : numpy array of shape (self.__len_X_ - t + 1, )
            Expected future costs for all time stamps from t to self.__len_X_

        Examples
        --------
        >>> from tslearn.utils import to_time_series
        >>> dataset = to_time_series_dataset([[1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [3, 2, 1, 1, 2, 3],
        ...                                   [3, 2, 1, 1, 2, 3]])
        >>> y = [0, 0, 0, 1, 1, 1, 0, 0]
        >>> ts1 = to_time_series([3, 2])
        >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=10000.,
        ...                                  cost_time_parameter=1.,
        ...                                  random_state=0)
        >>> costs = model.fit(dataset, y)._expected_costs(ts1)
        >>> costs.shape
        (5,)
        >>> costs  # doctest: +ELLIPSIS
        array([2., 3., 4., 5., 6.])
        """
        proba_clusters = self.get_cluster_probas(Xi=Xi)
        truncated_t = Xi.shape[0]
        # pyhatyck_ is indexed by: t, k, y, yhat
        sum_pyhatyck = np.sum(
            self.pyhatyck_[truncated_t - self.min_t:],
            axis=-1
        )
        sum_pyhatyck = np.transpose(sum_pyhatyck, axes=(0, 2, 1))
        # sum_pyhatyck is now indexed by: t, y, k
        sum_global = np.sum(sum_pyhatyck * self.pyck_[np.newaxis, :], axis=1)
        cost = np.dot(sum_global, proba_clusters)
        return cost + self._cost_time(np.arange(truncated_t,
                                                self._X_fit_dims[1] + 1))

    def _get_prediction_time(self, Xi):
        """Compute optimal prediction time for the incoming time series Xi.
        """
        time_prediction = None
        for t in range(self.min_t, self._X_fit_dims[1] + 1):
            tau_star = np.argmin(self._expected_costs(Xi=Xi[:t]))
            if (t == self._X_fit_dims[1]) or (tau_star == 0):
                time_prediction = t
                break
        return time_prediction

    def _predict_single_series(self, Xi):
        """
        This function classifies a single time series Xi

        Parameters
        ----------
        Xi: vector
            a time series that is probably incomplete but that nonetheless we
            want to classify

        Returns
        -------
        int : the class which is predicted
        int : the time of the prediction
        """
        t = self._get_prediction_time(Xi)
        pred = self.classifiers_[t].predict([Xi[:t]])[0]
        return pred, t

    def _predict_single_series_proba(self, Xi):
        """
        This function computes class probabilities for a single time series Xi

        Parameters
        ----------
        Xi: vector
            a time series that is probably incomplete but that nonetheless we
            want to classify
        Returns
        -------
        array : the predicted class probabilities
        int : the time of the prediction
        """
        t = self._get_prediction_time(Xi)
        pred = self.classifiers_[t].predict_proba([Xi[:t]])[0]
        return pred, t

    def predict_class_and_earliness(self, X):
        """
        Provide predicted class as well as prediction timestamps.

        Prediction timestamps are timestamps at which a prediction is made in
        early classification setting.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        Returns
        -------
        array, shape (n_samples,)
            Predicted classes.
        array-like of shape (n_series, )
            Prediction timestamps.
        """

        X = check_array(X, allow_nd=True)
        check_is_fitted(self, '_X_fit_dims')
        X = check_dims(X, X_fit_dims=self._X_fit_dims,
                       check_n_features_only=True)
        y_pred = []
        time_prediction = []
        for i in range(0, X.shape[0]):
            cl, t = self._predict_single_series(X[i])
            y_pred.append(cl)
            time_prediction.append(t)
        return np.array(y_pred), np.array(time_prediction)

    def predict(self, X):
        """
        Provide predicted class.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        Returns
        -------
        array, shape (n_samples,)
            Predicted classes.
        """
        return self.predict_class_and_earliness(X)[0]

    def predict_proba_and_earliness(self, X):
        """
        Provide probability estimates as well as prediction timestamps.

        Prediction timestamps are timestamps at which a prediction is made in
        early classification setting.
        The returned estimates for all classes are ordered by the
        label of classes.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        Returns
        -------
        array-like of shape (n_series, n_classes)
            Probability of the sample for each class in the model,
            where classes are ordered as they are in ``self.classes_``.
        array-like of shape (n_series, )
            Prediction timestamps.
        """

        X = check_array(X, allow_nd=True)
        check_is_fitted(self, '_X_fit_dims')
        X = check_dims(X, X_fit_dims=self._X_fit_dims,
                       check_n_features_only=True)
        y_pred = []
        time_prediction = []
        for i in range(0, X.shape[0]):
            probas, t = self._predict_single_series_proba(X[i])
            y_pred.append(probas)
            time_prediction.append(t)
        return np.array(y_pred), np.array(time_prediction)

    def predict_proba(self, X):
        """
        Probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        Returns
        -------
        array-like of shape (n_series, n_classes)
            Probability of the sample for each class in the model,
            where classes are ordered as they are in ``self.classes_``.
        """
        return self.predict_proba_and_earliness(X)[0]

    def _cost_time(self, t):
        return t * self.cost_time_parameter

    def early_classification_cost(self, X, y):
        r"""
        Compute early classification score.

        The score is computed as:

        .. math::

            1 - acc + \alpha \frac{1}{n} \sum_i t_i

        where :math:`\alpha` is the trade-off parameter
        (`self.cost_time_parameter`) and :math:`t_i` are prediction timestamps.

        Parameters
        ----------
        X : array-like of shape (n_series, n_timestamps, n_features)
            Vector to be scored, where `n_series` is the number of time series,
            `n_timestamps` is the number of timestamps in the series
            and `n_features` is the number of features recorded at each
            timestamp.

        y : array-like, shape = (n_samples) or (n_samples, n_outputs)
            True labels for X.

        Returns
        -------
        float
            Early classification cost (a positive number, the lower the better)

        Examples
        --------
        >>> dataset = to_time_series_dataset([[1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 4, 5, 6],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [1, 2, 3, 3, 2, 1],
        ...                                   [3, 2, 1, 1, 2, 3],
        ...                                   [3, 2, 1, 1, 2, 3]])
        >>> y = [0, 0, 0, 1, 1, 1, 0, 0]
        >>> model = NonMyopicEarlyClassifier(n_clusters=3, lamb=1000.,
        ...                                  cost_time_parameter=.1,
        ...                                  random_state=0)
        >>> model.fit(dataset, y)  # doctest: +ELLIPSIS
        NonMyopicEarlyClassifier(...)
        >>> preds, pred_times = model.predict_class_and_earliness(dataset)
        >>> preds
        array([0, 0, 0, 1, 1, 1, 0, 0])
        >>> pred_times
        array([4, 4, 4, 4, 4, 4, 1, 1])
        >>> model.early_classification_cost(dataset, y)
        0.325
        """
        y_pred, pred_times = self.predict_class_and_earliness(X)
        acc = accuracy_score(y, y_pred)
        return (1. - acc) + np.mean(self._cost_time(pred_times))

    def _more_tags(self):
        # Because some of the data validation checks rely on datasets that are
        # too small to pass here (only 1 item in one of the clusters, hence no
        # stratified split possible)
        return {"no_validation": True}
#
# for yi in range(2):
#     plt.subplot(3, 3, 4 + yi)
#     for xx in X_train[y_pred == yi]:
#         plt.plot(xx.ravel(), "k-", alpha=.2)
#     plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
#     plt.xlim(0, sz)
#     plt.ylim(-4, 4)
#     if yi == 1:
#         plt.title("DBA $k$-means")

# Soft-DTW-k-means
print("Soft-DTW k-means")
sdtw_km = TimeSeriesKMeans(n_clusters=2, metric="softdtw", metric_params={"gamma": .01},
                           verbose=True, random_state=seed)
y_pred = sdtw_km.fit_predict(xie_1)

for yi in range(2):
    plt.subplot(3, 3, 7 + yi)
    for xx in X_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(sdtw_km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    if yi == 1:
        plt.title("Soft-DTW $k$-means")


plt.subplot(3, 3, 3)
for xx in X_train:
    plt.plot(xx.ravel(), "g-", alpha=.2)
Esempio n. 14
0
def main(args):
    if args.data == 'simulation':
        window_size = 50
        path = './data/simulated_data/'
        n_cluster = 4
        augment = 5
    if args.data == 'wf':
        window_size = 2500
        path = './data/waveform_data/processed'
        n_cluster = 4
        augment = 500
    if args.data == 'har':
        window_size = 5
        path = './data/HAR_data/'
        n_cluster = 6
        augment = 100

    with open(os.path.join(path, 'x_train.pkl'), 'rb') as f:
        x = pickle.load(f)
    with open(os.path.join(path, 'state_train.pkl'), 'rb') as f:
        y = pickle.load(f)
    with open(os.path.join(path, 'x_test.pkl'), 'rb') as f:
        x_test = pickle.load(f)
    with open(os.path.join(path, 'state_test.pkl'), 'rb') as f:
        y_test = pickle.load(f)

    T = x.shape[-1]
    t = np.random.randint(window_size, T - window_size, len(x) * augment)
    x_window = np.array([
        x[i // augment, :, tt - window_size // 2:tt + window_size // 2]
        for i, tt in enumerate(t)
    ])
    y_window = np.round(
        np.mean(
            np.array([
                y[i // augment, tt - window_size // 2:tt + window_size // 2]
                for i, tt in enumerate(t)
            ]), -1))
    if args.data == 'wf':
        minority_index = np.logical_or(y_window == 1, y_window == 2)
        rand_index = np.random.randint(0, len(y_window), 200)
        y_window = np.concatenate(
            [y_window[minority_index], y_window[rand_index]], 0)
        x_window = np.concatenate(
            [x_window[minority_index], x_window[rand_index]], 0)
        x_window = x_window.transpose((0, 2, 1))  # shape:[n_samples, t_len, d]
        x_window = x_window[:, ::2, :]  # Decimate measurements for efficiency
    else:
        x_window = x_window.transpose((0, 2, 1))  # shape:[n_samples, t_len, d]

    t = np.random.randint(window_size, T - window_size, len(x_test) * augment)
    x_test_window = np.array([
        x_test[i // augment, :, tt - window_size // 2:tt + window_size // 2]
        for i, tt in enumerate(t)
    ])
    y_test_window = np.round(
        np.mean(
            np.array([
                y_test[i // augment,
                       tt - window_size // 2:tt + window_size // 2]
                for i, tt in enumerate(t)
            ]), -1))
    if 0:  #args.data =='wf':
        minority_index = np.logical_or(y_test_window == 1, y_test_window == 2)
        rand_index = np.random.randint(0, len(y_test_window), 150)
        y_test = np.concatenate(
            [y_test_window[minority_index], y_test_window[rand_index]], 0)
        x_test = np.concatenate(
            [x_test_window[minority_index], x_test_window[rand_index]], 0)
        x_test_window = x_test.transpose((0, 2, 1))  # shape:[n_samples, t_len, d]
        x_test = x_test_window[:, ::2, :]  # Decimate measurements for efficiency
    else:
        y_test = y_test_window
        x_test = x_test_window
        x_test = x_test.transpose((0, 2, 1))  # shape:[n_samples, t_len, d]

    accuracy, s_score, db_score, auc, auprc = [], [], [], [], []
    for cv in range(3):
        shuffled_inds = list(range(len(x_window)))
        random.shuffle(shuffled_inds)
        x_window = x_window[shuffled_inds]
        y_window = y_window[shuffled_inds]
        if args.data == 'wf':
            n_train = int(0.7 * len(x_window))
            x_train = x_window[:n_train]
            y_train = y_window[:n_train]
            x_test = x_window[n_train:]
            y_test = y_window[n_train:]
        else:
            x_train = x_window
            y_train = y_window

        knn = KNeighborsTimeSeries(n_neighbors=args.K,
                                   metric='dtw').fit(x_train)
        kmeans = TimeSeriesKMeans(n_clusters=n_cluster, metric='dtw')
        cluster_labels = kmeans.fit_predict(x_test)

        dist, ind = knn.kneighbors(x_test, return_distance=True)
        # majority vote over the labels of the K nearest training neighbours
        predictions = np.array(
            [np.bincount(y_train[neighbors].astype(int)).argmax()
             for neighbors in ind])
        y_onehot = np.zeros((len(y_test), n_cluster))
        y_onehot[np.arange(len(y_onehot)), y_test.astype(int)] = 1
        prediction_onehot = np.zeros((len(y_test), n_cluster))
        prediction_onehot[np.arange(len(prediction_onehot)),
                          predictions.astype(int)] = 1

        accuracy.append(accuracy_score(y_test, predictions))
        auc.append(roc_auc_score(y_onehot, prediction_onehot))
        auprc.append(average_precision_score(y_onehot, prediction_onehot))
        s_score.append(
            silhouette_score(x_test.reshape((len(x_test), -1)),
                             cluster_labels))
        db_score.append(
            davies_bouldin_score(x_test.reshape((len(x_test), -1)),
                                 cluster_labels))

    print('\nSummary performance:')
    print('Accuracy: ', np.mean(accuracy) * 100, '+-', np.std(accuracy) * 100)
    print('AUC: ', np.mean(auc), '+-', np.std(auc))
    print('AUPRC: ', np.mean(auprc), '+-', np.std(auprc))
    print('Silhouette score: ', np.mean(s_score), '+-', np.std(s_score))
    print('Davies Bouldin score: ', np.mean(db_score), '+-', np.std(db_score))
Esempio n. 15
0
File: dtw.py Project: hazelsah/DTW
def dtw_clustering(data):
    #  TODO find n_clusters using elbow method
    model = TimeSeriesKMeans(n_clusters=5, metric="dtw", max_iter=1000)
    y_pred = model.fit_predict(data)
    return y_pred
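# Hedged sketch for the elbow-method TODO above: sweep n_clusters and record the
# final within-cluster DTW cost (exposed by TimeSeriesKMeans as inertia_), then
# pick the k where the curve bends. The k range and max_iter below are only
# illustrative defaults.
def elbow_n_clusters(data, k_range=range(2, 9)):
    inertias = []
    for k in k_range:
        model = TimeSeriesKMeans(n_clusters=k, metric="dtw", max_iter=100,
                                 random_state=0)
        model.fit(data)
        inertias.append(model.inertia_)
    return list(zip(k_range, inertias))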
def kmeans(data,clusters,year_of_interest,njobs): 
	#get data
	seed = 5
	np.random.seed(seed)
	#print('data shape is: ',data.shape)
	X_train=data.T
	#print(X_train)
	#print('shape is ', X_train.shape, ' before scaling')
	#X_train = np.expand_dims(np.zeros(X_train.shape),axis=2)
	#print('the shape after expand dims is: ', X_train.shape)
	#print('example before: ',X_train[:,10])
	#np.random.shuffle(X_train)
	#print(X_train.shape)
	#print('x train is: ', X_train)
	#X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train)
	#print(X_train.shape)
	#X_train[:,:-1]=np.squeeze(TimeSeriesScalerMeanVariance().fit_transform(X_train[:,:-1]),axis=2) #changed from 2 to 1 
	#print('x train is now: ', X_train)
	X_train = np.nan_to_num(X_train)
	#print('example after is: ', X_train)
	#print('X_train shape is ', X_train.shape, ' after scaling')
	#X_train = np.squeeze(X_train, axis=2)
	#X_train_labeled = np.append(X_train,labels)
	sz = X_train.shape[1]

	# Euclidean k-means
	# print("Euclidean k-means")
	# km = TimeSeriesKMeans(n_clusters=clusters, verbose=True, random_state=seed,n_jobs=20)
	# #print('km is ',km)
	# y_pred = km.fit_predict(np.nan_to_num(X_train))#[:,0][0])
	# #print(y_pred)
	# #print('y_pred is',y_pred)
	# cluster_dict = {}
	# cluster_centers = {}
	# plt.figure(figsize=(10,10))
	# #uncomment calls to plot if you want to see the figures
	# for yi in range(clusters):
	#     #print('cluster is: ', yi+1)
	#     time_series = {} #changed from list to dict
	#     plt.subplot(10, 5, yi + 1)
	#     #count = 0 
	#     for xx in X_train[y_pred == yi]:
	#         #time_series.append(xx[-1]) #removed [-1]
	#         time_series.update({xx[-1]:xx})
	#         #print('xxshape is ',xx.shape)
	#         #print('count is: ',count)
	#         #count +=1
	#         #print(xx[1])
	#         plt.plot(xx.ravel(), "darkblue", alpha=.2)
	#     cluster_dict.update({f'cluster_{yi+1}':time_series})
	#     cluster_centers.update({f'cluster_{yi+1}':km.cluster_centers_[yi]})

	#     plt.plot(km.cluster_centers_[yi].ravel(), "r-")
	#     #print(km.cluster_centers_[yi].shape)
	#     plt.xlim(0, sz)
	#     plt.ylim(-10, 10)
	#     plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
	#              transform=plt.gca().transAxes)
	#     if yi == 1:
	#         plt.title("Euclidean $k$-means")
	        

	#print(cluster_dict)
	#cluster_df = pd.DataFrame(dict([ (k,pd.Series(v)) for k,v in cluster_dict.items() ]))
	#print(cluster_df.iloc[:,0][0])
	# DBA-k-means
	print("DBA k-means")
	dba_km = TimeSeriesKMeans(n_clusters=clusters,
	                          n_init=3,
	                          metric="dtw",
	                          verbose=True,
	                          max_iter_barycenter=10,
	                          random_state=seed,
	                          n_jobs=njobs)
	y_pred = dba_km.fit_predict(np.nan_to_num(X_train))
	cluster_dict = {}
	cluster_centers = {}
	plt.figure(figsize=(10,10))

	for yi in range(clusters):
	    #print('the cluster number is: ', yi+1)
	    time_series = {} #changed from list to dict
	    plt.subplot(10, 5, yi + 1)

	    #plt.subplot(clusters, clusters, (clusters+1) + yi)
	    for xx in X_train[y_pred == yi]:
	        #print('the time series in this cluster look like: ',xx)
	        time_series.update({xx[-1]:xx})
	        plt.plot(xx.ravel(), "darkblue", alpha=.2)
	    cluster_dict.update({f'cluster_{yi+1}':time_series})
	    cluster_centers.update({f'cluster_{yi+1}':dba_km.cluster_centers_[yi]})
	    plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
	    plt.xlim(0, sz)
	    plt.ylim(0, 1) #changed from -10,10
	    plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
	             transform=plt.gca().transAxes)
	    if yi == 1:
	        plt.title(f"DBA $k$-means {year_of_interest}")

	# #Soft-DTW-k-means
	# print("Soft-DTW k-means")
	# sdtw_km = TimeSeriesKMeans(n_clusters=clusters,
	#                            metric="softdtw",
	#                            metric_params={"gamma": .01},
	#                            verbose=True,
	#                            random_state=seed,
	#                            n_jobs=10)
	# y_pred = sdtw_km.fit_predict(np.nan_to_num(X_train))

	# for yi in range(clusters):
	#     plt.subplot(clusters, clusters, ((clusters*2)+1) + yi)
	#     for xx in X_train[y_pred == yi]:
	#         plt.plot(xx.ravel(), "k-", alpha=.2)
	#     plt.plot(sdtw_km.cluster_centers_[yi].ravel(), "r-")
	#     plt.xlim(0, sz)
	#     plt.ylim(-10, 10)
	#     plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
	#              transform=plt.gca().transAxes)
	#     if yi == 1:
	#         plt.title("Soft-DTW $k$-means")

	plt.tight_layout()
	#plt.show()
	#plt.close(fig)
	plt.clf()
	plt.close('all')
	#print(cluster_centers)
	return cluster_dict,cluster_centers
Esempio n. 17
0
class Kmean(cs):
    """Classe de partitionnement des donnees avec l'algorithm K-mean

    Parameters:
        * ss : SeriesSupp
            instance du manager de series temporelles

    Variables:
        * seed: int
            Valeur d'initialisation de l'algo, random.
        * counter: Counter
            repartition des objets au sein des clusters
        * km: TimeSeriesKMeans
            Instance de l'algo
        * clust_name: String
            Nom de l'algo(affichage des plots)
        * metric: String
            Choix du metrics utilise, principalement softdtw ici car tres efficace et rapide
    """
    def __init__(self, ss):
        super().__init__(ss)
        self.seed = 0
        np.random.seed(self.seed)
        self.counter = None
        self.km = None
        self.clust_name = "Kmean"
        self.metric = "softdtw"

    def k_init(self, v = True):
        """
        Initialise the algorithm instance with the current parameters

        Parameters:
            * v: boolean
                Verbose, prints information related to the clustering

        Returns:
            NA
        """
        self.km = TimeSeriesKMeans(n_clusters = self.n, metric = self.metric, metric_params = {"gamma": .01}, verbose = v, random_state = self.seed)

    def k_fit(self):
        """
        Perform the clustering

        Parameters:
            NA

        Returns:
            NA
        """
        self.ts_clust = self.km.fit_predict(self.ts)

    def cluster_counter(self):
        """
        Count the objects within each cluster

        Parameters:
            NA

        Returns:
            NA
        """
        self.counter = Counter(self.ts_clust)
            distance = dtw.distance(time_series_scaled[i],
                                    time_series_scaled[j],
                                    psi=1)

            distance_matrix[i, j] = distance

# now, let's call our clustering algorithms, with a pre-computed distance matrix
if False:  # commented out, at the moment
    plt.figure(figsize=(10, 7))
    plt.title("Dendrograms")
    dend = shc.dendrogram(
        shc.linkage(
            time_series_scaled,
            method='ward',
            metric=lambda u, v: distance_matrix(time_series_scaled.index(u),
                                                time_series_scaled.index(v))))
    plt.axhline(y=cutoff_distance, color='r', linestyle='--')
    plt.savefig(os.path.join(folder_name, "hierarchical-dendogram.pdf"))

print("Performing agglomerative clustering...")
n_clusters = 10
clusterer = TimeSeriesKMeans(n_clusters=n_clusters,
                             metric='dtw',
                             random_state=0)
clusterer.fit_predict(time_series_scaled)

print("Clusters:")
for c in range(0, clusterer.labels_.max() + 1):
    print("- Cluster #%d has %d time series" %
          (c, sum(clusterer.labels_ == c)))
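# The fragment above computes pairwise DTW distances and sketches a dendrogram,
# but the clustering actually run is DTW k-means. If a true agglomerative
# clustering on a pre-computed distance matrix is wanted instead, a hedged,
# self-contained sketch (toy distances stand in for the DTW matrix) could be:
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform

rng = np.random.default_rng(0)
pts = rng.random((20, 5))
toy_dist = np.abs(pts[:, None, :] - pts[None, :, :]).sum(-1)  # symmetric, zero diagonal

condensed = squareform(toy_dist, checks=False)    # SciPy expects the condensed form
Z = linkage(condensed, method='average')          # 'ward' would need raw observations
toy_labels = fcluster(Z, t=10, criterion='maxclust')
print(np.bincount(toy_labels)[1:])                # cluster sizes (labels start at 1)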
Esempio n. 19
0
    print(f"\tLabels Calculated, Elapsed: {time.time() - t}")

    #sc = silhouette_score(X, labels_bis, metric="softdtw")
import pickle
pickle.dump(distortions, open("output/ts_distortions.pkl", "wb"))

plt.plot(range(2, 13), distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.show()
 """

# Fit the model with the number of clusters at the elbow of the curve (from the loop above, k = 3)
km_bis = TimeSeriesKMeans(n_clusters=3, metric="softdtw")
labels_bis = km_bis.fit_predict(X)

# Plot the clusters
centroids = km_bis.cluster_centers_
color_palette = {"0": "#01AFB8", "1": "#196E9F", "2": "#D3D3D3"}
for i in range(0,3):
    col = color_palette[str(i)]
    plt.scatter(X[labels_bis == i , 0] , X[labels_bis == i , 1] , label = i, color = col)
plt.scatter(centroids[:,0] , centroids[:,1] , s = 80, color = 'black')
plt.legend()
plt.annotate("The black dots indicate the cluster's centroid", xy = (0,0))
plt.show()

# Append the clusters to the severity dataframe
to_merge = pd.DataFrame({"FIPS":fips, "label":labels_bis})
mh_historical = pd.merge(mh_historical, to_merge, on= "FIPS", how = "left")
Esempio n. 20
0
def clusters():
    """
        Display cluster plots for the researcher.

        This webpage is only for the researcher role.
        :return: Cluster plots
        """
    check_access_right(forbidden='user', redirect_url='control.index')

    currentVideo, vid_dict, n_clusters = get_video_information(
        request.args.get('vid'), request.args.get('cluster'))
    _, data = collect_mongodbobjects(currentVideo[0])

    ### set desired amount of clusters
    clustervals = np.arange(1, 10, 1)

    if _ == False or data.empty:
        return render_template("researcher/clusters.html",
                               the_div="There is no data for this video!",
                               the_script="",
                               vid_dict=vid_dict,
                               currentVideo=currentVideo,
                               currentCluster=n_clusters,
                               clustervals=clustervals,
                               currentVariable='-',
                               variable_list=[])

    data, currentVariable, variable_list = extract_variable(
        data, request.args.get('variable'))

    interpolators, max_t = get_interpolators(data, currentVariable)
    xs = np.arange(0, int(max_t) + 1.5, 1)

    # Generate data
    user_timeseries = [[interpolator(xs)] for interpolator in interpolators]

    seed = np.random.randint(0, int(1e5), 1)[0]
    np.random.seed(seed)

    # Set cluster count
    # n_clusters = 3
    if n_clusters > np.array(user_timeseries).shape[0]:
        n_clusters = np.array(user_timeseries).shape[0]

    # Euclidean k-means
    km = TimeSeriesKMeans(n_clusters=n_clusters,
                          verbose=True,
                          random_state=seed)
    y_pred = km.fit_predict(user_timeseries)

    # Generate plots and calculate statistics
    all_plots = ""
    all_scripts = ""
    plots = []

    ### TODO MAYBE: intra-cluster correlation with rpy2. Might not work with matrices
    """    valmatrix = np.empty([24,151])
    for iii in range(24):
        valmatrix[iii, :] = user_timeseries[iii][0]
    print(type(valmatrix), valmatrix.shape)
    print(type(valmatrix[0]), len(valmatrix[0]))
    print(type(valmatrix[0][0]))
    r_icc = importr("ICC", lib_loc="C:/Users/Lauri Lode/Documents/R/win-library/3.4")
    #m = r.matrix(FloatVector(valmatrix.flatten()), nrow=24)
    df = DataFrame({"groups": IntVector(y_pred),
        "values": FloatVector(valmatrix.flatten())})
    icc_res = r_icc.ICCbare("groups", "values", data=df)
    icc_val = icc_res[0]
    print("ICC" + str(icc_val))"""

    for yi in range(n_clusters):
        p = figure()
        n = 0
        values = km.cluster_centers_[yi].ravel()
        centerMean = np.mean(km.cluster_centers_[yi].ravel())
        varsum = 0
        for xx in range(0, len(y_pred)):
            if y_pred[xx] == yi:
                n = n + 1
                for iii in range(len(user_timeseries[xx][0])):
                    varsum = varsum + eucl(user_timeseries[xx][0][iii],
                                           values[iii]) / len(
                                               user_timeseries[xx][0])

                p.line(range(0, len(user_timeseries[xx][0])),
                       user_timeseries[xx][0],
                       line_width=0.3)
        varsum = np.sqrt(varsum)

        titleString = "C#" + str(yi + 1) + ", n: " + str(n) + ", μ: " + str(
            np.round(centerMean, decimals=3)) + ", σ: " + str(
                np.round(varsum, decimals=3)) + ", σ²: " + str(
                    np.round(varsum**2, decimals=3))
        t = Title()
        t.text = titleString
        p.title = t
        p.line(range(0, len(values)), values, line_width=2)
        plots.append(p)

    # Get plot codes
    script, div = components(
        gridplot(plots, ncols=3, plot_width=350, plot_height=300))

    return render_template("researcher/clusters.html",
                           the_div=div,
                           the_script=script,
                           vid_dict=vid_dict,
                           currentVideo=currentVideo,
                           currentCluster=n_clusters,
                           clustervals=clustervals,
                           variable_list=variable_list,
                           currentVariable=currentVariable)
Esempio n. 21
0
 # making an DataFrame to store words in column names and dates in indexes.
 tfidf_monthly_dataframe = pd.DataFrame(
     tfidf_avg_monthly.toarray(),
     columns=vocabulary["word"],
     index=pd.to_datetime({
         "year": months_grouped.year,
         "month": months_grouped.month,
         "day": 1
     }),
 )
 # time series - each row (word) is one time series
 # each time series is an array of 72 months.
 time_series = to_time_series(tfidf_monthly_dataframe.values.transpose())
 N_clusters = 7
 model = TimeSeriesKMeans(N_clusters)
 vocabulary["cluster"] = model.fit_predict(time_series)
 # mapping cluster numbers to colors
 colors = pd.DataFrame(pl.cm.jet(np.linspace(0, 1, N_clusters)))
 vocabulary.sort_values(["cluster", "relevance"],
                        inplace=True,
                        ascending=False)
 # getting the most relevant words for each topic
 topics = (vocabulary[["cluster", "word"]].groupby("cluster").agg({
     "word":
     lambda words: ", ".join(words[:15]),
 })).reset_index().rename({'Index': 'cluster'})
 clusters_centers = pd.DataFrame(
     model.cluster_centers_.reshape((N_clusters, -1)).transpose(),
     columns=topics["word"],
     index=pd.to_datetime({
         "year": months_grouped.year,
Esempio n. 22
0
else:
    pngfile = "ts8plot_fgrp" + str(
        args.fgrp) + "_" + args.start_month + "_" + str(args.months) + "m.png"
if os.path.exists(pngfile):
    os.remove(pngfile)
fig.savefig(pngfile, dpi=300)
print("Saved", pngfile)

if args.info:
    plt.show(block=True)
    sys.exit(1)

# ----

km = TimeSeriesKMeans(n_clusters=num, verbose=True, random_state=seed)
y_pred = km.fit_predict(X)
print(y_pred)

plt.figure(figsize=(8, 2 * num))
for yi in range(num):
    plt.subplot(num, 1, yi + 1)
    for xx in X[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    #plt.xlim(0, sz)
    #plt.ylim(0, 4)
    if yi == 0:
        plt.title("Euclidean $k$-means")
plt.tight_layout()

# ----
Esempio n. 23
0
newarray = np.dstack(data)
print(newarray.shape)
# To get the shape to be Nx10x10, you could  use rollaxis:
newarray = np.rollaxis(newarray, -1)
print(newarray.shape)
seed = 0
# Keep only the first 280 time series
X_train = TimeSeriesScalerMeanVariance().fit_transform(newarray[:280])
# Make time series shorter
#X_train = TimeSeriesResampler(sz=40).fit_transform(X_train)
sz = X_train.shape[1]

# Euclidean k-means
print("Euclidean k-means")
km = TimeSeriesKMeans(n_clusters=4, verbose=True, random_state=seed)
y_pred = km.fit_predict(X_train)

plt.figure()
for yi in range(4):
    #plt.subplot(2, 2, yi + 1)
    for xx in X_train[y_pred == yi]:
        plt.subplot(2, 2, yi + 1)
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    # plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
    #          transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("Euclidean $k$-means")
Esempio n. 24
0
# Set up a K range to iterate through for DTW-KMeans model
K_range = np.arange(args.Kmin, args.Kmax, args.stepsize)
# Three ways to measure the performance of the DTW-KMeans model
Sum_of_squared_distances = []
ch_indexs = []
silhouette_scores = []
for n_clusters in K_range:
    # soft-DTW-Kmeans
    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility
    clusterer = TimeSeriesKMeans(n_clusters=n_clusters,
                                 metric="softdtw",
                                 metric_params={"gamma": .01},
                                 verbose=False,
                                 random_state=10)
    cluster_labels = clusterer.fit_predict(gene_expression_matrix)
    print('The Shape of Cluster Centers are {}'.format(
        clusterer.cluster_centers_.shape))
    # The squared distance for Elbow Method
    # Select optimal number of clusters by fitting the model
    # with a range of K values
    Sum_of_squared_distances.append(clusterer.inertia_)
    print("For n_clusters =", n_clusters, "The sum of squared distance is :",
          clusterer.inertia_)
    #Compute the Calinski and Harabasz score.
    #This gives ratio between the within-cluster dispersion and the
    # between-cluster dispersion.
    ch_score = calinski_harabasz_score(gene_expression_matrix, cluster_labels)
    ch_indexs.append(ch_score)
    print("For n_clusters =", n_clusters, "The calinski_harabasz_score is :",
          ch_score)
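    # The silhouette_scores list initialised above is not filled within this excerpt.
    # A hedged sketch of how the third metric could be computed per k (an assumption,
    # not necessarily the original code; it presumes gene_expression_matrix is a 2-D
    # array and sklearn.metrics.silhouette_score is imported):
    sil = silhouette_score(gene_expression_matrix, cluster_labels)
    silhouette_scores.append(sil)
    print("For n_clusters =", n_clusters, "The silhouette score is :", sil)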
Esempio n. 25
0
st.set_option('deprecation.showPyplotGlobalUse', False)

data, ts_data = joblib.load("data/data.job")

location = st.sidebar.selectbox('Location:', ts_data['Location'].unique())

clusters = st.sidebar.slider('Clusters:', 2, 6)

ts = to_time_series_dataset(
    ts_data[ts_data.Location == location].TimeSeries.values)

st.subheader(f"Location: {location}, Devices: {len(ts)}, Clusters: {clusters}")
st.text("")

km = TimeSeriesKMeans(n_clusters=clusters, metric="dtw", n_jobs=7)
labels = km.fit_predict(ts)

df = ts_data[ts_data.Location == location].copy()
df['Cluster'] = labels.T

for cluster in np.sort(np.unique(labels)):
    cdf = df[df.Cluster == cluster]
    for k, s in cdf.TimeSeries.items():
        s = pd.DataFrame(s, columns=['CPU Idle'])
        s['Hour'] = s.index
        sns.lineplot(data=s, x="Hour", y="CPU Idle", alpha=0.1)
    sns.lineplot(data=km.cluster_centers_[cluster], legend=False)
    plt.title(f"Cluster: {cluster}", size=12)
    st.pyplot()
    # st.dataframe(cdf[['Device', 'Day']].reset_index(drop=True), width=500)
ytrain = pickle.load(open(data_path + "training_labels.pck","rb"))

# x_train = TimeSeriesScalerMinMax().fit_transform(xtrain[:260]) #shapes comparison
x_train = TimeSeriesScalerMeanVariance().fit_transform(xtrain[:500]) #variance comparison
x_train = TimeSeriesResampler(sz=500).fit_transform(x_train)
sz = x_train.shape[1]

print("DBA k-means")
dba_km = TimeSeriesKMeans(n_clusters=10,
                          n_init=1,
                          metric="dtw",
                          verbose=True,
                          max_iter_barycenter=10,
                          random_state=seed)

y_pred = dba_km.fit_predict(x_train)

plt.figure()
for yi in range(10):
    plt.subplot(10, 1, yi+1)
    for xx in x_train[y_pred == yi]:
        plt.plot(xx.ravel(), "k-", alpha=.2)
    plt.plot(dba_km.cluster_centers_[yi].ravel(), "r-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.text(0.55, 0.85,'Cluster %d' % (yi + 1),
             transform=plt.gca().transAxes)
    if yi == 1:
        plt.title("DBA $k$-means")
plt.show()
Esempio n. 27
0
def main(args):

    data_dir = './Data/User Categorization/'

    if args.method == 'K':
        print('Working on K-means clustering')
        ts_dataset = []

        #Only take the first 500 unique ID's
        n_samples = 500

        for i in range(n_samples):
            csv_file = pd.read_csv(data_dir + str(i) + '.csv')
            time_series_df = csv_file[(~csv_file['f_1'].isnull())
                                      & (~csv_file['f_2'].isnull())]
            time_series_seq = list(time_series_df[['f_1', 'f_2',
                                                   'f_3']].values)
            ts_dataset.append(time_series_seq)

        #Preparing Time-series dataset
        formatted_dataset = to_time_series_dataset(ts_dataset)

        silhouette_scores = []
        n_clusters = [2, 3, 4, 5, 6]

        for cluster in n_clusters:
            km = TimeSeriesKMeans(n_clusters=cluster,
                                  metric="dtw",
                                  verbose=True,
                                  max_iter=5)
            y_pred = km.fit_predict(formatted_dataset)
            s_score = silhouette_score(formatted_dataset, y_pred, metric="dtw")
            silhouette_scores.append(s_score)

        sns.lineplot(x=n_clusters, y=silhouette_scores, sort=False)

        #Optimal clusters
        km = TimeSeriesKMeans(n_clusters=2,
                              metric="dtw",
                              verbose=True,
                              max_iter=5)
        y_pred = km.fit_predict(formatted_dataset)
        df = pd.DataFrame(data=y_pred, columns=['Cluster No.'])
        df.to_csv('./kmeans_clustering.csv', index=False)

        #Visualise Clusters
        sz = formatted_dataset.shape[1]
        plt.figure(figsize=(20, 20))

        for yi in range(2):
            plt.subplot(3, 3, 2 + yi)
            for xx in formatted_dataset[y_pred == yi]:
                plt.plot(xx.ravel(), "k-", alpha=.2)
            plt.plot(km.cluster_centers_[yi].ravel(), "r-")
            plt.xlim(0, sz)
            plt.ylim(-500000, 500000)
            plt.text(0.55,
                     0.85,
                     'Cluster %d' % (yi + 1),
                     transform=plt.gca().transAxes)
            if yi == 1:
                plt.title("DTW $k$-means")
        plt.tight_layout()
        plt.show()

    elif args.method == 'H':
        #Hierarchical clustering
        print('Working on Hierarchical clustering')
        #Build distance matrix
        manual_dist_matrix = True
        n_samples = 500

        if manual_dist_matrix == False:
            distance_matrix = np.zeros(shape=(n_samples, n_samples))

            for i in range(n_samples):
                for j in range(n_samples):
                    sequence_1_df = pd.read_csv('./Data/User Categorization/' +
                                                str(i) + '.csv')
                    sequence_2_df = pd.read_csv('./Data/User Categorization/' +
                                                str(j) + '.csv')

                    seq_1 = sequence_1_df[(~sequence_1_df['f_1'].isnull())
                                          & (~sequence_1_df['f_2'].isnull())]
                    seq_2 = sequence_2_df[(~sequence_2_df['f_1'].isnull())
                                          & (~sequence_2_df['f_2'].isnull())]

                    x = seq_1[['f_1', 'f_2', 'f_3']].values
                    y = seq_2[['f_1', 'f_2', 'f_3']].values

                    distance, path = fastdtw(x, y, dist=euclidean)

                    if i != j:
                        distance_matrix[i, j] = distance

            savetxt('distance_matrix.csv', distance_matrix, delimiter=',')

        distance_matrix = np.genfromtxt('distance_matrix.csv', delimiter=',')
        linkage_matrix = hierarchical_clustering(distance_matrix)

        # select maximum number of clusters
        cluster_labels = fcluster(linkage_matrix, 4, criterion='maxclust')
        print(np.unique(cluster_labels))

        categorization_df = []
        files_list = os.listdir('./Data/User Categorization')

        for files in files_list:
            csv_file = pd.read_csv('./Data/User Categorization/' + str(files))
            unique_id = files[:-4]
            csv_file['ID'] = unique_id
            categorization_df.append(csv_file)

        df = pd.concat(categorization_df, axis=0, ignore_index=True)

        #filter out null values
        filtered_df = df[(~df['f_1'].isnull()) & (~df['f_2'].isnull())]

        df_vis = filtered_df.sort_values(by='ID')
        df_vis['ID'] = df_vis['ID'].astype('int')
        df_vis = df_vis[df_vis['ID'] <= 499].sort_values(by='ID').reset_index(
            drop=True)
        df_vis_fil = df_vis.groupby('ID')[['f_1', 'f_2',
                                           'f_3']].mean().reset_index()
        df_vis_fil['Cluster'] = cluster_labels
        df_vis_fil.to_csv('./hier_clustering.csv', index=False)

        #Plotting Visualisation 3D scatterplot
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')

        x = np.array(df_vis_fil['f_1'])
        y = np.array(df_vis_fil['f_2'])
        z = np.array(df_vis_fil['f_3'])

        ax.scatter(x, y, z, marker="s", c=df_vis_fil["Cluster"], cmap="RdBu")

        plt.show()

    else:
        print('Please input K or H clustering method correctly')
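# hierarchical_clustering() used above is not defined in this excerpt; a minimal
# sketch of what such a helper could look like (an assumption, not the original
# implementation), built on SciPy's linkage over the condensed distance matrix:
def hierarchical_clustering_sketch(dist_matrix, method='complete'):
    from scipy.cluster.hierarchy import linkage
    from scipy.spatial.distance import squareform
    condensed = squareform(dist_matrix, checks=False)  # square matrix -> condensed form
    return linkage(condensed, method=method)           # linkage matrix usable by fcluster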
Esempio n. 28
0
for i in appliance_id_list:
    sub_df = df[df['appliance_id'] == i].iloc[:, 2:]
    num = sub_df.shape[0]
    #if num<=10 ---k=1
    #if num>10 ---k=2
    if (num <= 10):
        k = 1
    else:
        k = 2
    ts_array = sub_df.values
    ts_scaled = TimeSeriesScalerMeanVariance().fit_transform(ts_array)
    km = TimeSeriesKMeans(n_clusters=k,
                          metric="dtw",
                          verbose=True,
                          random_state=0)
    y_pred = km.fit_predict(ts_scaled)
    #     n=np.argmax(np.bincount(y_pred))

    ts_array_all.append(km.cluster_centers_.ravel())

# After finding each appliance's cluster centre, cluster those centres once more
ts_array_all = np.array(ts_array_all)
ts_scaled = TimeSeriesScalerMeanVariance().fit_transform(ts_array_all)
km = TimeSeriesKMeans(n_clusters=6, metric="dtw", verbose=True, random_state=0)
y_pred = km.fit_predict(ts_scaled)
# plotting section
sample_size = 10000  # number of rows in ts_array_all


def get_proportion(y_pred):
    df_y = pd.DataFrame(y_pred, columns=['y'])
    (first_time_series_test, second_time_series_test))
multivariate_time_series_test = to_time_series(multivariate_test)
print(multivariate_time_series_test.shape)

#clustering
from tslearn.clustering import TimeSeriesKMeans, KernelKMeans, silhouette_score
#fit the algorithm on train data
#tune the hyperparameters; possible metrics: euclidean, dtw, softdtw
km_dba = TimeSeriesKMeans(n_clusters=2,
                          metric="softdtw",
                          max_iter=5,
                          max_iter_barycenter=5,
                          random_state=0).fit(multivariate_time_series_train)
km_dba.cluster_centers_.shape
#prediction on train data (the model was already fitted above, so use predict)
prediction_train = km_dba.predict(multivariate_time_series_train)
len(prediction_train)
#prediction on test data (re-running fit_predict here would refit on the test set)
prediction_test = km_dba.predict(multivariate_time_series_test)
len(prediction_test)
prediction_test

#silhouette score of the clustering on the train data
silhouette_score(multivariate_time_series_train,
                 prediction_train,
                 metric="softdtw")
#silhouette score of the clustering on the test data
silhouette_score(multivariate_time_series_test,
                 prediction_test,
                 metric="softdtw")
Esempio n. 30
0
    def GetClustersMonthlyAvg(self,
                              sites,
                              variableCode,
                              n_cluster=3,
                              methodCode=None,
                              qualityControlLevelCode=None,
                              timeUTC=False):
        """
        Gets "n" number of clusters using dtw time series interpolation for a given variable

        Args:

            sites: response from the GetSites() function. Performance of the function can be improved if the result of the GetSitesByVariable() function is passed instead.
            variableCode: string representing the variable code for the time series clusters of the given sites.
            n_cluster: integer representing the number of clusters to form.
            methodCode: method code for data extraction for the given variable.
            qualityControlLevelCode: The ID of the quality control level. Typically 0 is used for raw data and 1 is used for quality controlled data.
                To get a list of possible quality control level IDs, see the qualityControlLevelCode column in the output of GetSiteInfo(). If qualityControlLevelCode is not specified,
                then the observations in the output data.frame won't be filtered by quality control level code.
            timeUTC: Boolean to use the UTC time instead of the time of the observation.
        Returns:

            An array of arrays of the following structure [monthly averages array, cluster_id]

            [[[0.141875, 0.1249375, 0.0795, 0.12725, 0.0877, 0.0, 0.09375, 0.1815, 0.15437499999999998, 0.164625, 0.1614, 0.20900000000000002], 1],
            [[0.1, 0.08662500000000001, 0.0414025, 0.048, 0.052, 0.0, 0.1105, 0.015, 0.06625, 0.10587500000000001, 0.0505, 0.046125], 0],
            [[0.2265, 0.27225, 0.17407499999999998, 0.13475, 0.14525, 0.129, 0.17825, 0.210625, 0.103125, 0.0, 0.23675], 2]]
        Example::

            url_testing = "http://hydroportal.cuahsi.org/para_la_naturaleza/cuahsi_1_1.asmx?WSDL"
            water = WaterMLOperations(url = url_testing)
            sites = water.GetSites()
            firstSiteFullSiteCode = sites[0]['fullSiteCode']
            siteInfo = water.GetSiteInfo(firstSiteFullSiteCode)['siteInfo']
            clusters = water.GetClustersMonthlyAvg(sites,siteInfo[0]['variableCode'])
        """
        timeseries = []
        timeSerie_cluster = []
        try:
            for site in sites:
                # site_full_code = f'{site["network"]}:{site["sitecode"]}'
                site_full_code = site['fullSiteCode']
                try:
                    siteInfo = self.GetSiteInfo(site_full_code)['siteInfo']
                    for sinfo in siteInfo:
                        if sinfo['variableCode'] == variableCode:
                            variable_full_code = sinfo['fullVariableCode']
                            start_date = sinfo['beginDateTime'].split('T')[0]
                            end_date = sinfo['endDateTime'].split('T')[0]

                            if timeUTC is True:
                                start_date = sinfo['beginDateTimeUTC'].split(
                                    'T')[0]
                                end_date = sinfo['endDateTimeUTC'].split(
                                    'T')[0]
                            variableResponse = self.GetValues(
                                site_full_code,
                                variable_full_code,
                                start_date,
                                end_date,
                                methodCode=methodCode,
                                qualityControlLevelCode=qualityControlLevelCode
                            )
                            m_avg = self.GetMonthlyAverage(variableResponse)
                            timeseries.append(to_time_series(m_avg))
                            timeSerie_cluster.append([m_avg])
                            break
                except Exception as e:
                    print(e)
                    print("the current site does not contain siteInformation")
            formatted_time_series = to_time_series_dataset(timeseries)
            model = TimeSeriesKMeans(n_clusters=n_cluster,
                                     metric="dtw",
                                     max_iter=10)
            y_pred = model.fit_predict(formatted_time_series)
            for tc, y in zip(timeSerie_cluster, y_pred):
                tc.append(y)
            return timeSerie_cluster
        except KeyError as e:
            # print(e)
            return timeSerie_cluster
        return timeSerie_cluster