def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False
    def _func(self, data: np.ndarray, img_info_path: str,
              roi_state: dict) -> np.ndarray:
        if 'raw_min_max' in roi_state.keys():
            raw_min_max = roi_state['raw_min_max']

        else:
            cnmf_idx = roi_state['cnmf_idx']
            img_info_path = os.path.join(self.proj_path, img_info_path)
            roi_states = pickle.load(open(img_info_path, 'rb'))['roi_states']

            idx_components = roi_states['cnmf_output']['idx_components']

            list_ix = np.argwhere(idx_components == cnmf_idx).ravel().item()

            state = roi_states['states'][list_ix]

            if not state['cnmf_idx'] == cnmf_idx:
                raise ValueError(
                    'cnmf_idx from ImgInfoPath dict and DataFrame ROI_State dict do not match.'
                )

            raw_min_max = state['raw_min_max']

        raw_min = raw_min_max['raw_min'][self.option]
        raw_max = raw_min_max['raw_max'][self.option]

        if raw_min >= raw_max:
            self.excluded += 1
            return np.NaN

        return TimeSeriesScalerMinMax(
            value_range=(raw_min, raw_max)).fit_transform(data).ravel()
def tsclusteringN(ts_data, names):
    # Clustering

    # Normalize
    ts_dataset = TimeSeriesScalerMinMax().fit_transform(ts_data)

    metric = 'dtw'
    n_clusters = [n for n in range(2, 6)]
    for n in n_clusters:
        print('Number of clusters =', n)

        # If metric is "dtw" or "softdtw", time series with different numbers of points are also OK
        km = TimeSeriesKMeans(n_clusters=n,
                              metric=metric,
                              verbose=False,
                              random_state=1).fit(ts_dataset)

        # Clustering result
        print('Clustering result =', km.labels_)

        # Values range from -1 to 1. The optimal number of clusters is the one whose
        # silhouette value is close to 1 and whose silhouette plot shows the smallest
        # spread in cluster widths.
        # Here we only check the silhouette value
        print('Silhouette score =',
              silhouette_score(ts_dataset, km.labels_, metric=metric))
        print()
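
# Hedged extension of tsclusteringN (not in the original): instead of only printing the
# silhouette values, collect them per candidate k and keep the best-scoring one, e.g.
#     sil_scores = {}
#     for n in range(2, 6):
#         km = TimeSeriesKMeans(n_clusters=n, metric='dtw', random_state=1).fit(ts_dataset)
#         sil_scores[n] = silhouette_score(ts_dataset, km.labels_, metric='dtw')
#     best_k = max(sil_scores, key=sil_scores.get)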
    def __ScaleData(self, input_data):
        ''' 
            scale input data to range [0,1]

            parameters:
                input_data        : input data to rescale
        '''

        return TimeSeriesScalerMinMax().fit_transform(input_data)
def dataImport(name):

    if not os.path.exists("../Classifier/TimeSeriesFiles/" + name):
        url = "http://www.timeseriesclassification.com/Downloads/%s.zip" % name
        extract_from_zip_url(url,
                             "../Classifier/TimeSeriesFiles/" + name + "/",
                             verbose=False)

    data_train = numpy.loadtxt("../Classifier/TimeSeriesFiles/" + name + "/" +
                               name + "_TRAIN.txt")
    data_test = numpy.loadtxt("../Classifier/TimeSeriesFiles/" + name + "/" +
                              name + "_TEST.txt")
    X_train = to_time_series_dataset(data_train[:, 1:])
    y_train = data_train[:, 0].astype(int)  # the numpy.int alias is deprecated; use builtin int
    X_test = to_time_series_dataset(data_test[:, 1:])
    y_test = data_test[:, 0].astype(int)
    X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
    X_test = TimeSeriesScalerMinMax().fit_transform(X_test)
    return X_train, y_train, X_test, y_test
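
# Hedged usage sketch (hypothetical dataset name, not from the original source):
#     X_train, y_train, X_test, y_test = dataImport("GunPoint")
#     print(X_train.shape, y_train.shape)   # (n_series, series_length, 1) and (n_series,)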
    def __generateRefPrice(self, curPrice, seedPrice, priceRange):
        priceMin = min(curPrice, seedPrice / 1.05 * (1 + numpy.random.uniform(-priceRange * 0.1, priceRange * 0.4)))
        priceMax = max(curPrice, seedPrice * 1.05 * (1 + numpy.random.uniform(-priceRange * 0.4, priceRange * 0.1)))
        data_len = numpy.random.randint(10000, 30000)

        # assert curPrice>=priceMin and curPrice<=priceMax,f"error: {curPrice}, {priceMin}, {priceMax}"
        def smooth_data(data):
            x = numpy.arange(0, len(data), 1)
            x_new = numpy.arange(0, max(x), 0.01)
            func = interpolate.interp1d(x, data, kind='quadratic')
            smoothed = func(x_new)
            return smoothed

        while True:
            dataset = random_walks(n_ts=1, sz=data_len * 2)
            scaler = TimeSeriesScalerMinMax(value_range=(float(priceMin), float(priceMax)))  # value_range replaces the old min=/max= arguments
            dataset_scaled = scaler.fit_transform(dataset)[0, :, 0]
            for i in range(0, data_len):
                if abs(dataset_scaled[i] - curPrice) / curPrice < 0.001:
                    # return list(smooth_data(dataset_scaled[i:i+data_len]))
                    with open('price.txt', 'w+') as f:
                        f.writelines([f'{p}\n' for p in dataset_scaled[i:i + data_len]])
                    return list(dataset_scaled[i:i + data_len])
    def _preprocess_series(self, X):
        if self.scale:
            X = TimeSeriesScalerMinMax().fit_transform(X)
        else:
            X = to_time_series_dataset(X)
        if self.max_size is not None and self.max_size != X.shape[1]:
            if X.shape[1] > self.max_size:
                raise ValueError("Cannot feed model with series of length {} "
                                 "max_size is {}".format(
                                     X.shape[1], self.max_size))
            X_ = numpy.zeros((X.shape[0], self.max_size, X.shape[2]))
            X_[:, :X.shape[1]] = X
            X_[:, X.shape[1]:] = numpy.nan
            return X_
        else:
            return X
def tsclustering(ts_data, names):
    # Normalize
    ts_dataset = TimeSeriesScalerMinMax().fit_transform(ts_data)

    n_clusters = 2
    metric = 'dtw'

    # If metric is "dtw" or "softdtw", time series with different numbers of points are also OK
    km = TimeSeriesKMeans(n_clusters=n_clusters,
                          metric=metric,
                          verbose=False,
                          random_state=1).fit(ts_dataset)

    # Clustering result
    print('Clustering result =', km.labels_)

    plot_clustering(km, ts_dataset, names, n_clusters)
    # get current working directory
    working_dir_path = Path.cwd()
    sys.path.append(str(working_dir_path))

    # Load the dataset
    raw_data = pd.read_csv(os_path.join(working_dir_path,
                                        "./data/train_curves.csv"),
                           header=None)
    time_series_train = to_time_series_dataset(raw_data)

    labels_train = genfromtxt(os_path.join(
        working_dir_path, "./data/train_clustering_result.csv"),
                              delimiter=',')

    # Normalize the time series
    time_series_train = TimeSeriesScalerMinMax().fit_transform(
        time_series_train)

    # Get dimensions of the dataset
    n_time_series, time_series_size = time_series_train.shape[:2]
    n_classes = len(set(labels_train))

    # We will extract 2 shapelets and align them with the time series
    shapelet_sizes = {10: 2}

    # Define the model
    shapelet_classification_model = LearningShapelets(
        n_shapelets_per_size=shapelet_sizes,
        weight_regularizer=0.0001,
        optimizer=Adam(lr=0.01),
        max_iter=300,
        verbose=1,
def k_means_clustering(sd_log):
    """
    k_means clustering of all features using dtw for multivariate time series
    :param sd_log: sd_log object
    :return: cluster_metrics_dict: dict with clusters as key and features as values
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    data = sd_log.data
    # TODO handle outliers
    tmp = sd_log.waiting_time
    data.drop(columns=[sd_log.waiting_time], inplace=True)
    X = []
    # Get data as numpy array
    for col in data.columns:
        X.append(sd_log.get_points(col))

    # Normalize the data (y = (x - min) / (max - min))
    data_norm = data.copy()
    for column in data_norm.columns:
        data_norm[column] = (data_norm[column] - data_norm[column].min()) / (
            data_norm[column].max() - data_norm[column].min())

    X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)

    #  Find optimal # clusters by
    #  looping through different configurations for # of clusters and store the respective values for silhouette:
    sil_scores = {}
    for n in range(2, len(data.columns)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X,
                                          model_tst.predict(X),
                                          metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(data.columns, model.labels_)),
                              columns=['metric', 'cluster'])

    # make some helper dictionaries and lists
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()
    cluster_len_dict = df_cluster['cluster'].value_counts().to_dict()
    clusters_dropped = [
        cluster for cluster in cluster_len_dict
        if cluster_len_dict[cluster] == 1
    ]
    clusters_final = [
        cluster for cluster in cluster_len_dict
        if cluster_len_dict[cluster] > 1
    ]

    print('Plotting Clusters')

    fig, axs = plt.subplots(opt_k)  # , figsize=(10, 5))
    # fig.suptitle('Clusters')
    row_i = 0
    # column_j = 0
    # For each label there is,
    # plots every series with that label
    for cluster in cluster_metrics_dict:
        for feat in cluster_metrics_dict[cluster]:
            axs[row_i].plot(data_norm[feat], label=feat, alpha=0.4)
            axs[row_i].legend(loc="best")
        if len(cluster_metrics_dict[cluster]) > 1:
            # draw the cluster mean in red when the cluster holds more than one feature
            tmp = np.nanmean(
                np.vstack([data_norm[feat] for feat in cluster_metrics_dict[cluster]]),
                axis=0)
            axs[row_i].plot(tmp, c="red")
        axs[row_i].set_title("Cluster " + str(cluster))
        row_i += 1
        # column_j += 1
        # if column_j % k == 0:
        #    row_i += 1
        #    column_j = 0
    plt.show()

    # return dict {cluster_id: features}
    return cluster_metrics_dict
def main(argv):
    # define global timer to obtain global execution time
    start_global = timer()
    
    # define globals variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data
    
    #############################################################################################
    # Input arguments parsing
    #############################################################################################
    
    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling) \n' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        '  -c / --clusters <number_clusters>  # set number of clusters (default 3) \n\n' \
        '  -i / --ifile <input_file>          # set input filename \n' \
        '  -n / --normalise                   # normalise input data \n' \
        '  -s / --standardise                 # standardise input data \n\n' \
        '  -a / --all                         # perform all 5 implemented methods of clustering: \n' \
        '                                       euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        '  -E / --euclidean                   # perform euclidean k-means clustering \n' \
        '  -D / --dtw                         # perform dtw k-means clustering \n' \
        '  -S / --soft-dtw                    # perform soft-dtw k-means clustering \n' \
        '  -K / --k-shape                     # perform k-shape clustering \n' \
        '  -G / --gak                         # perform GAK k-means clustering \n'
    
    # Create new object to save arguments
    i_args = Arguments()
    
    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3
    
    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help",
                "clusters=",
                "ifile=",
                "normalise",
                "standardise",
                "all",
                "euclidean",
                "dtw",
                "soft-dtw",
                "k-shape",
                "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)
    
    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            i_args.number_clusters = int(arg)
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True
    
    # cap the number of subplot rows at 8
    n_rows_plot = min(n_rows_plot, 8)
    
    #############################################################################################
    # Raw data processing stage
    #############################################################################################
    
    # set style to matplotlib plot
    mpl.style.use('seaborn')
    
    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)
    
    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())
    
    # convert raw data to the format which can be used by tslearn
    # (3-d dimensional array)
    # BUILT functionality: adjust all time series to one size
    # (NaN values are appended to the shorter ones)
    formatted_data = to_time_series_dataset(raw_data)
    
    # print shape of new array
    print(formatted_data.shape)
    
    # obtain the number of measurements (time steps) per series
    n_measuring = formatted_data.shape[1]
    
    # define figure, grid_spec to create layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(
        n_rows_plot,
        i_args.number_clusters
    )
    
    # set A4 size to figure
    fig.set_size_inches(8.5, 11.75)
    
    # setup count of layers of subplots
    count_layer = 3
    # setup first subplot and draw raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])
    
    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)
    
    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()
    # draw title for chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)' %(formatted_data_min, formatted_data_max))

    # obtain and print the execution time of the data processing stage
    timer_tick = get_time_tick(start_global)
    plt.ion()
    plt.show()
    
    print("Raw data processing time: %s" % timer_tick)
    
    #############################################################################################
    # Data preprocessing stage
    #############################################################################################
    
    start = timer()
    
    # Convert NaNs to value predicted by interpolation
    # linearly interpolate for NaN/NaNs
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)
    
    # Scaling
    # to decide whether we should use normalisation or standardisation, we need to
    # look at the distribution of values.
    
    # pick a few random measurements (one per histogram column) to draw histograms
    random_indexes = numpy.random.choice(n_measuring, i_args.number_clusters, replace=False)
    
    # create new arrays with values of randomly chosen measurements
    histogram_data = formatted_data[:, random_indexes]
    
    # draw histograms
    for i_histogram in range(i_args.number_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram],
            bins=25, density=True
        )
        
        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy"
                            )
        if i_histogram == 1:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"

            preprocessing = '' if preprocessing == '' else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy', y=1, pad=14
            )
    
    # if no preprocessing option is chosen, continue with the raw data
    processed_data = formatted_data
    
    # since for this particular challenge data the distributions are roughly
    # Gaussian/Normal, we can use standardisation
    
    # normalize data: Min-Max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")
    
    # standardize data: scaling technique where the values are centered around
    # the mean with a unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")
    
    # obtain max value of data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2
    
    timer_tick = get_time_tick(start)
    print("#############################################################################################")
    print("Data processing stage elapsed time: %s" % timer_tick)
    
    #############################################################################################
    # Implementing Euclidean k-means clustering algorithm
    #############################################################################################
    
    if i_args.euclidean_clustering:
        
        start = timer()
        print("Euclidean k-means")
        
        # define parameters of the model of the algorithm
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=i_args.number_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )
        
        # calculate cluster's label array
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)
        
        # draw subplots with attributed clusters of time series as well as
        # cluster centers' lines
        for i_cluster in range(i_args.number_clusters):
            f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                n_measuring, min_data, max_data,
                                                processed_data, euclidean_clustered_data, 'tab:blue')
            
            f_ax_euclidean.plot(
                k_means_euclidean.cluster_centers_[i_cluster].ravel(),
                "tab:green"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_euclidean
        
        # increment count of filled layer of subplots
        count_layer += 1
        
        # obtain processing time, print it to console and
        # add it to the title of the series of subplots
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Euclidean $k$-means (%s)" % timer_tick,
            color='tab:green', y=1, pad=14
        )
        print("#############################################################################################")
        print("Euclidean k-means time processing: %s" % timer_tick)
        
    #############################################################################################
    # Implementing DTW k-means clustering algorithm
    # use dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################
    
    if i_args.dtw_clustering:
        
        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6
                                       )
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                          n_measuring, min_data, max_data,
                                          processed_data, dtw_clustered_data, 'tab:blue')
            
            f_ax_dtw.plot(
                k_means_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:red"
            )
            if i_cluster == 1:
                middle_axis = f_ax_dtw

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "DTW $k$-means (%s)" % timer_tick,
            color='tab:red', y=1, pad=14
        )
        print("#############################################################################################")
        print("DTW k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing soft DTW k-means clustering algorithm
    # use soft dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    #############################################################################################
    
    if i_args.soft_dtw_clustering:
        
        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6
                                            )
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                               n_measuring, min_data, max_data,
                                               processed_data, soft_dtw_clustered_data, 'tab:blue')
            
            f_ax_soft_dtw.plot(
                k_means_soft_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:purple"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_soft_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Soft-DTW $k$-means (%s)" % timer_tick,
            color='tab:purple', y=1, pad=14
        )
        print("#############################################################################################")
        print("Soft-DTW k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing k-Shape clustering algorithm
    #############################################################################################
    
    if i_args.k_shape_clustering:
        
        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=i_args.number_clusters,
                         verbose=True,
                         random_state=seed
                         )
        k_shape_clustered_data = k_shape.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            
            min_axe_value = min(min_data, k_shape.cluster_centers_[i_cluster].ravel().min())
            max_axe_value = max(max_data, k_shape.cluster_centers_[i_cluster].ravel().max())
            
            f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                              n_measuring, min_axe_value, max_axe_value,
                                              processed_data, k_shape_clustered_data, 'tab:blue')
            
            f_ax_k_shape.plot(
                k_shape.cluster_centers_[i_cluster].ravel(),
                "tab:orange"
            )
            
            if i_cluster == 1:
                middle_axis = f_ax_k_shape

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "$K$-Shape (%s)" % timer_tick,
            color='tab:orange', y=1, pad=14
        )
        print("#############################################################################################")
        print("K-Shape time processing: %s" % timer_tick)
    
    #############################################################################################
    # Implementing Global Alignment kernel k-means clustering algorithm
    # since kernel is used, there is no centroid of the cluster
    #############################################################################################
    
    if i_args.gak_clustering:
        
        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6
                                   )
        
        gak_clustered_data = gak_k_means.fit_predict(processed_data)
        
        for i_cluster in range(i_args.number_clusters):
            f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer, i_cluster,
                                                  n_measuring, min_data, max_data,
                                                  processed_data, gak_clustered_data, 'tab:blue')
            
            if i_cluster == 1:
                middle_axis = f_ax_gak_k_means

        # increment count of filled layer of subplots
        count_layer += 1
        
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Global Alignment kernel $k$-means (%s)" % timer_tick,
            color='tab:cyan', y=1, pad=14)
        print("#############################################################################################")
        print("GAK k-means time processing: %s" % timer_tick)
    
    #############################################################################################
    
    # return string with current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    # define the name of the directory to be created
    path = "./out/%s" % now

    print("#############################################################################################")
    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)
    
    try:
        # save figure as pdf to out folder
        fig.savefig("./out/%s/visual_result.pdf" % now)
    
        # save clustering results
        if i_args.euclidean_clustering:
            numpy.savetxt(
                "./out/%s/euclidean_clustering_result.csv" % now,
                euclidean_clustered_data,
                delimiter=","
            )
        if i_args.dtw_clustering:
            numpy.savetxt(
                "./out/%s/dtw_clustering_result.csv" % now,
                dtw_clustered_data,
                delimiter=","
            )
        if i_args.soft_dtw_clustering:
            numpy.savetxt(
                "./out/%s/soft_dtw_clustering_result.csv" % now,
                soft_dtw_clustered_data,
                delimiter=","
            )
        if i_args.k_shape_clustering:
            numpy.savetxt(
                "./out/%s/k_shape_clustering_result.csv" % now,
                k_shape_clustered_data,
                delimiter=","
            )
        if i_args.gak_clustering:
            numpy.savetxt(
                "./out/%s/gak_clustering_result.csv" % now,
                gak_clustered_data,
                delimiter=","
            )
    except RuntimeError:
        print("Saving results failed")
    else:
        print("Successfully saved results in the path %s " % path)

    #############################################################################################
    
    # obtain and print global executing time
    timer_tick = get_time_tick(start_global)
    print("#############################################################################################")
    print("All algorithms elapsed time: % s" % timer_tick)
    
    #############################################################################################

    # render and show plot
    # plt.show()
    plt.draw()
    plt.pause(0.001)
    input("Press [enter] to finish.")
    print("#############################################################################################")
def normalize(ts, ts_err):
    ts /= (ts_err + 1)  # +1 to avoid zero division
    ts = np.nan_to_num(ts)
    ts = TimeSeriesScalerMinMax().fit_transform(ts)
    return ts
#        if data[i]<minVal:
#           data[i] = minVal
#        elif data[i]>maxVal:
#           data[i] = maxVal
#

# Convert to tslearn format
import time
data = ReadDataFromFile()

from tslearn.utils import to_time_series_dataset
formatted_dataset = to_time_series_dataset(list(data.values()))

# Normalize
from tslearn.preprocessing import TimeSeriesScalerMinMax
scaler = TimeSeriesScalerMinMax(value_range=(0., 1.))
storeMinMax = []
for i in range(len(formatted_dataset)):
    ele = formatted_dataset[i]
    ele = ele.reshape(ele.shape[0])
    storeMinMax.append((min(ele), max(ele)))
    formatted_dataset[i] = scaler.fit_transform(ele).reshape(
        formatted_dataset[i].shape[0], formatted_dataset[i].shape[1])

# Perform clustering

## Goal: split the existing time series into several classes, each with one central
## variable; predicting that central variable then lets us predict the other series in
## the class, reducing the overall cost of time series forecasting.
## For now, ignore shifts along the time axis and only assume a fixed linear relation
## in value between each class and its members, so it can be computed directly with a
## linear-relation algorithm.
# T1: partition-based clustering directly on the raw data is not feasible -- far too slow.

# Idea 1: apply ordinary normalisation, then cluster on Euclidean distance / correlation
# with plain K-means / DBSCAN / hierarchical clustering (a minimal sketch follows below).
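
# A minimal, hypothetical sketch of "Idea 1" above (not part of the original file):
# min-max normalise the series, then cluster them with plain Euclidean k-means.
# The names below (demo_data, demo_scaled, km_euclidean) are illustrative only.
from tslearn.generators import random_walks
from tslearn.clustering import TimeSeriesKMeans

demo_data = random_walks(n_ts=20, sz=64, random_state=0)   # stand-in for formatted_dataset
demo_scaled = TimeSeriesScalerMinMax(value_range=(0., 1.)).fit_transform(demo_data)
km_euclidean = TimeSeriesKMeans(n_clusters=3, metric="euclidean", random_state=0)
print(km_euclidean.fit_predict(demo_scaled))   # one cluster label per series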
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.datasets import CachedDatasets

from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

import numpy as np

import matplotlib.pyplot as plt

# Our pipeline consists of two phases. First, data will be normalized using
# min-max normalization. Afterwards, it is fed to a KNN classifier. For the
# KNN classifier, we tune the n_neighbors and weights hyper-parameters.
n_splits = 3
pipeline = GridSearchCV(Pipeline([('normalize', TimeSeriesScalerMinMax()),
                                  ('knn', KNeighborsTimeSeriesClassifier())]),
                        {
                            'knn__n_neighbors': [5, 25],
                            'knn__weights': ['uniform', 'distance']
                        },
                        cv=StratifiedKFold(n_splits=n_splits,
                                           shuffle=True,
                                           random_state=42))

X_train, y_train, _, _ = CachedDatasets().load_dataset("Trace")

# Keep only timeseries of class 1, 2, 3
X_train = X_train[y_train > 0]
y_train = y_train[y_train > 0]
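
# Hedged completion of the pipeline above (assumed usage, not part of the original excerpt):
# fit the grid search on the filtered training data, then inspect the selected
# hyper-parameters; GridSearchCV exposes best_params_ and best_score_ after fitting.
pipeline.fit(X_train, y_train)
print("Best parameters:", pipeline.best_params_)
print("Best cross-validation accuracy: %.3f" % pipeline.best_score_)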
from sklearn.pipeline import Pipeline

from tslearn.generators import random_walk_blobs
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.neighbors import KNeighborsTimeSeriesClassifier, KNeighborsTimeSeries
from tslearn.piecewise import SymbolicAggregateApproximation

numpy.random.seed(0)
n_ts_per_blob, sz, d, n_blobs = 20, 100, 1, 2

# Prepare data
X, y = random_walk_blobs(n_ts_per_blob=n_ts_per_blob,
                         sz=sz,
                         d=d,
                         n_blobs=n_blobs)
scaler = TimeSeriesScalerMinMax(value_range=(0., 1.))  # Rescale time series
X_scaled = scaler.fit_transform(X)

indices_shuffle = numpy.random.permutation(n_ts_per_blob * n_blobs)
X_shuffle = X_scaled[indices_shuffle]
y_shuffle = y[indices_shuffle]

X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
def row_wise_minmax_scaling(x):
    '''
    Takes a 2D array and scales each row to the range of [0.0, 1.0]
    '''
    scaler = TimeSeriesScalerMinMax(value_range=(0.0, 1.0))
    return (scaler.fit_transform(x).squeeze())
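
# Hedged usage sketch (hypothetical input, not from the original source): each row is
# rescaled independently and squeeze() drops the trailing singleton dimension that
# tslearn adds, e.g.
#     >>> row_wise_minmax_scaling(numpy.array([[0., 5., 10.], [2., 4., 6.]]))
#     array([[0. , 0.5, 1. ],
#            [0. , 0.5, 1. ]])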

# Set a seed to ensure determinism
numpy.random.seed(42)

# Load the Trace dataset
X_train, y_train, _, _ = CachedDatasets().load_dataset("Trace")

# Filter out classes 2 and 4
mask = numpy.isin(y_train, [1, 3])
X_train = X_train[mask]
y_train = y_train[mask]

# Normalize the time series
X_train = TimeSeriesScalerMinMax().fit_transform(X_train)

# Get statistics of the dataset
n_ts, ts_sz = X_train.shape[:2]
n_classes = len(set(y_train))

# We will extract 1 shapelet and align it with a time series
shapelet_sizes = {20: 1}

# Define the model and fit it using the training data
shp_clf = LearningShapelets(n_shapelets_per_size=shapelet_sizes,
                            weight_regularizer=0.001,
                            optimizer=Adam(lr=0.01),
                            max_iter=250,
                            verbose=0,
                            scale=False,
from tslearn.generators import random_walk_blobs
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.neighbors import KNeighborsTimeSeriesClassifier, \
    KNeighborsTimeSeries
from tslearn.piecewise import SymbolicAggregateApproximation

numpy.random.seed(0)
n_ts_per_blob, sz, d, n_blobs = 20, 100, 1, 2

# Prepare data
X, y = random_walk_blobs(n_ts_per_blob=n_ts_per_blob,
                         sz=sz,
                         d=d,
                         n_blobs=n_blobs)
scaler = TimeSeriesScalerMinMax(value_range=(0., 1.))  # Rescale time series
X_scaled = scaler.fit_transform(X)

indices_shuffle = numpy.random.permutation(n_ts_per_blob * n_blobs)
X_shuffle = X_scaled[indices_shuffle]
y_shuffle = y[indices_shuffle]

X_train = X_shuffle[:n_ts_per_blob * n_blobs // 2]
X_test = X_shuffle[n_ts_per_blob * n_blobs // 2:]
y_train = y_shuffle[:n_ts_per_blob * n_blobs // 2]
y_test = y_shuffle[n_ts_per_blob * n_blobs // 2:]

# Nearest neighbor search
knn = KNeighborsTimeSeries(n_neighbors=3, metric="dtw")
knn.fit(X_train, y_train)
dists, ind = knn.kneighbors(X_test)
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """
        Primitive that applies the k nearest neighbor classification algorithm to time series data. 
        The tslearn KNeighborsTimeSeriesClassifier implementation is wrapped.
        
        Training inputs: 1) Feature dataframe, 2) Target dataframe
        Outputs: Dataframe with predictions for specific time series at specific future time instances 
    
        Arguments:
            hyperparams {Hyperparams} -- D3M Hyperparameter object
        
        Keyword Arguments:
            random_seed {int} -- random seed (default: {0})
    """

    metadata = metadata_base.PrimitiveMetadata({
        # Simply a UUID generated once and fixed forever. Generated using "uuid.uuid4()".
        "id":
        "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version":
        __version__,
        "name":
        "kanine",
        # Keywords do not have a controlled vocabulary. Authors can put here whatever they find suitable.
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name":
            __author__,
            "contact":
            __contact__,
            "uris": [
                # Unstructured URIs.
                "https://github.com/Yonder-OSS/D3M-Primitives",
            ],
        },
        # A list of dependencies in order. These can be Python packages, system packages, or Docker images.
        # Of course Python packages can also have their own dependencies, but sometimes it is necessary to
        # install a Python package first to be even able to run setup.py of another package. Or you have
        # a dependency which is not on PyPi.
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.14"
            },
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/Yonder-OSS/D3M-Primitives.git@{git_commit}#egg=yonder-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        # The same path the primitive is registered with entry points in setup.py.
        "python_path":
        "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        # Choose these from a controlled vocabulary in the schema. If anything is missing which would
        # best describe the primitive, make a merge request.
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)

        return Params(scaler=self._scaler,
                      classifier=self._knn,
                      output_columns=self._output_columns)

    def set_params(self, *, params: Params) -> None:
        self._scaler = params['scaler']
        self._knn = params['classifier']
        self._output_columns = params['output_columns']
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """ private util function that finds grouping column from input metadata
        
        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame
        
        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """

        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """
        private util function that finds the value column from input metadata

        Arguments:
        input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
        int -- index of column that contains time series value after Time Series Formatter primitive
        """

        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ('https://metadata.datadrivendiscovery.org/types/Attribute', ))
        # this assumes a lot, but time series formatters typically place the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """ Sets primitive's training data

            Arguments:
                inputs {Inputs} -- D3M dataframe containing attributes
                outputs {Outputs} -- D3M dataframe containing targets
        """

        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:,
                                    attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """ Fits KNN model using training data from set_training_data and hyperparameters
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})
            
            Returns:
                CallResult[None]
        """

        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """ Produce primitive's classifications for new time series data

            Arguments:
                inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target
            
            Keyword Arguments:
                timeout {float} -- timeout, not considered (default: {None})
                iterations {int} -- iterations, not considered (default: {None})

            Raises:
                PrimitiveNotFittedError: if primitive not fit

            Returns:
                CallResult[Outputs] -- dataframe with a column containing a predicted class 
                    for each input time series
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=True)
def get_color(weights):
    baselines = numpy.zeros((4, 3))
    weights = numpy.array(weights).reshape(1, 4)
    for i, c in enumerate(["r", "g", "b", "y"]):
        baselines[i] = matplotlib.colors.ColorConverter().to_rgb(c)
    return numpy.dot(weights, baselines).ravel()

numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_out = numpy.empty((4, X_train.shape[1], X_train.shape[2]))

plt.figure()
for i in range(4):
    X_out[i] = X_train[y_train == (i + 1)][0]
X_out = TimeSeriesScalerMinMax().fit_transform(X_out)

for i, pos in enumerate([1, 5, 21, 25]):
    plt.subplot(5, 5, pos)
    w = [0.] * 4
    w[i] = 1.
    plt.plot(X_out[i].ravel(),
             color=matplotlib.colors.rgb2hex(get_color(w)),
             linewidth=2)
    plt.text(X_out[i].shape[0], 0., "$X_%d$" % i,
             horizontalalignment="right",
             verticalalignment="baseline",
             fontsize=24)
    plt.xticks([])
    plt.yticks([])
"""
from __future__ import print_function

# Author: Romain Tavenard
# License: BSD 3 clause

import numpy
import matplotlib.pyplot as plt

from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.svm import TimeSeriesSVC

numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = TimeSeriesScalerMinMax().fit_transform(X_train)
X_test = TimeSeriesScalerMinMax().fit_transform(X_test)

clf = TimeSeriesSVC(kernel="gak",
                    gamma=.1,
                    sz=X_train.shape[1],
                    d=X_train.shape[2])
clf.fit(X_train, y_train)
print(("Correct classification rate:", clf.score(X_test, y_test)))

n_classes = len(set(y_train))

plt.figure()
support_vectors = clf.support_vectors_time_series_(X_train)
for i, cl in enumerate(set(y_train)):
    plt.subplot(n_classes, 1, i + 1)
len(first_train)
from toolz.itertoolz import sliding_window, partition
#for every day of the train set store the flow observations 
days_first=list(partition(48,first_train))
days_first
len(days_first)
#from list to multidimensional array 
days_first=np.asarray(days_first)
days_first
from tslearn.utils import to_time_series, to_time_series_dataset
#create univariate series for normalized flow_observation 
first_time_series = to_time_series(days_first)
print(first_time_series.shape)
#normalize time series
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesScalerMinMax
first_time_series = TimeSeriesScalerMinMax(value_range=(0.0, 1.0)).fit_transform(first_time_series)
#first_time_series = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(first_time_series)
print(first_time_series.shape)


#treatment of the second variable 
second_train=df.loc[:,'Density']
second_train=np.array(second_train)
second_train= second_train.reshape((len(second_train), 1))
#from array to list 
second_train=second_train.tolist()
len(second_train)
# for every day of the train set store the density observations
days_second=list(partition(48,second_train))
days_second
len(days_second)
class KaninePrimitive(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params,
                                                     Hyperparams]):
    """
    Primitive that applies the k nearest neighbor classification algorithm to time series data.
    The tslearn KNeighborsTimeSeriesClassifier implementation is wrapped.
    """

    metadata = metadata_base.PrimitiveMetadata({
        "id":
        "2d6d3223-1b3c-49cc-9ddd-50f571818268",
        "version":
        __version__,
        "name":
        "kanine",
        "keywords": [
            "time series",
            "knn",
            "k nearest neighbor",
            "time series classification",
        ],
        "source": {
            "name": __author__,
            "contact": __contact__,
            "uris": [
                "https://github.com/kungfuai/d3m-primitives",
            ],
        },
        "installation": [
            {
                "type": "PIP",
                "package": "cython",
                "version": "0.29.16"
            },
            {
                "type":
                metadata_base.PrimitiveInstallationType.PIP,
                "package_uri":
                "git+https://github.com/kungfuai/d3m-primitives.git@{git_commit}#egg=kf-d3m-primitives"
                .format(git_commit=utils.current_git_commit(
                    os.path.dirname(__file__)), ),
            },
        ],
        "python_path":
        "d3m.primitives.time_series_classification.k_neighbors.Kanine",
        "algorithm_types": [
            metadata_base.PrimitiveAlgorithmType.K_NEAREST_NEIGHBORS,
        ],
        "primitive_family":
        metadata_base.PrimitiveFamily.TIME_SERIES_CLASSIFICATION,
    })

    def __init__(self,
                 *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed)

        self._knn = KNeighborsTimeSeriesClassifier(
            n_neighbors=self.hyperparams["n_neighbors"],
            metric=self.hyperparams["distance_metric"],
            weights=self.hyperparams["sample_weighting"],
        )
        self._scaler = TimeSeriesScalerMinMax()
        self._is_fit = False

    def get_params(self) -> Params:
        if not self._is_fit:
            return Params(scaler=None, classifier=None, output_columns=None)

        return Params(
            scaler=self._scaler,
            classifier=self._knn,
            output_columns=self._output_columns,
        )

    def set_params(self, *, params: Params) -> None:
        self._scaler = params["scaler"]
        self._knn = params["classifier"]
        self._output_columns = params["output_columns"]
        self._is_fit = all(param is not None for param in params.values())

    def _get_cols(self, input_metadata):
        """private util function that finds grouping column from input metadata

        Arguments:
            input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
            list[int] -- list of column indices annotated with GroupingKey metadata
        """

        # find column with ts value through metadata
        grouping_column = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/GroupingKey", ))
        return grouping_column

    def _get_value_col(self, input_metadata):
        """
        private util function that finds the value column from input metadata

        Arguments:
        input_metadata {D3M Metadata object} -- D3M Metadata object for input frame

        Returns:
        int -- index of column that contains time series value after Time Series Formatter primitive
        """

        # find attribute column but not file column
        attributes = input_metadata.list_columns_with_semantic_types(
            ("https://metadata.datadrivendiscovery.org/types/Attribute", ))
        # this assumes a lot, but time series formatters typically place the value column at the end
        attribute_col = attributes[-1]
        return attribute_col

    def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None:
        """Sets primitive's training data

        Arguments:
            inputs {Inputs} -- D3M dataframe containing attributes
            outputs {Outputs} -- D3M dataframe containing targets
        """

        # load and reshape training data
        self._output_columns = outputs.columns
        outputs = np.array(outputs)
        n_ts = outputs.shape[0]
        ts_sz = inputs.shape[0] // n_ts

        attribute_col = self._get_value_col(inputs.metadata)
        self._X_train = inputs.iloc[:,
                                    attribute_col].values.reshape(n_ts, ts_sz)
        self._y_train = np.array(outputs).reshape(-1, )

    def fit(self,
            *,
            timeout: float = None,
            iterations: int = None) -> CallResult[None]:
        """Fits KNN model using training data from set_training_data and hyperparameters

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Returns:
            CallResult[None]
        """

        scaled = self._scaler.fit_transform(self._X_train)
        self._knn.fit(scaled, self._y_train)
        self._is_fit = True
        return CallResult(None, has_finished=self._is_fit)

    def produce(self,
                *,
                inputs: Inputs,
                timeout: float = None,
                iterations: int = None) -> CallResult[Outputs]:
        """Produce primitive's classifications for new time series data

        Arguments:
            inputs {Inputs} -- full D3M dataframe, containing attributes, key, and target

        Keyword Arguments:
            timeout {float} -- timeout, not considered (default: {None})
            iterations {int} -- iterations, not considered (default: {None})

        Raises:
            PrimitiveNotFittedError: if primitive not fit

        Returns:
            CallResult[Outputs] -- dataframe with a column containing a predicted class
                for each input time series
        """

        if not self._is_fit:
            raise PrimitiveNotFittedError("Primitive not fitted.")

        # find column with ts value through metadata
        grouping_column = self._get_cols(inputs.metadata)

        n_ts = inputs.iloc[:, grouping_column[0]].nunique()
        ts_sz = inputs.shape[0] // n_ts
        attribute_col = self._get_value_col(inputs.metadata)
        x_vals = inputs.iloc[:, attribute_col].values.reshape(n_ts, ts_sz)

        # make predictions
        scaled = self._scaler.transform(x_vals)
        preds = self._knn.predict(scaled)

        # create output frame
        result_df = container.DataFrame({self._output_columns[0]: preds},
                                        generate_metadata=True)
        result_df.metadata = result_df.metadata.add_semantic_type(
            (metadata_base.ALL_ELEMENTS, 0),
            ("https://metadata.datadrivendiscovery.org/types/PredictedTarget"),
        )

        return CallResult(result_df, has_finished=True)
def subseqeuence_clustering(sequence, changepoints, y_label='y', norm=False):
    """
    Clusters subsequences of time series indicated by the changepoints variable.
    Uses silhouette score to determine the number of clusters
    :param y_label: Name of y-label in plot
    :param norm: normalise data using MinMaxScaler
    :param sequence: np array of the time series
    :param changepoints: detected changepoints on which subsequences are built
    :return:
    """
    from tslearn.clustering import TimeSeriesKMeans, silhouette_score
    from tslearn.utils import to_time_series_dataset
    from tslearn.preprocessing import TimeSeriesScalerMinMax

    sub_ids = []
    x_index = []
    X = []
    i = 0
    end_p = [len(sequence) - 1]
    for cp in changepoints + end_p:
        X.append(sequence[i:cp])
        index = 'sub_' + str(i) + '_' + str(cp)
        sub_ids.append(index)
        x_index.append([x_id for x_id in range(i, cp + 1)])
        i = cp

    # Normalize the data (y = (x - min) / (max - min))
    if norm:
        X = TimeSeriesScalerMinMax().fit_transform(X)
    X = to_time_series_dataset(X)
    #  Find optimal # clusters by
    #  looping through different configurations for # of clusters and store the respective values for silhouette:
    sil_scores = {}
    for n in range(2, len(changepoints)):
        model_tst = TimeSeriesKMeans(n_clusters=n, metric="dtw", n_init=10)
        model_tst.fit(X)
        sil_scores[n] = (silhouette_score(X,
                                          model_tst.predict(X),
                                          metric="dtw"))

    opt_k = max(sil_scores, key=sil_scores.get)
    print('Number of Clusters in subsequence clustering: ' + str(opt_k))
    model = TimeSeriesKMeans(n_clusters=opt_k, metric="dtw", n_init=10)
    labels = model.fit_predict(X)
    print(labels)

    # build helper df to map metrics to their cluster labels
    df_cluster = pd.DataFrame(list(zip(sub_ids, x_index, model.labels_)),
                              columns=['metric', 'x_index', 'cluster'])
    cluster_metrics_dict = df_cluster.groupby(
        ['cluster'])['metric'].apply(lambda x: [x for x in x]).to_dict()

    print('Plotting Clusters')
    #  plot changepoints as vertical lines
    for cp in changepoints:
        plt.axvline(x=cp, ls=':', lw=2, c='0.65')
    #  preprocessing for plotting cluster based
    x_scat = []
    y_scat = []
    cluster = []
    for index, row in df_cluster.iterrows():
        x_seq = row['x_index']
        x_scat.extend(x_seq)
        y_seq = sequence[x_seq[0]:x_seq[-1] + 1]
        y_scat.extend(y_seq)
        label_seq = [row['cluster']]
        cluster.extend(label_seq * len(x_seq))
        # plt.scatter(x_seq, y_seq, label=label_seq)
    # plotting cluster based
    x_scat = np.array(x_scat)
    y_scat = np.array(y_scat)
    for c in np.unique(cluster):
        i = np.where(cluster == c)
        plt.scatter(x_scat[i], y_scat[i], label=c)
    plt.legend()
    plt.title('Subsequence k-means Clustering')
    plt.xlabel('Time index')
    plt.ylabel(y_label)
    plt.show()

    return cluster_metrics_dict
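
# Hedged usage sketch (synthetic sequence and changepoints, not from the original source):
#     seq = np.concatenate([np.random.normal(0, 1, 50), np.random.normal(5, 1, 50),
#                           np.random.normal(0, 1, 50), np.random.normal(5, 1, 50)])
#     clusters = subseqeuence_clustering(seq, changepoints=[50, 100, 150], y_label='value')
#     print(clusters)   # dict mapping cluster id -> list of subsequence ids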
def tsScale(ts):
    tsc=TimeSeriesScalerMinMax(value_range=(-1,1))
    scaled_ts=tsc.fit_transform(ts)
    return scaled_ts