def test_serialize_global_alignment_kernel_kmeans():
    """Serialization helpers must reject an unfitted KernelKMeans and
    round-trip a fitted one, keeping ``predict`` consistent."""
    n_series, series_len, n_dims = 15, 10, 3
    random_state = numpy.random.RandomState(0)
    dataset = random_state.randn(n_series, series_len, n_dims)

    estimator = KernelKMeans(n_clusters=3, verbose=False, max_iter=5)

    # serialization must fail while the model is still unfitted
    _check_not_fitted(estimator)

    estimator.fit(dataset)
    _check_params_predict(estimator, dataset, ['predict'])
def test_variable_length_clustering():
    # TODO: here we just check that they can accept variable-length TS, not
    # that they do clever things
    dataset = to_time_series_dataset(
        [[1, 2, 3, 4],
         [1, 2, 3],
         [2, 5, 6, 7, 8, 9],
         [3, 5, 6, 7, 8]]
    )
    rng = np.random.RandomState(0)

    # each estimator only has to fit the ragged dataset without raising
    KernelKMeans(n_clusters=2, random_state=rng).fit(dataset)
    for metric in ("dtw", "softdtw"):
        TimeSeriesKMeans(n_clusters=2, metric=metric, random_state=rng).fit(dataset)
def test_kernel_kmeans():
    """Behavioural edge cases of KernelKMeans fitting."""
    n_ts, length, dim = 15, 10, 3
    rng = np.random.RandomState(0)
    dataset = rng.randn(n_ts, length, dim)

    # labels_ produced by fit must match predict() on the same data
    model = KernelKMeans(n_clusters=3, verbose=False, max_iter=5,
                         random_state=rng).fit(dataset)
    np.testing.assert_allclose(model.labels_, model.predict(dataset))

    # asking for more clusters than samples: fit aborts, training data dropped
    model = KernelKMeans(n_clusters=101, verbose=False, max_iter=5,
                         random_state=rng).fit(dataset)
    assert model._X_fit is None

    # with a non-GAK kernel, sigma_gak_ must stay unset
    model = KernelKMeans(n_clusters=2, verbose=False, kernel="rbf",
                         kernel_params={
                             "gamma": 1.
                         },
                         max_iter=5, random_state=rng).fit(dataset)
    assert model.sigma_gak_ is None
def single_clustering(self, data_raw, data_new, centroid_num, model):
    """
    Run a single clustering pass over the prepared data.

    :param data_raw: original (unprocessed) series; grouped by cluster for the return value
    :param data_new: preprocessed series actually fed to the clustering model
    :param centroid_num: number of clusters to form
    :param model: one of 'K-Means', 'DTW', 'K-Shape', 'Kernel-KMeans'
    :return: (inertia, D, centers) where D maps cluster label -> list of raw series
    """
    seed = 0
    np.random.seed(seed)
    labels = []
    inertia = []
    centers = []
    if model == 'K-Means':
        kmeans = KMeans(n_clusters=centroid_num).fit(data_new)
        labels = kmeans.labels_
        centers = kmeans.cluster_centers_
        # sum of distances of each point to its cluster centroid; smaller is better
        inertia = kmeans.inertia_
    elif model == 'DTW':
        # NOTE(review): despite the 'DTW' name this uses the soft-DTW metric
        sdtw_km = TimeSeriesKMeans(n_clusters=centroid_num,
                                   metric='softdtw',
                                   max_iter=2,
                                   max_iter_barycenter=2,
                                   metric_params={
                                       "gamma": 1.0
                                   },
                                   random_state=0,
                                   verbose=True).fit(data_new)
        labels = sdtw_km.labels_
        centers = sdtw_km.cluster_centers_
        inertia = sdtw_km.inertia_
    elif model == "K-Shape":
        ks = KShape(n_clusters=centroid_num,
                    verbose=True,
                    random_state=seed).fit(data_new)
        labels = ks.labels_
        centers = ks.cluster_centers_
        inertia = ks.inertia_
    elif model == "Kernel-KMeans":
        # NOTE(review): dataset is truncated to 100 samples here — presumably
        # to bound GAK's quadratic kernel-matrix cost; confirm this is intended.
        # Also note `centers` stays [] on this branch (kernel methods have no
        # explicit centroids).
        data_new = data_new[:100]
        data_raw = data_raw[:100]
        kk = KernelKMeans(n_clusters=centroid_num,
                          kernel="gak",
                          kernel_params={
                              "sigma": "auto"
                          },
                          max_iter=2,
                          tol=1e-4,
                          verbose=True).fit(data_new)
        labels = kk.labels_
        inertia = kk.inertia_
    # group the raw series by their assigned cluster label
    # NOTE(review): an unrecognised `model` leaves labels == [] and the loop
    # below raises IndexError — confirm callers only pass the four names above
    D = {}
    for i in range(centroid_num):
        D[i] = []
    for i in range(len(data_raw)):
        D[labels[i]].append(data_raw[i])
    return inertia, D, centers
def cluster(num_domains, diff_kernel, min_seg_size, seg_numdomians_ratio,
            distance_matrix, min_domain_size, clustering_method,
            alpha_helices, max_alpha_helix_size_to_merge):
    """Cluster residues into domains and post-process the labelling.

    Parameters
    ----------
    num_domains : int
        Number of clusters (domains) to form.
    diff_kernel : array-like
        Precomputed affinity/kernel matrix fed to the clustering estimator.
    min_seg_size : int
        Minimum segment size; shorter segments are merged away.
    seg_numdomians_ratio :
        Threshold used by ``remove_redundant_segments``.
    distance_matrix : array-like
        Precomputed distance matrix, used for segment cleanup and silhouette.
    min_domain_size : int
        A domain smaller than this invalidates the clustering.
    clustering_method : str
        'spectral' or 'kernel-kmeans'.
    alpha_helices : iterable of (start, end)
        Inclusive residue ranges of alpha helices.
    max_alpha_helix_size_to_merge : int
        Helices up to this length get relabelled to their majority label.

    Returns
    -------
    (labels, labels, sil_score) on success, or the string 'error' when the
    clustering is degenerate or any step raises.
    """
    try:
        if clustering_method == 'spectral':
            clustering = SpectralClustering(n_clusters=num_domains,
                                            assign_labels="kmeans",
                                            random_state=0,
                                            affinity='precomputed',
                                            n_init=100).fit(diff_kernel)
        elif clustering_method == 'kernel-kmeans':
            clustering = KernelKMeans(n_clusters=num_domains,
                                      random_state=0,
                                      n_init=100,
                                      kernel='precomputed').fit(diff_kernel)
        labels = clustering.labels_.copy()
        # short alpha helices should not straddle domains: relabel each one
        # to the majority label found inside it
        for alpha_helix in alpha_helices:
            if alpha_helix[1] - alpha_helix[0] + 1 <= max_alpha_helix_size_to_merge:
                alpha_helix_labels = labels[alpha_helix[0]:alpha_helix[1] + 1]
                counter = collections.Counter(alpha_helix_labels)
                if len(counter) > 1:
                    most_common = counter.most_common(1)[0][0]
                    labels[alpha_helix[0]:alpha_helix[1] + 1] = \
                        [most_common] * (alpha_helix[1] - alpha_helix[0] + 1)
        remove_short_segments(labels, min_seg_size, distance_matrix)
        remove_redundant_segments(labels, num_domains, seg_numdomians_ratio,
                                  distance_matrix)
        # cleanup may have collapsed clusters below the requested count
        if len(set(labels)) < num_domains:
            return 'error'
        sil_score = silhouette_score(distance_matrix, labels=labels,
                                     metric="precomputed")
        # every domain must reach the minimum size
        for label in set(labels):
            if np.count_nonzero(labels == label) < min_domain_size:
                return 'error'
        return labels, labels, sil_score
    except Exception:
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception, same 'error' contract
        return 'error'
def _fit(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
    """Fit time series clusterer to training data.

    Parameters
    ----------
    X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
        (n_instances, n_dimensions, series_length))
        Training time series instances to cluster.
    y: ignored, exists for API consistency reasons.

    Returns
    -------
    self:
        Fitted estimator.
    """
    from tslearn.clustering import KernelKMeans as TsLearnKernelKMeans

    # tslearn expects an int verbosity level, not a bool
    verbose = 0
    if self.verbose is True:
        verbose = 1

    # lazily build the wrapped tslearn estimator on first fit
    if self._tslearn_kernel_k_means is None:
        self._tslearn_kernel_k_means = TsLearnKernelKMeans(
            n_clusters=self.n_clusters,
            kernel=self.kernel,
            max_iter=self.max_iter,
            tol=self.tol,
            n_init=self.n_init,
            kernel_params=self.kernel_params,
            n_jobs=self.n_jobs,
            verbose=verbose,
            random_state=self.random_state,
        )

    self._tslearn_kernel_k_means.fit(X)
    self.labels_ = self._tslearn_kernel_k_means.labels_
    self.inertia_ = self._tslearn_kernel_k_means.inertia_
    self.n_iter_ = self._tslearn_kernel_k_means.n_iter_
    # BUG FIX: the docstring promises the fitted estimator, but the
    # original body fell through and returned None
    return self
def main(argv):
    """Command-line entry point: load a time-series dataset, optionally
    normalise/standardise it, run the selected clustering algorithms
    (Euclidean/DTW/soft-DTW k-means, k-Shape, GAK kernel k-means), render a
    combined figure, and save the figure plus per-method label CSVs to
    ``./out/<timestamp>/``.

    Parameters
    ----------
    argv : list of str
        Command-line arguments (``sys.argv[1:]``); see ``help_message`` below.
    """
    # define global timer to obtain global execution time
    start_global = timer()

    # define globals variables
    global euclidean_clustered_data, \
        dtw_clustered_data, \
        soft_dtw_clustered_data, \
        k_shape_clustered_data, \
        gak_clustered_data

    # ------------------------------------------------------------------
    # Input arguments parsing
    # ------------------------------------------------------------------
    # define help message
    help_message = \
        'clustering.py -h \n\n' \
        'usage: clustering.py [-c <number_clusters>] [-i <input_file>] [-ansEDSKG] \n' \
        'by default: processing input data (without any sampling)' \
        '(euclidean, dtw, soft-dtw and GAK k-means, k-shape)\n' \
        'options list: \n' \
        ' -c / --clusters <number_clusters> # set number of clusters (default 3) \n\n' \
        ' -i / --ifile <input_file> # set input filename \n' \
        ' -n / --normalise # normalise input data \n' \
        ' -s / --standardise # standardise input data \n\n' \
        ' -a / --all # perform all 5 implemented methods of clustering: \n' \
        ' euclidean, dtw, soft-dtw, gak k-means and k-shape\n' \
        ' -E / --euclidean # perform euclidean k-means clustering \n' \
        ' -D / --dtw # perform dtw k-means clustering \n' \
        ' -S / --soft-dtw # perform soft-dtw k-means clustering \n' \
        ' -K / --k-shape # perform k-shape clustering \n' \
        ' -G / --gak # perform GAK k-means clustering \n'

    # Create new object to save arguments
    i_args = Arguments()

    # number of rows in plot to create correct number of subplots
    # default = 3 (raw data plus distribution histograms)
    n_rows_plot = 3

    # define validation rules for arguments
    try:
        opts, args = getopt.getopt(
            argv,
            "hc:i:nsaEDSKG",
            [
                "help", "clusters=", "ifile=", "normalise", "standardise",
                "all", "euclidean", "dtw", "soft-dtw", "k-shape", "gak"
            ]
        )
    except getopt.GetoptError:
        print(help_message)
        sys.exit(2)

    # parse arguments
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print(help_message)
            sys.exit()
        elif opt in ("-c", "--clusters"):
            # BUG FIX: getopt yields option values as strings; the cluster
            # count is later used with range(), add_gridspec() and
            # numpy.random.choice(), all of which require an int
            i_args.number_clusters = int(arg)
        elif opt in ("-i", "--ifile"):
            i_args.input_file = arg
        elif opt in ("-n", "--normalise"):
            i_args.normalise_data = True
        elif opt in ("-s", "--standardise"):
            i_args.standardise_data = True
        elif opt in ("-E", "--euclidean"):
            n_rows_plot += 1
            i_args.euclidean_clustering = True
        elif opt in ("-D", "--dtw"):
            n_rows_plot += 1
            i_args.dtw_clustering = True
        elif opt in ("-S", "--soft-dtw"):
            n_rows_plot += 1
            i_args.soft_dtw_clustering = True
        elif opt in ("-K", "--k-shape"):
            n_rows_plot += 1
            i_args.k_shape_clustering = True
        elif opt in ("-G", "--gak"):
            n_rows_plot += 1
            i_args.gak_clustering = True
        elif opt in ("-a", "--all"):
            n_rows_plot = 8
            i_args.euclidean_clustering = True
            i_args.dtw_clustering = True
            i_args.soft_dtw_clustering = True
            i_args.k_shape_clustering = True
            i_args.gak_clustering = True

    # normalise maximum number of subplots levels
    n_rows_plot = 8 if n_rows_plot > 8 else n_rows_plot

    # ------------------------------------------------------------------
    # Raw data processing stage
    # ------------------------------------------------------------------
    # set style to matplotlib plot
    mpl.style.use('seaborn')

    # set seed value and seed the generator
    seed = 0
    numpy.random.seed(seed)

    # import data and print first 5 rows
    raw_data = import_data()
    print(raw_data.head())

    # convert raw data to the format which can be used by tslearn
    # (3-d dimensional array)
    # BUILT functionality: adjust all time series to one size
    # (NaN values are appended to the shorter ones)
    formatted_data = to_time_series_dataset(raw_data)

    # print shape of new array
    print(formatted_data.shape)

    # obtain number of measuring
    n_measuring = formatted_data.shape[1]

    # define figure, grid_spec to create layout of the plot
    fig = plt.figure(constrained_layout=True)
    grid_spec = fig.add_gridspec(
        n_rows_plot,
        i_args.number_clusters
    )

    # set A4 size to figure
    fig.set_size_inches(8.5, 11.75)

    # setup count of layers of subplots
    count_layer = 3

    # setup first subplot and draw raw time series
    f_ax_raw_data = fig.add_subplot(grid_spec[:2, :])
    for xx in formatted_data:
        f_ax_raw_data.plot(xx.ravel(), alpha=.2)
    formatted_data_min = formatted_data.min()
    formatted_data_max = formatted_data.max()

    # draw title for chart with min and max values
    f_ax_raw_data.set_title('Raw Data (min = %.2f, max = %.2f)'
                            % (formatted_data_min, formatted_data_max))

    # obtain and print executing time of data processing stage to console,
    timer_tick = get_time_tick(start_global)
    plt.ion()
    plt.show()
    print("Raw data processing time: %s" % timer_tick)

    # ------------------------------------------------------------------
    # Data preprocessing stage
    # ------------------------------------------------------------------
    start = timer()

    # Convert NaNs to value predicted by interpolation
    # linearly interpolate for NaN/NaNs
    n_nan_changes = 0
    for ind in range(formatted_data.shape[0]):
        mask = numpy.isnan(formatted_data[ind])
        n_nan_changes += mask.sum()
        formatted_data[ind][mask] = numpy.interp(
            numpy.flatnonzero(mask),
            numpy.flatnonzero(~mask),
            formatted_data[ind][~mask]
        )
    print("%d NaN values was/were interpolated" % n_nan_changes)

    # Scaling
    # to know should we use normalization or standardization, we need to see
    # the distribution of values.
    # take random 3 measuring for each case to draw histograms
    random_indexes = numpy.random.choice(n_measuring,
                                         i_args.number_clusters,
                                         replace=False)

    # create new arrays with values of randomly chosen measurements
    histogram_data = formatted_data[:, random_indexes]

    # draw histograms
    for i_histogram in range(i_args.number_clusters):
        f_ax_histogram = fig.add_subplot(grid_spec[2, i_histogram])
        f_ax_histogram.hist(
            histogram_data[:, i_histogram], bins=25, density=True
        )
        f_ax_histogram.text(0.55, 0.98,
                            'Measurement #%d' % random_indexes[i_histogram],
                            transform=plt.gca().transAxes,
                            color="navy"
                            )
        # annotate the middle histogram with the chosen preprocessing mode
        if i_histogram == 1:
            preprocessing = ''
            if i_args.normalise_data:
                preprocessing += "normalised"
                if i_args.standardise_data:
                    preprocessing += " and standardised"
            elif i_args.standardise_data:
                preprocessing += "standardised"
            preprocessing = '' if preprocessing == '' \
                else "(data will be %s)" % preprocessing
            f_ax_histogram.set_title(
                "Distributions histograms %s" % preprocessing,
                color='navy', y=1, pad=14
            )

    # if no processing data option chosen continue with raw data
    processed_data = formatted_data

    # since for this concrete challenge data the distributions are more/less
    # Gaussian/Normal we can use standardization
    # normalize data: Min-Max scaling ranging between 0 and 1
    if i_args.normalise_data:
        processed_data = TimeSeriesScalerMinMax().fit_transform(processed_data)
        print("Data was normalised")

    # standardize data: scaling technique where the values are centered around
    # the mean with a unit standard deviation
    if i_args.standardise_data:
        processed_data = TimeSeriesScalerMeanVariance().fit_transform(processed_data)
        print("Data was standardised")

    # obtain max value of data (to be used in visualization subplots)
    max_data = processed_data.max() * 1.2
    min_data = processed_data.min() * 1.2

    timer_tick = get_time_tick(start)
    print("#############################################################################################")
    print("Data processing stage elapsed time: %s" % timer_tick)

    # ------------------------------------------------------------------
    # Implementing Euclidean k-means clustering algorithm
    # ------------------------------------------------------------------
    if i_args.euclidean_clustering:
        start = timer()
        print("Euclidean k-means")

        # define parameters of the model of the algorithm
        k_means_euclidean = TimeSeriesKMeans(
            n_clusters=i_args.number_clusters,
            verbose=True,
            random_state=seed,
            n_jobs=4
        )

        # calculate cluster's label array
        euclidean_clustered_data = k_means_euclidean.fit_predict(processed_data)

        # draw subplots with attributed clusters of time series as well as
        # cluster centers' lines
        for i_cluster in range(i_args.number_clusters):
            f_ax_euclidean = create_figure_axes(fig, grid_spec, count_layer,
                                                i_cluster, n_measuring,
                                                min_data, max_data,
                                                processed_data,
                                                euclidean_clustered_data,
                                                'tab:blue')
            f_ax_euclidean.plot(
                k_means_euclidean.cluster_centers_[i_cluster].ravel(),
                "tab:green"
            )
            if i_cluster == 1:
                middle_axis = f_ax_euclidean

        # increment count of filled layer of subplots
        count_layer += 1

        # obtain processing time, print it to console and
        # add it to the title of the series of subplots
        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Euclidean $k$-means (%s)" % timer_tick,
            color='tab:green', y=1, pad=14
        )
        print("#############################################################################################")
        print("Euclidean k-means time processing: %s" % timer_tick)

    # ------------------------------------------------------------------
    # Implementing DTW k-means clustering algorithm
    # use dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    # ------------------------------------------------------------------
    if i_args.dtw_clustering:
        start = timer()
        print("DTW k-means")
        k_means_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                       n_init=3,
                                       metric="dtw",
                                       verbose=True,
                                       max_iter_barycenter=10,
                                       random_state=seed,
                                       n_jobs=6
                                       )
        dtw_clustered_data = k_means_DTW.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_dtw = create_figure_axes(fig, grid_spec, count_layer,
                                          i_cluster, n_measuring,
                                          min_data, max_data,
                                          processed_data,
                                          dtw_clustered_data,
                                          'tab:blue')
            f_ax_dtw.plot(
                k_means_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:red"
            )
            if i_cluster == 1:
                middle_axis = f_ax_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "DTW $k$-means (%s)" % timer_tick,
            color='tab:red', y=1, pad=14
        )
        print("#############################################################################################")
        print("DTW k-means time processing: %s" % timer_tick)

    # ------------------------------------------------------------------
    # Implementing soft DTW k-means clustering algorithm
    # use soft dtw (Dynamic Time Warping Distance) metric to calculate
    # distance between means
    # ------------------------------------------------------------------
    if i_args.soft_dtw_clustering:
        start = timer()
        print("Soft-DTW k-means")
        k_means_soft_DTW = TimeSeriesKMeans(n_clusters=i_args.number_clusters,
                                            metric="softdtw",
                                            metric_params={"gamma": .025},
                                            verbose=True,
                                            random_state=seed,
                                            n_jobs=6
                                            )
        soft_dtw_clustered_data = k_means_soft_DTW.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_soft_dtw = create_figure_axes(fig, grid_spec, count_layer,
                                               i_cluster, n_measuring,
                                               min_data, max_data,
                                               processed_data,
                                               soft_dtw_clustered_data,
                                               'tab:blue')
            f_ax_soft_dtw.plot(
                k_means_soft_DTW.cluster_centers_[i_cluster].ravel(),
                "tab:purple"
            )
            if i_cluster == 1:
                middle_axis = f_ax_soft_dtw

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Soft-DTW $k$-means (%s)" % timer_tick,
            color='tab:purple', y=1, pad=14
        )
        print("#############################################################################################")
        print("Soft-DTW k-means time processing: %s" % timer_tick)

    # ------------------------------------------------------------------
    # Implementing k-Shape clustering algorithm
    # ------------------------------------------------------------------
    if i_args.k_shape_clustering:
        start = timer()
        print("K-Shape")
        k_shape = KShape(n_clusters=i_args.number_clusters,
                         verbose=True,
                         random_state=seed
                         )
        k_shape_clustered_data = k_shape.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            # k-Shape centers can exceed the data range; widen axes as needed
            min_axe_value = min(min_data,
                                k_shape.cluster_centers_[i_cluster].ravel().min())
            max_axe_value = max(max_data,
                                k_shape.cluster_centers_[i_cluster].ravel().max())
            f_ax_k_shape = create_figure_axes(fig, grid_spec, count_layer,
                                              i_cluster, n_measuring,
                                              min_axe_value, max_axe_value,
                                              processed_data,
                                              k_shape_clustered_data,
                                              'tab:blue')
            f_ax_k_shape.plot(
                k_shape.cluster_centers_[i_cluster].ravel(),
                "tab:orange"
            )
            if i_cluster == 1:
                middle_axis = f_ax_k_shape

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "$K$-Shape (%s)" % timer_tick,
            color='tab:orange', y=1, pad=14
        )
        print("#############################################################################################")
        print("K-Shape time processing: %s" % timer_tick)

    # ------------------------------------------------------------------
    # Implementing Global Alignment kernel k-means clustering algorithm
    # since kernel is used, there is no centroid of the cluster
    # ------------------------------------------------------------------
    if i_args.gak_clustering:
        start = timer()
        print("GAK-k-means")
        gak_k_means = KernelKMeans(n_clusters=i_args.number_clusters,
                                   kernel="gak",
                                   kernel_params={"sigma": "auto"},
                                   n_init=10,
                                   verbose=True,
                                   random_state=seed,
                                   n_jobs=6
                                   )
        gak_clustered_data = gak_k_means.fit_predict(processed_data)

        for i_cluster in range(i_args.number_clusters):
            f_ax_gak_k_means = create_figure_axes(fig, grid_spec, count_layer,
                                                  i_cluster, n_measuring,
                                                  min_data, max_data,
                                                  processed_data,
                                                  gak_clustered_data,
                                                  'tab:blue')
            if i_cluster == 1:
                middle_axis = f_ax_gak_k_means

        # increment count of filled layer of subplots
        count_layer += 1

        timer_tick = get_time_tick(start)
        middle_axis.set_title(
            "Global Alignment kernel $k$-means (%s)" % timer_tick,
            color='tab:cyan', y=1, pad=14)
        print("#############################################################################################")
        print("GAK k-means time processing: %s" % timer_tick)

    # ------------------------------------------------------------------
    # Persist results
    # ------------------------------------------------------------------
    # return string with current datetime
    now = datetime.now().strftime("%d-%m-%Y_%H-%M-%S")

    # define the name of the directory to be created
    path = "./out/%s" % now
    print("#############################################################################################")
    try:
        os.mkdir(path)
    except OSError:
        print("Creation of the directory %s failed" % path)
    else:
        print("Successfully created the directory %s " % path)

    try:
        # save figure as pdf to out folder
        fig.savefig("./out/%s/visual_result.pdf" % now)

        # save clustering results
        if i_args.euclidean_clustering:
            numpy.savetxt(
                "./out/%s/euclidean_clustering_result.csv" % now,
                euclidean_clustered_data,
                delimiter=","
            )
        if i_args.dtw_clustering:
            numpy.savetxt(
                "./out/%s/dtw_clustering_result.csv" % now,
                dtw_clustered_data,
                delimiter=","
            )
        if i_args.soft_dtw_clustering:
            numpy.savetxt(
                "./out/%s/soft_dtw_clustering_result.csv" % now,
                soft_dtw_clustered_data,
                delimiter=","
            )
        if i_args.k_shape_clustering:
            numpy.savetxt(
                "./out/%s/k_shape_clustering_result.csv" % now,
                k_shape_clustered_data,
                delimiter=","
            )
        if i_args.gak_clustering:
            numpy.savetxt(
                "./out/%s/gak_clustering_result.csv" % now,
                gak_clustered_data,
                delimiter=","
            )
    except RuntimeError:
        print("Saving results failed")
    else:
        print("Successfully saved results in the path %s " % path)

    # obtain and print global executing time
    timer_tick = get_time_tick(start_global)
    print("#############################################################################################")
    print("All algorithms elapsed time: % s" % timer_tick)

    # render and show plot
    # plt.show()
    plt.draw()
    plt.pause(0.001)
    input("Press [enter] to finish.")
    print("#############################################################################################")
# Load the mapping of known/true cluster ids and collect the event data for
# one cluster column, then run GAK kernel k-means on the assembled dataset.
true_clusters_known = pd.read_pickle('data/known_true_clusters_ids.pkl')
all_clusters_data = []
cl = 'Tree'

# (exploratory code, kept for reference: finds the shortest event length)
# min_size = 1280
# for
# for ev in true_clusters_known[cl].dropna():
#     e = Event(ev, 0, -1, 'resampled').data.shape[0]
#     if min_size > e:
#         min_size = e
#         print(e)

# build one multivariate series per event, dropping the time column
for ev in true_clusters_known[cl].dropna():
    e = Event(ev, 0, -1, 'resampled')
    selected_data = e.res().loc[:, e.data.columns != 'Time (s)']
    all_clusters_data.append(selected_data)
#%%
seed = 0
# pad ragged series into one 3-d array; padding NaNs are zero-filled
# NOTE(review): zero-filling changes the padded tails' values — confirm this
# is preferable to tslearn's native variable-length support here
formatted_dataset = to_time_series_dataset(all_clusters_data)
formatted_dataset[np.isnan(formatted_dataset)] = 0
#%%
# standardise each series, then cluster with GAK kernel k-means
X_train = TimeSeriesScalerMeanVariance().fit_transform(formatted_dataset)
gak_km = KernelKMeans(n_clusters=3,
                      kernel="gak",
                      kernel_params={"sigma": "auto"},
                      n_init=20,
                      verbose=True,
                      random_state=seed)
y_pred = gak_km.fit_predict(X_train)
#%%
class TimeSeriesKernelKMeans(BaseClusterer):
    """Kernel k-means clusterer wrapping tslearn's implementation.

    Parameters
    ----------
    n_clusters: int, defaults = 8
        The number of clusters to form as well as the number of
        centroids to generate.
    kernel : string, or callable (default: "gak")
        The kernel should either be "gak", in which case the Global Alignment
        Kernel from [2]_ is used or a value that is accepted as a metric
        by `scikit-learn's pairwise_kernels
        <https://scikit-learn.org/stable/modules/generated/\
        sklearn.metrics.pairwise.pairwise_kernels.html>`_
    n_init: int, defaults = 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of n_init
        consecutive runs in terms of inertia.
    kernel_params : dict or None (default: None)
        Kernel parameters to be passed to the kernel function.
        None means no kernel parameter is set.
        For Global Alignment Kernel, the only parameter of interest is
        `sigma`. If set to 'auto', it is computed based on a sampling of the
        training set (cf :ref:`tslearn.metrics.sigma_gak
        <fun-tslearn.metrics.sigma_gak>`). If no specific value is set for
        `sigma`, its defaults to 1.
    max_iter: int, defaults = 300
        Maximum number of iterations of the k-means algorithm for a single
        run.
    tol: float, defaults = 1e-4
        Relative tolerance with regards to Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence.
    verbose: bool, defaults = False
        Verbosity mode.
    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel for GAK cross-similarity matrix
        computations.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See scikit-learns'
        `Glossary <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
        for more details.
    random_state: int or np.random.RandomState instance or None, defaults = None
        Determines random number generation for centroid initialization.

    Attributes
    ----------
    labels_: np.ndarray (1d array of shape (n_instance,))
        Labels that is the index each time series belongs to.
    inertia_: float
        Sum of squared distances of samples to their closest cluster center,
        weighted by the sample weights if provided.
    n_iter_: int
        Number of iterations run.
    """

    _tags = {
        "capability:multivariate": True,
    }

    def __init__(
        self,
        n_clusters: int = 8,
        kernel: str = "gak",
        n_init: int = 10,
        max_iter: int = 300,
        tol: float = 1e-4,
        kernel_params: Union[dict, None] = None,
        verbose: bool = False,
        n_jobs: Union[int, None] = None,
        random_state: Union[int, RandomState] = None,
    ):
        # fail fast if the wrapped library is missing
        _check_soft_dependencies("tslearn", severity="error", object=self)

        self.kernel = kernel
        self.n_init = n_init
        self.max_iter = max_iter
        self.tol = tol
        self.kernel_params = kernel_params
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.random_state = random_state

        # fitted-state attributes, populated by _fit
        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0

        # lazily-created tslearn estimator (see _fit)
        self._tslearn_kernel_k_means = None

        super(TimeSeriesKernelKMeans, self).__init__(n_clusters=n_clusters)

    def _fit(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Fit time series clusterer to training data.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        self:
            Fitted estimator.
        """
        from tslearn.clustering import KernelKMeans as TsLearnKernelKMeans

        # tslearn expects an int verbosity level, not a bool
        verbose = 0
        if self.verbose is True:
            verbose = 1

        if self._tslearn_kernel_k_means is None:
            self._tslearn_kernel_k_means = TsLearnKernelKMeans(
                n_clusters=self.n_clusters,
                kernel=self.kernel,
                max_iter=self.max_iter,
                tol=self.tol,
                n_init=self.n_init,
                kernel_params=self.kernel_params,
                n_jobs=self.n_jobs,
                verbose=verbose,
                random_state=self.random_state,
            )

        self._tslearn_kernel_k_means.fit(X)
        self.labels_ = self._tslearn_kernel_k_means.labels_
        self.inertia_ = self._tslearn_kernel_k_means.inertia_
        self.n_iter_ = self._tslearn_kernel_k_means.n_iter_
        # BUG FIX: docstring promises the fitted estimator, but the original
        # body fell through and returned None
        return self

    def _predict(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Time series instances to predict their cluster indexes.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        np.ndarray (1d array of shape (n_instances,))
            Index of the cluster each time series in X belongs to.
        """
        return self._tslearn_kernel_k_means.predict(X)

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If
            no special parameters are defined for a value, will return
            `"default"` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test
            instance, i.e., `MyClass(**params)` or `MyClass(**params[i])`
            creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in
            `params`
        """
        params = {
            "n_clusters": 2,
            "kernel": "gak",
            "n_init": 1,
            "max_iter": 1,
            "tol": 1e-4,
            "kernel_params": None,
            "verbose": False,
            "n_jobs": 1,
            "random_state": 1,
        }
        return params

    def _score(self, X, y=None):
        # inertia is non-positive by convention; score is its magnitude
        return np.abs(self.inertia_)