Example #1
    def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
        """Finds the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_ts, sz, d)
            The query time series.
            If not provided, neighbors of each indexed point are returned.
            In this case, the query point is not considered its own neighbor.
        n_neighbors : int
            Number of neighbors to get (default is the value passed to the
            constructor).
        return_distance : boolean, optional (default: True)
            If False, distances will not be returned.

        Returns
        -------
        dist : array
            Array representing the distances to points, only present if
            return_distance=True.
        ind : array
            Indices of the nearest points in the population matrix.
        """
        if self.metric in TSLEARN_VALID_METRICS:
            self._ts_metric = self.metric
            self.metric = "precomputed"

            metric_params = self._get_metric_params()
            check_is_fitted(self, '_ts_fit')
            X = check_array(X, allow_nd=True, force_all_finite=False)
            X = check_dims(X,
                           X_fit_dims=self._ts_fit.shape,
                           extend=True,
                           check_n_features_only=True)
            if self._ts_metric == "dtw":
                X_ = cdist_dtw(X,
                               self._ts_fit,
                               n_jobs=self.n_jobs,
                               verbose=self.verbose,
                               **metric_params)
            elif self._ts_metric == "softdtw":
                X_ = cdist_soft_dtw(X, self._ts_fit, **metric_params)
            else:
                raise ValueError("Invalid metric recorded: %s" %
                                 self._ts_metric)
            pred = KNeighborsTimeSeriesMixin.kneighbors(
                self,
                X=X_,
                n_neighbors=n_neighbors,
                return_distance=return_distance)
            self.metric = self._ts_metric
            return pred
        else:
            check_is_fitted(self, '_X_fit')
            if X is None:
                X_ = None
            else:
                X = check_array(X, allow_nd=True)
                X = to_time_series_dataset(X)
                X_ = to_sklearn_dataset(X)
                X_ = check_dims(X_, X_fit_dims=self._X_fit.shape, extend=False)
            return KNeighborsTimeSeriesMixin.kneighbors(
                self,
                X=X_,
                n_neighbors=n_neighbors,
                return_distance=return_distance)
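
A minimal usage sketch for the method above, assuming it is exposed through tslearn's KNeighborsTimeSeries estimator (as in the tslearn codebase this snippet comes from):

import numpy
from tslearn.generators import random_walks
from tslearn.neighbors import KNeighborsTimeSeries

numpy.random.seed(0)
X_train = random_walks(n_ts=20, sz=16, d=1)
X_query = random_walks(n_ts=3, sz=16, d=1)

# Index the training series, then query the 2 nearest neighbors under DTW.
knn = KNeighborsTimeSeries(n_neighbors=2, metric="dtw").fit(X_train)
dist, ind = knn.kneighbors(X_query, return_distance=True)
print(dist.shape, ind.shape)  # (3, 2) (3, 2)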
Example #2
def silhouette_score(X,
                     labels,
                     metric=None,
                     sample_size=None,
                     metric_params=None,
                     random_state=None,
                     **kwds):
    """Compute the mean Silhouette Coefficient of all samples (cf.  [1]_ and  [2]_).

    Read more in the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient>`_.

    Parameters
    ----------
    X : array [n_ts, n_ts] if metric == "precomputed", or, \
             [n_ts, sz, d] otherwise
        Array of pairwise distances between time series, or a time series dataset.
    labels : array, shape = [n_ts]
         Predicted labels for each time series.
    metric : string, or callable
        The metric to use when calculating distance between time series.
        Should be one of {'dtw', 'softdtw', 'euclidean'} or a callable distance
        function.
        If X is the distance array itself, use ``metric="precomputed"``.
    sample_size : int or None
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.
    metric_params : dict or None
        Parameter values for the chosen metric. The value associated with the
        `"gamma_sdtw"` key corresponds to the gamma parameter in Soft-DTW.
    random_state : int, RandomState instance or None, optional (default=None)
        The generator used to randomly select a subset of samples.  If int,
        random_state is the seed used by the random number generator; If
        RandomState instance, random_state is the random number generator; If
        None, the random number generator is the RandomState instance used by
        `np.random`. Used when ``sample_size is not None``.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
           <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> import numpy
    >>> from tslearn.generators import random_walks
    >>> from tslearn.metrics import cdist_dtw
    >>> X = random_walks(n_ts=50, sz=32, d=1)
    >>> labels = numpy.random.randint(2, size=50)
    >>> s_sc = silhouette_score(X, labels, metric="dtw")
    >>> s_sc2 = silhouette_score(X, labels, metric="euclidean")
    >>> s_sc3 = silhouette_score(X, labels, metric="softdtw")
    >>> s_sc3b = silhouette_score(X, labels, metric="softdtw",
    ...                           metric_params={"gamma_sdtw": 2.})
    >>> s_sc4 = silhouette_score(cdist_dtw(X), labels, metric="precomputed")
    """
    sklearn_metric = None
    if metric_params is None:
        metric_params = {}
    if metric == "precomputed":
        sklearn_X = X
    elif metric == "dtw":
        sklearn_X = cdist_dtw(X)
    elif metric == "softdtw":
        gamma = metric_params.get("gamma_sdtw", None)
        if gamma is not None:
            sklearn_X = cdist_soft_dtw(X, gamma=gamma)
        else:
            sklearn_X = cdist_soft_dtw(X)
    elif metric == "euclidean":
        X_ = to_time_series_dataset(X)
        X_ = X_.reshape((X_.shape[0], -1))
        sklearn_X = cdist(X_, X_, metric="euclidean")
    else:
        X_ = to_time_series_dataset(X)
        n, sz, d = X_.shape
        sklearn_X = X_.reshape((n, -1))
        if metric is None:
            metric = dtw

        def sklearn_metric(x, y):
            return metric(to_time_series(x.reshape((sz, d)),
                                         remove_nans=True),
                          to_time_series(y.reshape((sz, d)),
                                         remove_nans=True))
    return sklearn_silhouette_score(
        X=sklearn_X,
        labels=labels,
        metric="precomputed" if sklearn_metric is None else sklearn_metric,
        sample_size=sample_size,
        random_state=random_state,
        **kwds)
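
The docstring examples cover the string metrics; the fallback branch above also accepts a callable. A small sketch of that path, using tslearn's dtw function as the callable (imports here are assumptions, matching those the function itself relies on):

import numpy
from tslearn.generators import random_walks
from tslearn.metrics import dtw

numpy.random.seed(0)
X = random_walks(n_ts=10, sz=16, d=1)
labels = numpy.random.randint(2, size=10)

# The callable receives flattened series; the wrapper above reshapes them
# back to (sz, d) before computing the distance.
s = silhouette_score(X, labels, metric=dtw)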
Example #3
def dynamic_time_warp(data):
    # Pairwise DTW distances between the columns of `data`: transpose()
    # turns each column (one series) into a row, as cdist_dtw expects.
    from tslearn.metrics import cdist_dtw
    print('|--- Construct adj_mx by DTW')
    dtw_mx_dist = cdist_dtw(data.transpose())
    return dtw_mx_dist
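
A sketch of calling this helper on a (timesteps, sensors) array, where each column is one series (shapes here are illustrative):

import numpy

# 100 timesteps for 5 sensors; columns become rows after transpose(),
# so cdist_dtw compares the 5 sensor series pairwise.
data = numpy.random.randn(100, 5)
adj_mx = dynamic_time_warp(data)
print(adj_mx.shape)  # (5, 5)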
Example #4
def metric_fun(x, y):
    # Helper defined inside an estimator method: `self.n_jobs`,
    # `self.verbose` and `metric_params` come from the enclosing scope.
    return cdist_dtw(x, y, n_jobs=self.n_jobs,
                     verbose=self.verbose, **metric_params)
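
A self-contained sketch of the same idea without the enclosing estimator, binding the parallelization settings explicitly (the factory name is illustrative):

from functools import partial
from tslearn.metrics import cdist_dtw

def make_metric_fun(n_jobs=None, verbose=0, **metric_params):
    # Returns a two-argument DTW cross-distance function with the
    # parallelization settings baked in.
    return partial(cdist_dtw, n_jobs=n_jobs, verbose=verbose,
                   **metric_params)

metric_fun = make_metric_fun(n_jobs=-1)
# dists = metric_fun(X_query, X_fit)  # shape (len(X_query), len(X_fit))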
Example #5
def silhouette_score(X, labels, metric=None, sample_size=None,
                     metric_params=None, n_jobs=None, verbose=0,
                     random_state=None, **kwds):
    """Compute the mean Silhouette Coefficient of all samples (cf.  [1]_ and
    [2]_).

    Read more in the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/clustering.html\
    #silhouette-coefficient>`_.

    Parameters
    ----------
    X : array [n_ts, n_ts] if metric == "precomputed", or, \
             [n_ts, sz, d] otherwise
        Array of pairwise distances between time series, or a time series
        dataset.
    labels : array, shape = [n_ts]
         Predicted labels for each time series.
    metric : string, callable or None (default: None)
        The metric to use when calculating distance between time series.
        Should be one of {'dtw', 'softdtw', 'euclidean'} or a callable distance
        function or None.
        If 'softdtw' is passed, a normalized version of Soft-DTW is used that
        is defined as `sdtw_(x,y) := sdtw(x,y) - 1/2(sdtw(x,x)+sdtw(y,y))`.
        If X is the distance array itself, use ``metric="precomputed"``.
        If None, dtw is used.
    sample_size : int or None (default: None)
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.
    metric_params : dict or None (default: None)
        Parameter values for the chosen metric.
        For metrics that accept parallelization of the cross-distance matrix
        computations, `n_jobs` key passed in `metric_params` is overridden by
        the `n_jobs` argument.
        Value associated to the `"gamma_sdtw"` key corresponds to the gamma
        parameter in Soft-DTW.

        .. deprecated:: 0.2
            `"gamma_sdtw"` as a key for `metric_params` is deprecated in
            version 0.2 and will be removed in 0.4.

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel for cross-distance matrix
        computations.
        Ignored if the cross-distance matrix cannot be computed using
        parallelization.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See scikit-learn's
        `Glossary <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
        for more details.

    verbose : int (default: 0)
        If nonzero, information about the inertia is printed while learning
        the model, and joblib progress messages are printed.

    random_state : int, RandomState instance or None, optional (default: None)
        The generator used to randomly select a subset of samples.  If int,
        random_state is the seed used by the random number generator; If
        RandomState instance, random_state is the random number generator; If
        None, the random number generator is the RandomState instance used by
        `np.random`. Used when ``sample_size is not None``.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function,
        just as for the `metric_params` parameter.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
           <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> import numpy
    >>> from tslearn.generators import random_walks
    >>> from tslearn.metrics import cdist_dtw
    >>> numpy.random.seed(0)
    >>> X = random_walks(n_ts=20, sz=16, d=1)
    >>> labels = numpy.random.randint(2, size=20)
    >>> silhouette_score(X, labels, metric="dtw")  # doctest: +ELLIPSIS
    0.13383800...
    >>> silhouette_score(X, labels, metric="euclidean")  # doctest: +ELLIPSIS
    0.09126917...
    >>> silhouette_score(X, labels, metric="softdtw")  # doctest: +ELLIPSIS
    0.17953934...
    >>> silhouette_score(X, labels, metric="softdtw",
    ...                  metric_params={"gamma": 2.}) \
    # doctest: +ELLIPSIS
    0.17591060...
    >>> silhouette_score(cdist_dtw(X), labels,
    ...                  metric="precomputed")  # doctest: +ELLIPSIS
    0.13383800...
    """
    sklearn_metric = None
    if metric_params is None:
        metric_params_ = {}
    else:
        metric_params_ = metric_params.copy()
    if "gamma_sdtw" in metric_params_.keys():
        warnings.warn(
            "'gamma_sdtw' is deprecated in version 0.2 and will be "
            "removed in 0.4. Use `gamma` instead of `gamma_sdtw` as a "
            "`metric_params` key to remove this warning.",
            DeprecationWarning, stacklevel=2)
        metric_params_["gamma"] = metric_params_["gamma_sdtw"]
        del metric_params_["gamma_sdtw"]
    for k in kwds.keys():
        metric_params_[k] = kwds[k]
    if "n_jobs" in metric_params_.keys():
        del metric_params_["n_jobs"]
    if metric == "precomputed":
        sklearn_X = X
    elif metric == "dtw" or metric is None:
        sklearn_X = cdist_dtw(X, n_jobs=n_jobs, verbose=verbose,
                              **metric_params_)
    elif metric == "softdtw":
        sklearn_X = cdist_soft_dtw_normalized(X, **metric_params_)
    elif metric == "euclidean":
        X_ = to_time_series_dataset(X)
        X_ = X_.reshape((X_.shape[0], -1))
        sklearn_X = cdist(X_, X_, metric="euclidean")
    else:
        X_ = to_time_series_dataset(X)
        n, sz, d = X_.shape
        sklearn_X = X_.reshape((n, -1))

        def sklearn_metric(x, y):
            return metric(to_time_series(x.reshape((sz, d)),
                                         remove_nans=True),
                          to_time_series(y.reshape((sz, d)),
                                         remove_nans=True))
    metric = "precomputed" if sklearn_metric is None else sklearn_metric
    return sklearn_silhouette_score(X=sklearn_X,
                                    labels=labels,
                                    metric=metric,
                                    sample_size=sample_size,
                                    random_state=random_state,
                                    **kwds)
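
A quick sketch exercising the n_jobs path added in this version; only the DTW cross-distance matrix computation is parallelized, so the result is assumed to match a serial run:

import numpy
from tslearn.generators import random_walks

numpy.random.seed(0)
X = random_walks(n_ts=20, sz=16, d=1)
labels = numpy.random.randint(2, size=20)

# Same score as with the default n_jobs; the pairwise DTW matrix is
# simply computed on all available processors.
s = silhouette_score(X, labels, metric="dtw", n_jobs=-1)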
Example #6
def test_kmeans():
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)

    km = TimeSeriesKMeans(n_clusters=3,
                          metric="euclidean",
                          max_iter=5,
                          verbose=False,
                          random_state=rng).fit(time_series)
    dists = cdist(time_series.reshape((n, -1)),
                  km.cluster_centers_.reshape((3, -1)))
    np.testing.assert_allclose(km.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km.labels_, km.predict(time_series))

    km_dba = TimeSeriesKMeans(n_clusters=3,
                              metric="dtw",
                              max_iter=5,
                              verbose=False,
                              random_state=rng).fit(time_series)
    dists = cdist_dtw(time_series, km_dba.cluster_centers_)
    np.testing.assert_allclose(km_dba.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_dba.labels_, km_dba.predict(time_series))

    km_sdtw = TimeSeriesKMeans(n_clusters=3,
                               metric="softdtw",
                               max_iter=5,
                               verbose=False,
                               random_state=rng).fit(time_series)
    dists = cdist_soft_dtw(time_series, km_sdtw.cluster_centers_)
    np.testing.assert_allclose(km_sdtw.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_sdtw.labels_, km_sdtw.predict(time_series))

    km_nofit = TimeSeriesKMeans(n_clusters=101,
                                verbose=False,
                                random_state=rng).fit(time_series)
    assert (km_nofit._X_fit is None)

    X_bis = to_time_series_dataset([[1, 2, 3, 4], [1, 2, 3],
                                    [2, 5, 6, 7, 8, 9]])
    TimeSeriesKMeans(n_clusters=2,
                     verbose=False,
                     max_iter=5,
                     metric="softdtw",
                     random_state=0).fit(X_bis)
    TimeSeriesKMeans(n_clusters=2,
                     verbose=False,
                     max_iter=5,
                     metric="dtw",
                     random_state=0,
                     init="random").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2,
                     verbose=False,
                     max_iter=5,
                     metric="dtw",
                     random_state=0,
                     init="k-means++").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2,
                     verbose=False,
                     max_iter=5,
                     metric="dtw",
                     init=X_bis[:2]).fit(X_bis)

    # Barycenter size (nb of timestamps)
    # Case 1. kmeans++ / random init
    n, sz, d = 15, 10, 1
    n_clusters = 3
    time_series = rng.randn(n, sz, d)

    sizes_all_same_series = [sz] * n_clusters
    km_euc = TimeSeriesKMeans(n_clusters=3,
                              metric="euclidean",
                              max_iter=5,
                              verbose=False,
                              init="k-means++",
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_series,
                            [ts_size(b) for b in km_euc.cluster_centers_])
    km_dba = TimeSeriesKMeans(n_clusters=3,
                              metric="dtw",
                              max_iter=5,
                              verbose=False,
                              init="random",
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_series,
                            [ts_size(b) for b in km_dba.cluster_centers_])

    # Case 2. forced init
    barys = to_time_series_dataset([[1., 2., 3.], [1., 2., 2., 3., 4.],
                                    [3., 2., 1.]])
    sizes_all_same_bary = [barys.shape[1]] * n_clusters
    # If Euclidean is used, barycenters size should be that of the input series
    km_euc = TimeSeriesKMeans(n_clusters=3,
                              metric="euclidean",
                              max_iter=5,
                              verbose=False,
                              init=barys,
                              random_state=rng)
    np.testing.assert_raises(ValueError, km_euc.fit, time_series)

    km_dba = TimeSeriesKMeans(n_clusters=3,
                              metric="dtw",
                              max_iter=5,
                              verbose=False,
                              init=barys,
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary,
                            [ts_size(b) for b in km_dba.cluster_centers_])
    km_sdtw = TimeSeriesKMeans(n_clusters=3,
                               metric="softdtw",
                               max_iter=5,
                               verbose=False,
                               init=barys,
                               random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary,
                            [ts_size(b) for b in km_sdtw.cluster_centers_])

    # A simple dataset, can we extract the correct number of clusters?
    time_series = to_time_series_dataset([[1, 2, 3], [7, 8, 9, 11],
                                          [.1, .2, 2.], [1, 1, 1, 9],
                                          [10, 20, 30, 1000]])
    preds = TimeSeriesKMeans(n_clusters=3,
                             metric="dtw",
                             max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(3)))
    preds = TimeSeriesKMeans(n_clusters=4,
                             metric="dtw",
                             max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(4)))
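
The test above exercises TimeSeriesKMeans end to end; a condensed usage sketch of the estimator it covers (imports as in tslearn):

import numpy
from tslearn.clustering import TimeSeriesKMeans
from tslearn.generators import random_walks

numpy.random.seed(0)
X = random_walks(n_ts=15, sz=10, d=1)

# DBA-averaged k-means under DTW; labels_ holds the cluster assignments.
km = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                      random_state=0).fit(X)
print(km.labels_[:5], km.cluster_centers_.shape)  # centers: (3, 10, 1)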
Example #7
def to_distances(ts_dataset):
    # Condense the symmetric DTW cross-distance matrix (zero diagonal)
    # into the flat pairwise-distance vector scipy expects.
    from scipy.spatial import distance as ssd
    from tslearn.metrics import cdist_dtw
    m = cdist_dtw(ts_dataset)
    m = ssd.squareform(m)
    return m
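
A sketch of the typical downstream use: the condensed DTW distances feed scipy's hierarchical clustering (the linkage method and cluster count here are illustrative choices):

from scipy.cluster.hierarchy import fcluster, linkage
from tslearn.generators import random_walks

X = random_walks(n_ts=10, sz=16, d=1)
condensed = to_distances(X)  # shape (10 * 9 / 2,)

# Average-linkage hierarchical clustering on the condensed distances.
Z = linkage(condensed, method="average")
labels = fcluster(Z, t=3, criterion="maxclust")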