def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
    """Finds the K-neighbors of a point.

    Returns indices of and distances to the neighbors of each point.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        The query time series.
        If not provided, neighbors of each indexed point are returned.
        In this case, the query point is not considered its own neighbor.
        NOTE(review): when ``self.metric`` is one of
        ``TSLEARN_VALID_METRICS``, ``X`` is handed directly to
        ``check_array``, so ``X=None`` is presumably rejected on that
        path -- confirm.
    n_neighbors : int
        Number of neighbors to get (default is the value passed to the
        constructor).
    return_distance : boolean, optional. Defaults to True.
        If False, distances will not be returned

    Returns
    -------
    dist : array
        Array representing the distance to points, only present if
        return_distance=True
    ind : array
        Indices of the nearest points in the population matrix.

    Raises
    ------
    ValueError
        If the recorded time-series metric is neither "dtw" nor "softdtw".
    """
    if self.metric in TSLEARN_VALID_METRICS:
        # The cross-distance matrix is computed here and handed to the
        # mixin as a "precomputed" metric, so `self.metric` is swapped
        # temporarily.
        self._ts_metric = self.metric
        self.metric = "precomputed"
        try:
            metric_params = self._get_metric_params()
            check_is_fitted(self, '_ts_fit')
            X = check_array(X, allow_nd=True, force_all_finite=False)
            X = check_dims(X, X_fit_dims=self._ts_fit.shape, extend=True,
                           check_n_features_only=True)
            if self._ts_metric == "dtw":
                X_ = cdist_dtw(X, self._ts_fit, n_jobs=self.n_jobs,
                               verbose=self.verbose, **metric_params)
            elif self._ts_metric == "softdtw":
                X_ = cdist_soft_dtw(X, self._ts_fit, **metric_params)
            else:
                raise ValueError("Invalid metric recorded: %s" %
                                 self._ts_metric)
            return KNeighborsTimeSeriesMixin.kneighbors(
                self,
                X=X_,
                n_neighbors=n_neighbors,
                return_distance=return_distance)
        finally:
            # Always restore the user-facing metric, even when validation
            # or the distance computation raises; otherwise the estimator
            # would stay stuck with metric="precomputed".
            self.metric = self._ts_metric

    check_is_fitted(self, '_X_fit')
    if X is None:
        X_ = None
    else:
        X = check_array(X, allow_nd=True)
        X = to_time_series_dataset(X)
        X_ = to_sklearn_dataset(X)
        X_ = check_dims(X_, X_fit_dims=self._X_fit.shape, extend=False)
    return KNeighborsTimeSeriesMixin.kneighbors(
        self,
        X=X_,
        n_neighbors=n_neighbors,
        return_distance=return_distance)
def silhouette_score(X, labels, metric=None, sample_size=None,
                     metric_params=None, random_state=None, **kwds):
    """Compute the mean Silhouette Coefficient of all samples (cf. [1]_ and
    [2]_).

    Read more in the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient>`_.

    Parameters
    ----------
    X : array [n_ts, n_ts] if metric == "precomputed", or, \
             [n_ts, sz, d] otherwise
        Array of pairwise distances between time series, or a time series
        dataset.
    labels : array, shape = [n_ts]
         Predicted labels for each time series.
    metric : string, callable or None
        The metric to use when calculating distance between time series.
        Should be one of {'dtw', 'softdtw', 'euclidean'} or a callable
        distance function.
        If X is the distance array itself, use ``metric="precomputed"``.
        If None, DTW is used.
    sample_size : int or None
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.
    metric_params : dict or None
        Parameter values for the chosen metric. Value associated to the
        `"gamma_sdtw"` key corresponds to the gamma parameter in Soft-DTW.
    random_state : int, RandomState instance or None, optional (default=None)
        The generator used to randomly select a subset of samples.  If int,
        random_state is the seed used by the random number generator; If
        RandomState instance, random_state is the random number generator; If
        None, the random number generator is the RandomState instance used by
        `np.random`. Used when ``sample_size is not None``.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
           <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from tslearn.generators import random_walks
    >>> from tslearn.metrics import cdist_dtw
    >>> X = random_walks(n_ts=50, sz=32, d=1)
    >>> labels = numpy.random.randint(2, size=50)
    >>> s_sc = silhouette_score(X, labels, metric="dtw")
    >>> s_sc2 = silhouette_score(X, labels, metric="euclidean")
    >>> s_sc3 = silhouette_score(X, labels, metric="softdtw")
    >>> s_sc3b = silhouette_score(X, labels, metric="softdtw",
    ...                           metric_params={"gamma_sdtw": 2.})
    >>> s_sc4 = silhouette_score(cdist_dtw(X), labels, metric="precomputed")
    """
    # Stays None whenever a full distance matrix is computed up front; it is
    # only bound to a callable in the generic (callable / None) branch.
    sklearn_metric = None
    if metric_params is None:
        metric_params = {}

    if metric == "precomputed":
        sklearn_X = X
    elif metric == "dtw":
        sklearn_X = cdist_dtw(X)
    elif metric == "softdtw":
        gamma = metric_params.get("gamma_sdtw", None)
        if gamma is not None:
            sklearn_X = cdist_soft_dtw(X, gamma=gamma)
        else:
            sklearn_X = cdist_soft_dtw(X)
    elif metric == "euclidean":
        X_ = to_time_series_dataset(X)
        # Use the converted array's own length: the raw `X` may be a plain
        # list of series, which has no `.shape` attribute.
        X_ = X_.reshape((X_.shape[0], -1))
        sklearn_X = cdist(X_, X_, metric="euclidean")
    else:
        X_ = to_time_series_dataset(X)
        n, sz, d = X_.shape
        # scikit-learn works on 2D samples: flatten each series here and
        # rebuild its (sz, d) layout inside the metric wrapper.
        sklearn_X = X_.reshape((n, -1))
        if metric is None:
            metric = dtw

        def sklearn_metric(x, y):
            return metric(
                to_time_series(x.reshape((sz, d)), remove_nans=True),
                to_time_series(y.reshape((sz, d)), remove_nans=True))

    return sklearn_silhouette_score(
        X=sklearn_X,
        labels=labels,
        metric="precomputed" if sklearn_metric is None else sklearn_metric,
        sample_size=sample_size,
        random_state=random_state,
        **kwds)
def dynamic_time_wrap(data):
    """Return the pairwise DTW cross-distance matrix of ``data.transpose()``.

    ``data`` must expose a ``transpose()`` method; the transposed object is
    passed as the single dataset argument of ``tslearn.metrics.cdist_dtw``.
    NOTE(review): presumably ``data`` is laid out as (time, series) so that
    each row of the transpose is one series -- confirm against the caller.
    """
    # Imported lazily, as in the original, so tslearn is only required when
    # this function is actually called.
    from tslearn.metrics import cdist_dtw

    print('|--- Construct adj_mx by DTW')
    transposed = data.transpose()
    return cdist_dtw(transposed)
def metric_fun(x, y):
    """Compute the DTW cross-distance between the datasets ``x`` and ``y``.

    This is a closure: ``self`` and ``metric_params`` are read from the
    enclosing scope, not passed in.
    """
    jobs = self.n_jobs
    verbosity = self.verbose
    return cdist_dtw(x, y, n_jobs=jobs, verbose=verbosity, **metric_params)
def silhouette_score(X, labels, metric=None, sample_size=None,
                     metric_params=None, n_jobs=None, verbose=0,
                     random_state=None, **kwds):
    """Compute the mean Silhouette Coefficient of all samples (cf. [1]_ and
    [2]_).

    Read more in the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/clustering.html\
    #silhouette-coefficient>`_.

    Parameters
    ----------
    X : array [n_ts, n_ts] if metric == "precomputed", or, \
             [n_ts, sz, d] otherwise
        Array of pairwise distances between time series, or a time series
        dataset.
    labels : array, shape = [n_ts]
         Predicted labels for each time series.
    metric : string, callable or None (default: None)
        The metric to use when calculating distance between time series.
        Should be one of {'dtw', 'softdtw', 'euclidean'} or a callable
        distance function or None.
        If 'softdtw' is passed, a normalized version of Soft-DTW is used that
        is defined as `sdtw_(x,y) := sdtw(x,y) - 1/2(sdtw(x,x)+sdtw(y,y))`.
        If X is the distance array itself, use ``metric="precomputed"``.
        If None, dtw is used.
    sample_size : int or None (default: None)
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.
    metric_params : dict or None (default: None)
        Parameter values for the chosen metric.
        For metrics that accept parallelization of the cross-distance matrix
        computations, `n_jobs` key passed in `metric_params` is overridden by
        the `n_jobs` argument.
        Value associated to the `"gamma_sdtw"` key corresponds to the gamma
        parameter in Soft-DTW.
        The dictionary passed by the caller is copied, never modified.

        .. deprecated:: 0.2
            `"gamma_sdtw"` as a key for `metric_params` is deprecated in
            version 0.2 and will be removed in 0.4.

    n_jobs : int or None, optional (default=None)
        The number of jobs to run in parallel for cross-distance matrix
        computations.
        Ignored if the cross-distance matrix cannot be computed using
        parallelization.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See scikit-learns'
        `Glossary <https://scikit-learn.org/stable/glossary.html#term-n-jobs>`_
        for more details.
    verbose : int (default: 0)
        Verbosity level forwarded to the DTW cross-distance computation.
        Only used when ``metric`` is ``"dtw"`` or None.
    random_state : int, RandomState instance or None, optional (default: None)
        The generator used to randomly select a subset of samples.  If int,
        random_state is the seed used by the random number generator; If
        RandomState instance, random_state is the random number generator; If
        None, the random number generator is the RandomState instance used by
        `np.random`. Used when ``sample_size is not None``.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function,
        just as for the `metric_params` parameter.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
           <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from tslearn.generators import random_walks
    >>> from tslearn.metrics import cdist_dtw
    >>> numpy.random.seed(0)
    >>> X = random_walks(n_ts=20, sz=16, d=1)
    >>> labels = numpy.random.randint(2, size=20)
    >>> silhouette_score(X, labels, metric="dtw")  # doctest: +ELLIPSIS
    0.13383800...
    >>> silhouette_score(X, labels, metric="euclidean")  # doctest: +ELLIPSIS
    0.09126917...
    >>> silhouette_score(X, labels, metric="softdtw")  # doctest: +ELLIPSIS
    0.17953934...
    >>> silhouette_score(X, labels, metric="softdtw",
    ...                  metric_params={"gamma": 2.}) \
    # doctest: +ELLIPSIS
    0.17591060...
    >>> silhouette_score(cdist_dtw(X), labels,
    ...                  metric="precomputed")  # doctest: +ELLIPSIS
    0.13383800...
    """
    # Stays None whenever a full distance matrix is computed up front; it is
    # only bound to a callable in the generic (callable metric) branch.
    sklearn_metric = None

    # Work on a private copy so the caller's dictionary is left untouched.
    if metric_params is None:
        metric_params_ = {}
    else:
        metric_params_ = metric_params.copy()

    if "gamma_sdtw" in metric_params_:
        warnings.warn(
            "'gamma_sdtw' is deprecated in version 0.2 and will be "
            "removed in 0.4. Use `gamma` instead of `gamma_sdtw` as a "
            "`metric_params` key to remove this warning.",
            DeprecationWarning, stacklevel=2)
        metric_params_["gamma"] = metric_params_.pop("gamma_sdtw")

    # Extra keyword arguments take precedence over `metric_params` entries.
    metric_params_.update(kwds)
    # The explicit `n_jobs` argument wins over any `n_jobs` metric parameter.
    metric_params_.pop("n_jobs", None)

    if metric == "precomputed":
        sklearn_X = X
    elif metric == "dtw" or metric is None:
        sklearn_X = cdist_dtw(X, n_jobs=n_jobs, verbose=verbose,
                              **metric_params_)
    elif metric == "softdtw":
        sklearn_X = cdist_soft_dtw_normalized(X, **metric_params_)
    elif metric == "euclidean":
        X_ = to_time_series_dataset(X)
        # Use the converted array's own length: the raw `X` may be a plain
        # list of series, which has no `.shape` attribute.
        X_ = X_.reshape((X_.shape[0], -1))
        sklearn_X = cdist(X_, X_, metric="euclidean")
    else:
        X_ = to_time_series_dataset(X)
        n, sz, d = X_.shape
        # scikit-learn works on 2D samples: flatten each series here and
        # rebuild its (sz, d) layout inside the metric wrapper.
        sklearn_X = X_.reshape((n, -1))

        def sklearn_metric(x, y):
            return metric(to_time_series(x.reshape((sz, d)),
                                         remove_nans=True),
                          to_time_series(y.reshape((sz, d)),
                                         remove_nans=True))

    metric = "precomputed" if sklearn_metric is None else sklearn_metric
    return sklearn_silhouette_score(X=sklearn_X,
                                    labels=labels,
                                    metric=metric,
                                    sample_size=sample_size,
                                    random_state=random_state,
                                    **kwds)
def test_kmeans():
    """Exercise TimeSeriesKMeans with euclidean, DTW and soft-DTW metrics.

    Covers label/predict consistency, a run with more clusters than series,
    variable-length inputs, barycenter sizes under several initialisations,
    and recovery of the requested number of clusters on a tiny dataset.

    A single ``RandomState`` is shared by most estimators, so the statements
    below must stay in this exact order to consume it identically.
    """

    def center_sizes(model):
        # Actual length (number of timestamps) of every fitted barycenter.
        return [ts_size(center) for center in model.cluster_centers_]

    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)

    # --- labels must match the nearest center and predict() must agree ---
    km = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                          verbose=False, random_state=rng)
    km = km.fit(time_series)
    flat_series = time_series.reshape((n, -1))
    flat_centers = km.cluster_centers_.reshape((3, -1))
    dists = cdist(flat_series, flat_centers)
    np.testing.assert_allclose(km.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km.labels_, km.predict(time_series))

    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False, random_state=rng)
    km_dba = km_dba.fit(time_series)
    dists = cdist_dtw(time_series, km_dba.cluster_centers_)
    np.testing.assert_allclose(km_dba.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_dba.labels_, km_dba.predict(time_series))

    km_sdtw = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                               verbose=False, random_state=rng)
    km_sdtw = km_sdtw.fit(time_series)
    dists = cdist_soft_dtw(time_series, km_sdtw.cluster_centers_)
    np.testing.assert_allclose(km_sdtw.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_sdtw.labels_, km_sdtw.predict(time_series))

    # --- more clusters requested than there are series ---
    km_nofit = TimeSeriesKMeans(n_clusters=101, verbose=False,
                                random_state=rng)
    km_nofit = km_nofit.fit(time_series)
    assert (km_nofit._X_fit is None)

    # --- variable-length series with each supported initialisation ---
    X_bis = to_time_series_dataset([[1, 2, 3, 4],
                                    [1, 2, 3],
                                    [2, 5, 6, 7, 8, 9]])
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="softdtw", random_state=0).fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0,
                     init="random").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0,
                     init="k-means++").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", init=X_bis[:2]).fit(X_bis)

    # Barycenter size (nb of timestamps)
    # Case 1. kmeans++ / random init
    n, sz, d = 15, 10, 1
    n_clusters = 3
    time_series = rng.randn(n, sz, d)

    sizes_all_same_series = [sz] * n_clusters
    km_euc = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                              verbose=False, init="k-means++",
                              random_state=rng)
    km_euc = km_euc.fit(time_series)
    np.testing.assert_equal(sizes_all_same_series, center_sizes(km_euc))

    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False, init="random",
                              random_state=rng)
    km_dba = km_dba.fit(time_series)
    np.testing.assert_equal(sizes_all_same_series, center_sizes(km_dba))

    # Case 2. forced init
    barys = to_time_series_dataset([[1., 2., 3.],
                                    [1., 2., 2., 3., 4.],
                                    [3., 2., 1.]])
    sizes_all_same_bary = [barys.shape[1]] * n_clusters

    # If Euclidean is used, barycenters size should be that of the input
    # series, so fitting with these initial barycenters must be rejected.
    km_euc = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                              verbose=False, init=barys,
                              random_state=rng)
    np.testing.assert_raises(ValueError, km_euc.fit, time_series)

    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False, init=barys,
                              random_state=rng)
    km_dba = km_dba.fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary, center_sizes(km_dba))

    km_sdtw = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                               verbose=False, init=barys,
                               random_state=rng)
    km_sdtw = km_sdtw.fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary, center_sizes(km_sdtw))

    # A simple dataset, can we extract the correct number of clusters?
    time_series = to_time_series_dataset([[1, 2, 3],
                                          [7, 8, 9, 11],
                                          [.1, .2, 2.],
                                          [1, 1, 1, 9],
                                          [10, 20, 30, 1000]])
    preds = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(3)))
    preds = TimeSeriesKMeans(n_clusters=4, metric="dtw", max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(4)))
def to_distances(ts_dataset):
    """Return the DTW cross-distances of ``ts_dataset`` after ``squareform``.

    The pairwise matrix from ``cdist_dtw(ts_dataset)`` is passed through
    ``ssd.squareform`` and the result is returned.
    NOTE(review): ``ssd`` is presumably ``scipy.spatial.distance``, in which
    case the square matrix is converted to its condensed vector form --
    confirm against the file's imports.
    """
    return ssd.squareform(cdist_dtw(ts_dataset))