def _precompute_cross_dist(self, X, other_X=None):
    """Build the cross-distance matrix between ``X`` and ``other_X``.

    Records the user-facing metric in ``self._ts_metric`` and switches
    ``self.metric`` to ``"precomputed"`` so that downstream estimator
    code consumes the matrix returned here. When ``other_X`` is not
    given, distances are computed against the fitted dataset.
    """
    if other_X is None:
        other_X = self._ts_fit
    self._ts_metric = self.metric
    self.metric = "precomputed"
    metric_params = self._get_metric_params()
    dataset = check_array(X, allow_nd=True, force_all_finite=False)
    dataset = to_time_series_dataset(dataset)
    if self._ts_metric == "dtw":
        dists = cdist_dtw(dataset, other_X, n_jobs=self.n_jobs,
                          **metric_params)
    elif self._ts_metric == "ctw":
        dists = cdist_ctw(dataset, other_X, **metric_params)
    elif self._ts_metric == "softdtw":
        dists = cdist_soft_dtw(dataset, other_X, **metric_params)
    elif self._ts_metric == "sax":
        # SAX needs its own preprocessing before distances are computed.
        dataset = self._sax_preprocess(dataset, **metric_params)
        dists = cdist_sax(dataset, self._sax.breakpoints_avg_,
                          self._sax._X_fit_dims_[1], other_X,
                          n_jobs=self.n_jobs)
    else:
        raise ValueError("Invalid metric recorded: %s" % self._ts_metric)
    return dists
def _assign(self, X, update_class_attributes=True):
    """Assign every series in ``X`` to its closest cluster center.

    When ``update_class_attributes`` is True, ``labels_`` and
    ``inertia_`` are refreshed on the estimator as a side effect.
    Returns the array of matched cluster indices.
    """
    if self.metric == "euclidean":
        flat_series = X.reshape((X.shape[0], -1))
        flat_centers = self.cluster_centers_.reshape((self.n_clusters, -1))
        dist_matrix = cdist(flat_series, flat_centers, metric="euclidean")
    elif self.metric == "dtw":
        dist_matrix = cdist_dtw(X, self.cluster_centers_)
    elif self.metric == "softdtw":
        dist_matrix = cdist_soft_dtw(X, self.cluster_centers_,
                                     gamma=self.gamma_sdtw)
    else:
        raise ValueError(
            "Incorrect metric: %s (should be one of 'dtw', 'softdtw', "
            "'euclidean')" % self.metric)
    matched_labels = dist_matrix.argmin(axis=1)
    if update_class_attributes:
        self.labels_ = matched_labels
        _check_no_empty_cluster(self.labels_, self.n_clusters)
        # Optionally report inertia in DTW terms even when another
        # metric drove the assignment.
        if self.dtw_inertia and self.metric != "dtw":
            inertia_dists = cdist_dtw(X, self.cluster_centers_)
        else:
            inertia_dists = dist_matrix
        self.inertia_ = _compute_inertia(inertia_dists, self.labels_,
                                         self._squared_inertia)
    return matched_labels
def predict_proba(self, X):
    """Predict the class probabilities for the provided data.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        Test samples.

    Returns
    -------
    array, shape = (n_ts, n_classes)
        Array of predicted class probabilities
    """
    if self.metric in VARIABLE_LENGTH_METRICS:
        self._ts_metric = self.metric
        # Temporarily switch to "precomputed" so the parent kNN
        # implementation consumes the cross-distance matrix built below.
        self.metric = "precomputed"
        try:
            if self.metric_params is None:
                metric_params = {}
            else:
                metric_params = self.metric_params.copy()
            # n_jobs/verbose are forwarded explicitly below; drop them so
            # they are not passed twice to the cdist_* helpers.
            metric_params.pop("n_jobs", None)
            metric_params.pop("verbose", None)
            check_is_fitted(self, '_ts_fit')
            X = check_array(X, allow_nd=True, force_all_finite=False)
            X = to_time_series_dataset(X)
            if self._ts_metric == "dtw":
                X_ = cdist_dtw(X, self._ts_fit, n_jobs=self.n_jobs,
                               verbose=self.verbose, **metric_params)
            elif self._ts_metric == "softdtw":
                X_ = cdist_soft_dtw(X, self._ts_fit, **metric_params)
            else:
                raise ValueError("Invalid metric recorded: %s" %
                                 self._ts_metric)
            return super(KNeighborsTimeSeriesClassifier,
                         self).predict_proba(X_)
        finally:
            # BUGFIX: restore the user-facing metric on every exit path.
            # The original only restored it on success, so a raise from
            # check_is_fitted/check_array or an invalid recorded metric
            # left the estimator stuck with metric == "precomputed".
            self.metric = self._ts_metric
    else:
        check_is_fitted(self, '_X_fit')
        X = check_array(X, allow_nd=True)
        X = to_time_series_dataset(X)
        X_ = to_sklearn_dataset(X)
        X_ = check_dims(X_, self._X_fit, extend=False)
        return super(KNeighborsTimeSeriesClassifier,
                     self).predict_proba(X_)
def _transform(self, X):
    """Return the (n_ts, n_clusters) distance matrix from ``X`` to the
    fitted cluster centers, using the configured metric."""
    params = self._get_metric_params()
    centers = self.cluster_centers_
    if self.metric == "euclidean":
        return cdist(X.reshape((X.shape[0], -1)),
                     centers.reshape((self.n_clusters, -1)),
                     metric="euclidean")
    if self.metric == "dtw":
        return cdist_dtw(X, centers, n_jobs=self.n_jobs,
                         verbose=self.verbose, **params)
    if self.metric == "softdtw":
        return cdist_soft_dtw(X, centers, **params)
    raise ValueError("Incorrect metric: %s (should be one of 'dtw', "
                     "'softdtw', 'euclidean')" % self.metric)
def _assign(self, X, update_class_attributes=True):
    """Assign each series in ``X`` to its nearest cluster center.

    Side effects (when ``update_class_attributes`` is True): updates
    ``labels_`` and ``inertia_``. Returns the matched cluster indices.
    """
    if self.metric_params is None:
        params = {}
    else:
        params = self.metric_params.copy()
    # Legacy key: remap gamma_sdtw onto soft-DTW's gamma kwarg.
    if "gamma_sdtw" in params:
        params["gamma"] = params.pop("gamma_sdtw")
    # n_jobs is forwarded explicitly where relevant; avoid duplication.
    params.pop("n_jobs", None)
    if self.metric == "euclidean":
        dist_matrix = cdist(
            X.reshape((X.shape[0], -1)),
            self.cluster_centers_.reshape((self.n_clusters, -1)),
            metric="euclidean")
    elif self.metric == "dtw":
        dist_matrix = cdist_dtw(X, self.cluster_centers_,
                                n_jobs=self.n_jobs,
                                verbose=self.verbose, **params)
    elif self.metric == "softdtw":
        dist_matrix = cdist_soft_dtw(X, self.cluster_centers_, **params)
    else:
        raise ValueError("Incorrect metric: %s (should be one of 'dtw', "
                         "'softdtw', 'euclidean')" % self.metric)
    matched_labels = dist_matrix.argmin(axis=1)
    if update_class_attributes:
        self.labels_ = matched_labels
        _check_no_empty_cluster(self.labels_, self.n_clusters)
        # Optionally compute inertia under DTW even if assignment used a
        # different metric.
        if self.dtw_inertia and self.metric != "dtw":
            inertia_dists = cdist_dtw(X, self.cluster_centers_,
                                      n_jobs=self.n_jobs,
                                      verbose=self.verbose)
        else:
            inertia_dists = dist_matrix
        self.inertia_ = _compute_inertia(inertia_dists, self.labels_,
                                         self._squared_inertia)
    return matched_labels
# Plot 15 individual sample density curves (faint black) against the
# rescaled cluster density curve for k=4.
# NOTE(review): `x`, `sample5density`, `scaler_density_train` and
# `density_5` are defined elsewhere (presumably earlier in the
# notebook/script) — confirm against the surrounding context.
for i in range(0, 15):
    plt.plot(x, sample5density[i], 'k-', alpha=.2)
plt.plot(x, scaler_density_train.inverse_transform(density_5), 'r-',
         label='density')
plt.xlabel('hours of the day')
plt.ylabel('density')
plt.title('k=4')
plt.legend()
plt.show()
# Similarity between centroids of the clusters.
from tslearn.metrics import soft_dtw, cdist_soft_dtw
similarity = []
# Pairwise soft-DTW distance matrix between all cluster centroids.
matrix = cdist_soft_dtw(centroids, gamma=1.)
# Bare expression: notebook-style display of the matrix.
matrix
# The maximum entry of the soft-DTW distance matrix is recorded as the
# similarity score for this clustering.
sim = matrix.max()
similarity.append(sim)
similarity = np.array(similarity)
diss = list(-similarity)
cluster = np.arange(2, 8)
# NOTE(review): `cluster` has 6 values (2..7) but `diss` holds a single
# score from one clustering — plt.plot will fail on the length mismatch
# unless this cell is meant to run inside a loop over cluster counts.
# Confirm intended usage.
plt.title('soft-DTW similarity measure S60')
plt.plot(cluster, diss)
plt.xlabel('n° of cluster')
plt.ylabel('similarity between closest clusters')
plt.show()
# Visualization.
def kneighbors(self, X=None, n_neighbors=None, return_distance=True):
    """Finds the K-neighbors of a point.

    Returns indices of and distances to the neighbors of each point.

    Parameters
    ----------
    X : array-like, shape (n_ts, sz, d)
        The query time series.
        If not provided, neighbors of each indexed point are returned.
        In this case, the query point is not considered its own
        neighbor.
    n_neighbors : int
        Number of neighbors to get (default is the value passed to the
        constructor).
    return_distance : boolean, optional. Defaults to True.
        If False, distances will not be returned

    Returns
    -------
    dist : array
        Array representing the distance to points, only present if
        return_distance=True
    ind : array
        Indices of the nearest points in the population matrix.
    """
    if self.metric in VARIABLE_LENGTH_METRICS:
        self._ts_metric = self.metric
        # Temporarily switch to "precomputed" so the sklearn mixin
        # consumes the cross-distance matrix computed below.
        self.metric = "precomputed"
        try:
            if self.metric_params is None:
                metric_params = {}
            else:
                metric_params = self.metric_params.copy()
            # n_jobs/verbose are forwarded explicitly below; drop them
            # so they are not passed twice to the cdist_* helpers.
            metric_params.pop("n_jobs", None)
            metric_params.pop("verbose", None)
            check_is_fitted(self, '_ts_fit')
            X = check_array(X, allow_nd=True, force_all_finite=False)
            X = to_time_series_dataset(X)
            if self._ts_metric == "dtw":
                X_ = cdist_dtw(X, self._ts_fit, n_jobs=self.n_jobs,
                               verbose=self.verbose, **metric_params)
            elif self._ts_metric == "softdtw":
                X_ = cdist_soft_dtw(X, self._ts_fit, **metric_params)
            else:
                raise ValueError("Invalid metric recorded: %s" %
                                 self._ts_metric)
            return KNeighborsTimeSeriesMixin.kneighbors(
                self,
                X=X_,
                n_neighbors=n_neighbors,
                return_distance=return_distance)
        finally:
            # BUGFIX: restore the user-facing metric on every exit path.
            # The original only restored it on success, so a raise from
            # validation or an invalid recorded metric left the
            # estimator stuck with metric == "precomputed".
            self.metric = self._ts_metric
    else:
        check_is_fitted(self, '_X_fit')
        if X is None:
            X_ = None
        else:
            X = check_array(X, allow_nd=True)
            X = to_time_series_dataset(X)
            X_ = to_sklearn_dataset(X)
            X_ = check_dims(X_, self._X_fit, extend=False)
        return KNeighborsTimeSeriesMixin.kneighbors(
            self,
            X=X_,
            n_neighbors=n_neighbors,
            return_distance=return_distance)
def silhouette_score(X, labels, metric=None, sample_size=None,
                     metric_params=None, random_state=None, **kwds):
    """Compute the mean Silhouette Coefficient of all samples
    (cf. [1]_ and [2]_).

    Read more in the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient>`_.

    Parameters
    ----------
    X : array [n_ts, n_ts] if metric == "precomputed", or, \
            [n_ts, sz, d] otherwise
        Array of pairwise distances between time series, or a time
        series dataset.
    labels : array, shape = [n_ts]
        Predicted labels for each time series.
    metric : string, or callable
        The metric to use when calculating distance between time
        series. Should be one of {'dtw', 'softdtw', 'euclidean'} or a
        callable distance function. If X is the distance array itself,
        use ``metric="precomputed"``.
    sample_size : int or None
        The size of the sample to use when computing the Silhouette
        Coefficient on a random subset of the data. If
        ``sample_size is None``, no sampling is used.
    metric_params : dict or None
        Parameter values for the chosen metric. Value associated to the
        `"gamma_sdtw"` key corresponds to the gamma parameter in
        Soft-DTW.
    random_state : int, RandomState instance or None, optional (default=None)
        The generator used to randomly select a subset of samples. If
        int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random
        number generator; If None, the random number generator is the
        RandomState instance used by `np.random`. Used when
        ``sample_size is not None``.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance
        function.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to
       the Interpretation and Validation of Cluster Analysis".
       Computational and Applied Mathematics 20: 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_

    .. [2] `Wikipedia entry on the Silhouette Coefficient
       <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from tslearn.generators import random_walks
    >>> from tslearn.metrics import cdist_dtw
    >>> X = random_walks(n_ts=50, sz=32, d=1)
    >>> labels = numpy.random.randint(2, size=50)
    >>> s_sc = silhouette_score(X, labels, metric="dtw")
    >>> s_sc2 = silhouette_score(X, labels, metric="softdtw")
    >>> s_sc2b = silhouette_score(X, labels, metric="softdtw",
    ...                           metric_params={"gamma_sdtw": 2.})
    >>> s_sc3 = silhouette_score(cdist_dtw(X), labels,
    ...                          metric="precomputed")
    """
    if metric_params is None:
        metric_params = {}
    # When a known metric name is given we precompute the full distance
    # matrix; otherwise a per-pair callable is handed to sklearn.
    custom_metric = None
    if metric == "precomputed":
        dist_matrix = X
    elif metric == "dtw":
        dist_matrix = cdist_dtw(X)
    elif metric == "softdtw":
        gamma = metric_params.get("gamma_sdtw", None)
        if gamma is None:
            dist_matrix = cdist_soft_dtw(X)
        else:
            dist_matrix = cdist_soft_dtw(X, gamma=gamma)
    elif metric == "euclidean":
        dist_matrix = cdist(X, X, metric="euclidean")
    else:
        # Callable metric (or None, which falls back to DTW): sklearn
        # receives flattened series and the callable restores shapes.
        dataset = to_time_series_dataset(X)
        n, sz, d = dataset.shape
        dist_matrix = dataset.reshape((n, -1))
        if metric is None:
            metric = dtw

        def custom_metric(x, y):
            return metric(
                to_time_series(x.reshape((sz, d)), remove_nans=True),
                to_time_series(y.reshape((sz, d)), remove_nans=True))

    return sklearn_silhouette_score(
        X=dist_matrix,
        labels=labels,
        metric="precomputed" if custom_metric is None else custom_metric,
        sample_size=sample_size,
        random_state=random_state,
        **kwds)
def test_kmeans():
    """Behavior tests for TimeSeriesKMeans across metrics, inits and
    variable-length inputs.

    NOTE: statement order is load-bearing — the shared ``rng`` instance
    threads its state through every fit below.
    """
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    # Euclidean k-means: labels must equal the nearest-center assignment
    # and predict() must agree with labels_ on the training data.
    km = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                          verbose=False, random_state=rng).fit(time_series)
    dists = cdist(time_series.reshape((n, -1)),
                  km.cluster_centers_.reshape((3, -1)))
    np.testing.assert_allclose(km.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km.labels_, km.predict(time_series))
    # Same invariants under the DTW metric.
    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False,
                              random_state=rng).fit(time_series)
    dists = cdist_dtw(time_series, km_dba.cluster_centers_)
    np.testing.assert_allclose(km_dba.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_dba.labels_, km_dba.predict(time_series))
    # Same invariants under soft-DTW.
    km_sdtw = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                               verbose=False,
                               random_state=rng).fit(time_series)
    dists = cdist_soft_dtw(time_series, km_sdtw.cluster_centers_)
    np.testing.assert_allclose(km_sdtw.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_sdtw.labels_,
                               km_sdtw.predict(time_series))
    # More clusters than samples: fit leaves _X_fit unset (None).
    km_nofit = TimeSeriesKMeans(n_clusters=101, verbose=False,
                                random_state=rng).fit(time_series)
    assert(km_nofit._X_fit is None)
    # Variable-length series must be accepted by all inits/metrics below.
    X_bis = to_time_series_dataset([[1, 2, 3, 4],
                                    [1, 2, 3],
                                    [2, 5, 6, 7, 8, 9]])
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="softdtw", random_state=0).fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0,
                     init="random").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0,
                     init="k-means++").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", init=X_bis[:2]).fit(X_bis)
    # Barycenter size (nb of timestamps)
    # Case 1. kmeans++ / random init
    n, sz, d = 15, 10, 1
    n_clusters = 3
    time_series = rng.randn(n, sz, d)
    sizes_all_same_series = [sz] * n_clusters
    km_euc = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                              verbose=False, init="k-means++",
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_series,
                            [ts_size(b) for b in km_euc.cluster_centers_])
    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False, init="random",
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_series,
                            [ts_size(b) for b in km_dba.cluster_centers_])
    # Case 2. forced init
    barys = to_time_series_dataset([[1., 2., 3.],
                                    [1., 2., 2., 3., 4.],
                                    [3., 2., 1.]])
    sizes_all_same_bary = [barys.shape[1]] * n_clusters
    # If Euclidean is used, barycenters size should be that of the input
    # series, so a mismatched forced init must raise.
    km_euc = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                              verbose=False, init=barys, random_state=rng)
    np.testing.assert_raises(ValueError, km_euc.fit, time_series)
    # DTW/soft-DTW keep the forced barycenter sizes.
    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False, init=barys,
                              random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary,
                            [ts_size(b) for b in km_dba.cluster_centers_])
    km_sdtw = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                               verbose=False, init=barys,
                               random_state=rng).fit(time_series)
    np.testing.assert_equal(sizes_all_same_bary,
                            [ts_size(b) for b in km_sdtw.cluster_centers_])
    # A simple dataset, can we extract the correct number of clusters?
    time_series = to_time_series_dataset([[1, 2, 3],
                                          [7, 8, 9, 11],
                                          [.1, .2, 2.],
                                          [1, 1, 1, 9],
                                          [10, 20, 30, 1000]])
    preds = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(3)))
    preds = TimeSeriesKMeans(n_clusters=4, metric="dtw", max_iter=5,
                             random_state=rng).fit_predict(time_series)
    np.testing.assert_equal(set(preds), set(range(4)))
def metric_fun(x, y):
    # Pairwise soft-DTW cross-distance between two datasets.
    # ``metric_params`` comes from the enclosing scope — presumably
    # holds soft-DTW kwargs such as ``gamma``; confirm against caller.
    return cdist_soft_dtw(x, y, **metric_params)
def test_kmeans():
    """Behavior tests for TimeSeriesKMeans (short variant).

    NOTE: statement order is load-bearing — the shared ``rng`` instance
    threads its state through every fit below.
    """
    n, sz, d = 15, 10, 3
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    # Euclidean k-means: labels must equal the nearest-center assignment
    # and predict() must agree with labels_ on the training data.
    km = TimeSeriesKMeans(n_clusters=3, metric="euclidean", max_iter=5,
                          verbose=False, random_state=rng).fit(time_series)
    dists = cdist(time_series.reshape((n, -1)),
                  km.cluster_centers_.reshape((3, -1)))
    np.testing.assert_allclose(km.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km.labels_, km.predict(time_series))
    # Same invariants under the DTW metric.
    km_dba = TimeSeriesKMeans(n_clusters=3, metric="dtw", max_iter=5,
                              verbose=False,
                              random_state=rng).fit(time_series)
    dists = cdist_dtw(time_series, km_dba.cluster_centers_)
    np.testing.assert_allclose(km_dba.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_dba.labels_, km_dba.predict(time_series))
    # Same invariants under soft-DTW.
    km_sdtw = TimeSeriesKMeans(n_clusters=3, metric="softdtw", max_iter=5,
                               verbose=False,
                               random_state=rng).fit(time_series)
    dists = cdist_soft_dtw(time_series, km_sdtw.cluster_centers_)
    np.testing.assert_allclose(km_sdtw.labels_, dists.argmin(axis=1))
    np.testing.assert_allclose(km_sdtw.labels_,
                               km_sdtw.predict(time_series))
    # More clusters than samples: fit leaves _X_fit unset (None).
    km_nofit = TimeSeriesKMeans(n_clusters=101, verbose=False,
                                random_state=rng).fit(time_series)
    assert (km_nofit._X_fit is None)
    # Variable-length series must be accepted by all inits/metrics below.
    X_bis = to_time_series_dataset([[1, 2, 3, 4],
                                    [1, 2, 3],
                                    [2, 5, 6, 7, 8, 9]])
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="softdtw", random_state=0).fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0,
                     init="random").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", random_state=0,
                     init="k-means++").fit(X_bis)
    TimeSeriesKMeans(n_clusters=2, verbose=False, max_iter=5,
                     metric="dtw", init=X_bis[:2]).fit(X_bis)