def lr_dtw(s1, s2, gamma=0.):
    """Return the Locally Regularized DTW (LR-DTW) score of two time series.

    The series may be multidimensional and need not have the same length,
    but they must share the same dimensionality.

    Parameters
    ----------
    s1
        First time series.
    s2
        Second time series.
    gamma : float (default: 0.)
        Regularization parameter.

    Returns
    -------
    float
        Similarity score.

    See Also
    --------
    lr_dtw_path : Get both the matching path and the similarity score for
        LR-DTW
    cdist_lr_dtw : Cross similarity matrix between time series datasets
    dtw : Dynamic Time Warping score
    dtw_path : Get both the matching path and the similarity score for DTW
    """
    ts1 = to_time_series(s1)
    ts2 = to_time_series(s2)
    # The first item of the result is the score; the rest is only needed by
    # callers that want the path.
    result = cylr_dtw(ts1, ts2, gamma=gamma)
    return result[0]
def dtw_path(s1, s2, global_constraint=None, sakoe_chiba_radius=1):
    """Return the DTW alignment path and similarity of two time series.

    The series may be multidimensional and need not have the same length,
    but they must share the same dimensionality.

    DTW was originally presented in [1]_.

    Parameters
    ----------
    s1
        A time series.
    s2
        Another time series.
    global_constraint : {"itakura", "sakoe_chiba"} or None (default: None)
        Global constraint to restrict admissible paths for DTW. Any other
        value is treated like None (no constraint).
    sakoe_chiba_radius : int (default: 1)
        Radius to be used for Sakoe-Chiba band global constraint. Used only
        if global_constraint is "sakoe_chiba".

    Returns
    -------
    list of integer pairs
        Matching path represented as a list of index pairs. In each pair, the
        first index corresponds to s1 and the second one corresponds to s2
    float
        Similarity score

    Examples
    --------
    >>> path, dist = dtw_path([1, 2, 3], [1., 2., 2., 3.])
    >>> path
    [(0, 0), (1, 1), (1, 2), (2, 3)]
    >>> dist
    0.0
    >>> dtw_path([1, 2, 3], [1., 2., 2., 3., 4.])[1]
    1.0

    See Also
    --------
    dtw : Get only the similarity score for DTW
    cdist_dtw : Cross similarity matrix between time series datasets

    References
    ----------
    .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for
           spoken word recognition," IEEE Transactions on Acoustics, Speech and
           Signal Processing, vol. 26(1), pp. 43--49, 1978.
    """
    ts1 = to_time_series(s1)
    ts2 = to_time_series(s2)
    n_rows, n_cols = ts1.shape[0], ts2.shape[0]

    # Pick the admissibility mask first so there is a single call site below.
    if global_constraint == "sakoe_chiba":
        mask = sakoe_chiba_mask(n_rows, n_cols, radius=sakoe_chiba_radius)
    elif global_constraint == "itakura":
        mask = itakura_mask(n_rows, n_cols)
    else:
        # An all-zero mask is what the unconstrained case passes.
        mask = numpy.zeros((n_rows, n_cols))
    return cydtw_path(ts1, ts2, mask=mask)
def dtw(s1, s2, global_constraint=None, sakoe_chiba_radius=1):
    r"""Compute Dynamic Time Warping (DTW) similarity measure between
    (possibly multidimensional) time series and return it.

    DTW is computed as the Euclidean distance between aligned time series,
    i.e., if :math:`P` is the alignment path:

    .. math::
        DTW(X, Y) = \sqrt{\sum_{(i, j) \in P} \|X_{i} - Y_{j}\|_2^2}

    It is not required that both time series share the same size, but they
    must be the same dimension.

    DTW was originally presented in [1]_.

    Parameters
    ----------
    s1
        A time series.
    s2
        Another time series.
    global_constraint : {"itakura", "sakoe_chiba"} or None (default: None)
        Global constraint to restrict admissible paths for DTW. Any other
        value is treated like None (no constraint).
    sakoe_chiba_radius : int (default: 1)
        Radius to be used for Sakoe-Chiba band global constraint. Used only
        if global_constraint is "sakoe_chiba".

    Returns
    -------
    float
        Similarity score

    Examples
    --------
    >>> dtw([1, 2, 3], [1., 2., 2., 3.])
    0.0
    >>> dtw([1, 2, 3], [1., 2., 2., 3., 4.])
    1.0

    See Also
    --------
    dtw_path : Get both the matching path and the similarity score for DTW
    cdist_dtw : Cross similarity matrix between time series datasets

    References
    ----------
    .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for
           spoken word recognition," IEEE Transactions on Acoustics, Speech and
           Signal Processing, vol. 26(1), pp. 43--49, 1978.
    """
    # NOTE: the docstring is a raw string so that LaTeX backslashes such as
    # ``\|`` are not parsed as (invalid) escape sequences.
    s1 = to_time_series(s1)
    s2 = to_time_series(s2)
    sz1 = s1.shape[0]
    sz2 = s2.shape[0]

    if global_constraint == "sakoe_chiba":
        mask = sakoe_chiba_mask(sz1, sz2, radius=sakoe_chiba_radius)
    elif global_constraint == "itakura":
        mask = itakura_mask(sz1, sz2)
    else:
        # An all-zero mask is what the unconstrained case passes.
        mask = numpy.zeros((sz1, sz2))
    return cydtw(s1, s2, mask=mask)
def lr_dtw_path(s1, s2, gamma=0.):
    """Return the LR-DTW (probabilistic) path and similarity of two series.

    The series may be multidimensional and need not have the same length,
    but they must share the same dimensionality.

    Parameters
    ----------
    s1
        First time series.
    s2
        Second time series.
    gamma : float (default: 0.)
        Regularization parameter.

    Returns
    -------
    numpy.ndarray of shape (s1.shape[0], s2.shape[0])
        Matching path represented as a probability map
    float
        Similarity score

    See Also
    --------
    lr_dtw : LR-DTW score
    dtw : Dynamic Time Warping (DTW) score
    dtw_path : Get both the matching path and the similarity score for DTW
    """
    ts1 = to_time_series(s1)
    ts2 = to_time_series(s2)
    score, proba_map = cylr_dtw(ts1, ts2, gamma=gamma)
    # The path is recovered from the probabilities by backtracing.
    return cylr_dtw_backtrace(proba_map), score
def dtw_subsequence_path(subseq, longseq):
    r"""Return the sub-sequence DTW path and similarity of a query against a
    longer (possibly multidimensional) time series.

    DTW is computed as the Euclidean distance between aligned time series,
    i.e., if :math:`\pi` is the alignment path:

    .. math::
        DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} \|X_{i} - Y_{j}\|^2}

    Compared to traditional DTW, here, border constraints on admissible paths
    :math:`\pi` are relaxed such that :math:`\pi_0 = (0, ?)` and
    :math:`\pi_L = (N-1, ?)` where :math:`L` is the length of the considered
    path and :math:`N` is the length of the subsequence time series.

    The two series need not have the same size, but they must be the same
    dimension. The best matching start and end positions of `subseq` inside
    `longseq` are searched for.

    Parameters
    ----------
    subseq : array, shape = (sz1, d)
        A query time series.
    longseq : array, shape = (sz2, d)
        A reference (supposed to be longer than `subseq`) time series.

    Returns
    -------
    list of integer pairs
        Matching path represented as a list of index pairs. In each pair, the
        first index corresponds to `subseq` and the second one corresponds to
        `longseq`.
    float
        Similarity score

    Examples
    --------
    >>> path, dist = dtw_subsequence_path([2., 3.], [1., 2., 2., 3., 4.])
    >>> path
    [(0, 2), (1, 3)]
    >>> dist
    0.0

    See Also
    --------
    dtw : Get the similarity score for DTW
    subsequence_cost_matrix: Calculate the required cost matrix
    subsequence_path: Calculate a matching path manually
    """
    query = to_time_series(subseq)
    reference = to_time_series(longseq)
    acc_cost = subsequence_cost_matrix(subseq=query, longseq=reference)

    # The cheapest entry of the last row marks where the best match ends.
    last_row = acc_cost[-1, :]
    best_end = numpy.argmin(last_row)
    path = subsequence_path(acc_cost, best_end)
    return path, numpy.sqrt(last_row[best_end])
def lb_keogh(ts_query, ts_candidate=None, radius=1, envelope_candidate=None):
    """Compute LB_Keogh.

    LB_Keogh was originally presented in [1]_.

    Parameters
    ----------
    ts_query : array-like
        Query time-series to compare to the envelope of the candidate.
    ts_candidate : array-like or None (default: None)
        Candidate time-series. None means the envelope is provided via
        `envelope_candidate` parameter and hence does not need to be
        computed again.
    radius : int (default: 1)
        Radius to be used for the envelope generation (the envelope at time
        index i will be generated based on all observations from the
        candidate time series at indices comprised between i-radius and
        i+radius). Not used if `ts_candidate` is None.
    envelope_candidate: pair of array-like (envelope_down, envelope_up) or None
    (default: None)
        Pre-computed envelope of the candidate time series. Used only if
        `ts_candidate` is None; otherwise the envelope is computed from
        `ts_candidate`.

    Returns
    -------
    float
        Distance between the query time series and the envelope of the
        candidate time series.

    Raises
    ------
    TypeError
        If both `ts_candidate` and `envelope_candidate` are None.
    AssertionError
        If the query or the candidate is not monodimensional.

    Examples
    --------
    >>> ts1 = [1, 2, 3, 2, 1]
    >>> ts2 = [0, 0, 0, 0, 0]
    >>> env_low, env_up = lb_envelope(ts1, radius=1)
    >>> lb_keogh(ts_query=ts2,
    ...          envelope_candidate=(env_low, env_up))  # doctest: +ELLIPSIS
    2.8284...
    >>> lb_keogh(ts_query=ts2,
    ...          ts_candidate=ts1,
    ...          radius=1)  # doctest: +ELLIPSIS
    2.8284...

    See also
    --------
    lb_envelope : Compute LB_Keogh-related envelope

    References
    ----------
    .. [1] Keogh, E. Exact indexing of dynamic time warping. In International
       Conference on Very Large Data Bases, 2002. pp 406-417.
    """
    mono_msg = "LB_Keogh is available only for monodimensional time series"

    if ts_candidate is None:
        if envelope_candidate is None:
            # Fail with an explicit message instead of the opaque
            # "cannot unpack non-iterable NoneType" raised by the unpacking.
            raise TypeError("lb_keogh requires either `ts_candidate` or "
                            "`envelope_candidate` to be provided")
        envelope_down, envelope_up = envelope_candidate
    else:
        ts_candidate = to_time_series(ts_candidate)
        assert ts_candidate.shape[1] == 1, mono_msg
        envelope_down, envelope_up = lb_envelope(ts_candidate, radius)

    ts_query = to_time_series(ts_query)
    assert ts_query.shape[1] == 1, mono_msg

    # Only the samples lying outside of the envelope contribute.
    query = ts_query[:, 0]
    indices_up = query > envelope_up[:, 0]
    indices_down = query < envelope_down[:, 0]
    return numpy.sqrt(
        numpy.linalg.norm(query[indices_up]
                          - envelope_up[indices_up, 0]) ** 2
        + numpy.linalg.norm(query[indices_down]
                            - envelope_down[indices_down, 0]) ** 2)
def test_dtw_subseq_path():
    """Check `subsequence_path` for two end positions on one cost matrix."""
    query = to_time_series([1, 4])
    reference = to_time_series([1., 2., 2., 3., 4.])
    acc_cost = tslearn.metrics.subsequence_cost_matrix(query, reference)

    # (end index in the long sequence, expected alignment path)
    cases = (
        (3, [(0, 2), (1, 3)]),
        (1, [(0, 0), (1, 1)]),
    )
    for end_idx, expected_path in cases:
        found_path = tslearn.metrics.subsequence_path(acc_cost, end_idx)
        np.testing.assert_equal(found_path, expected_path)
def unnormalized_gak(s1, s2, sigma=1.):
    r"""Return the unnormalized Global Alignment Kernel (GAK) of two
    (possibly multidimensional) time series.

    The two series need not have the same size, but they must be the same
    dimension.

    GAK was originally presented in [1]_.
    This is an unnormalized version.

    Parameters
    ----------
    s1
        A time series
    s2
        Another time series
    sigma : float (default 1.)
        Bandwidth of the internal gaussian kernel used for GAK

    Returns
    -------
    float
        Kernel value

    Examples
    --------
    >>> unnormalized_gak([1, 2, 3],
    ...                  [1., 2., 2., 3.],
    ...                  sigma=2.)  # doctest: +ELLIPSIS
    15.358...
    >>> unnormalized_gak([1, 2, 3],
    ...                  [1., 2., 2., 3., 4.])  # doctest: +ELLIPSIS
    3.166...

    See Also
    --------
    gak : normalized version of GAK that ensures that k(x,x) = 1
    cdist_gak : Compute cross-similarity matrix using Global Alignment kernel

    References
    ----------
    .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011.
    """
    ts1 = to_time_series(s1, remove_nans=True)
    ts2 = to_time_series(s2, remove_nans=True)
    pairwise_kernel = _gak_gram(ts1, ts2, sigma=sigma)
    return njit_gak(ts1, ts2, pairwise_kernel)
def transform(
        self,
        series: np.ndarray,
        sample_period: int = 6
) -> Union[np.ndarray, Iterable, int, float]:
    """Approximate `series` through the wrapped DFT transformer.

    The series is reshaped to a single row, passed through
    ``self.transformer.fit_transform`` and rebuilt with an inverse real FFT
    of the original length.

    Note that this sets ``self.transformer.drop_sum`` to True as a side
    effect.

    Parameters
    ----------
    series : np.ndarray
        Series to transform.
    sample_period : int (default: 6)
        Not used by this method.

    Returns
    -------
    np.ndarray
        Flattened reconstruction of the series.

    Raises
    ------
    Exception
        If the wrapped transformer is not a
        ``approximation.DiscreteFourierTransform``.
    """
    if not isinstance(self.transformer,
                      approximation.DiscreteFourierTransform):
        raise Exception('Pyts doesn\'t support trasform')

    n_coefs = self.transformer.n_coefs
    row = np.reshape(tsutils.to_time_series(series), (1, -1))
    n_samples, n_timestamps = row.shape
    self.transformer.drop_sum = True
    X_dft = self.transformer.fit_transform(row)

    # Compute the inverse transformation
    real_idx = np.arange(1, n_coefs, 2)
    if n_coefs % 2 == 0:
        # One imaginary column is missing for an even number of
        # coefficients: pad it with zeros.
        imag_idx = np.arange(2, n_coefs, 2)
        imag_part = np.c_[X_dft[:, imag_idx], np.zeros((n_samples, ))]
    else:
        imag_idx = np.arange(2, n_coefs + 1, 2)
        imag_part = X_dft[:, imag_idx]
    X_dft_new = np.c_[X_dft[:, :1], X_dft[:, real_idx] + 1j * imag_part]

    X_irfft = np.fft.irfft(X_dft_new, n_timestamps)
    debug('PytsTransformerWrapper ts_representation shape {}'.format(
        np.shape(X_irfft)))
    return np.ravel(X_irfft)
def test_shapelets():
    """Fit a small ShapeletModel and check shapelets and predictions.

    Skipped when keras is not installed.
    """
    pytest.importorskip('keras')
    from tslearn.shapelets import ShapeletModel

    n_ts, length, dim = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n_ts, length, dim)
    y = rng.randint(2, size=n_ts)

    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    clf.fit(time_series, y)

    expected_first_shapelet = np.array([[0.56373, 0.494684],
                                        [1.235707, 1.119235]])
    np.testing.assert_allclose(clf.shapelets_[0],
                               expected_first_shapelet,
                               atol=1e-2)

    expected_labels = np.array(
        [0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0])
    np.testing.assert_allclose(clf.predict(time_series), expected_labels)

    cross_validate(clf, time_series, y, cv=2)

    # Both views of the shapelets must hold the same values.
    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    shapelet_pairs = zip(model.shapelets_, model.shapelets_as_time_series_)
    for raw_shp, padded_shp in shapelet_pairs:
        np.testing.assert_allclose(
            raw_shp, to_time_series(padded_shp, remove_nans=True))
def dataToSeries(dataset):
    """Apply `mapper` to every entry of `dataset` and format the result.

    Entries are read by position (``dataset[0]`` .. ``dataset[len - 1]``),
    and the mapped values are passed together to `to_time_series`.
    """
    # Positional indexing is kept on purpose: it is not equivalent to plain
    # iteration for every container type.
    mapped_rows = [mapper(dataset[pos]) for pos in range(len(dataset))]
    return to_time_series(mapped_rows)
def shapelets_(self):
    """Collect the weights of every ``shapelets_<i>`` layer as time series.

    Returns
    -------
    numpy.ndarray of objects, shape (sum(n_shapelets_per_size.values()), )
        One entry per shapelet, each formatted by `to_time_series`.
    """
    total_n_shp = sum(self.n_shapelets_per_size.values())
    collected = numpy.empty((total_n_shp, ), dtype=object)

    write_pos = 0
    for size_idx in range(self._n_shapelet_sizes):
        layer = self.model.get_layer("shapelets_%d" % size_idx)
        layer_shapelets = layer.get_weights()[0]
        for shapelet in layer_shapelets:
            collected[write_pos] = to_time_series(shapelet)
            write_pos += 1

    # Every slot of the output must have been filled exactly once.
    assert write_pos == total_n_shp
    return collected
def __init__(self, X, Y):
    """Store both series as float64 time series.

    Parameters
    ----------
    X: array, shape = [m, d]
        First time series.
    Y: array, shape = [n, d]
        Second time series.

    Examples
    --------
    >>> SquaredEuclidean([1, 2, 2, 3], [1, 2, 3, 4]).compute()
    array([[ 0.,  1.,  4.,  9.],
           [ 1.,  0.,  1.,  4.],
           [ 1.,  0.,  1.,  4.],
           [ 4.,  1.,  0.,  1.]])
    """
    # Same treatment for both inputs; X is handled first, then Y.
    for attr_name, raw_series in (("X", X), ("Y", Y)):
        formatted = to_time_series(raw_series).astype(numpy.float64)
        setattr(self, attr_name, formatted)
def dtw_subsequence_path(subseq, longseq):
    r"""Compute sub-sequence Dynamic Time Warping (DTW) similarity measure
    between a (possibly multidimensional) query and a long time series and
    return both the path and the similarity.

    DTW is computed as the Euclidean distance between aligned time series,
    i.e., if :math:`P` is the alignment path:

    .. math::
        DTW(X, Y) = \sqrt{\sum_{(i, j) \in P} (X_{i} - Y_{j})^2}

    It is not required that both time series share the same size, but they
    must be the same dimension. This implementation finds the best matching
    starting and ending positions for `subseq` inside `longseq`.

    Parameters
    ----------
    subseq
        A query time series.
    longseq
        A reference (supposed to be longer than `subseq`) time series.

    Returns
    -------
    list of integer pairs
        Matching path represented as a list of index pairs. In each pair, the
        first index corresponds to `subseq` and the second one corresponds to
        `longseq`
    float
        Similarity score

    Examples
    --------
    >>> path, dist = dtw_subsequence_path([2, 3], [1., 2., 2., 3., 4.])
    >>> path
    [(0, 2), (1, 3)]
    >>> dist
    0.0

    See Also
    --------
    dtw : Get the similarity score for DTW
    """
    # NOTE: the docstring is a raw string so that the LaTeX backslashes are
    # kept verbatim instead of being parsed as escape sequences.
    subseq = to_time_series(subseq)
    longseq = to_time_series(longseq)
    return cydtw_subsequence_path(subseq=subseq, longseq=longseq)
def gak(s1, s2, sigma=1.):
    r"""Compute Global Alignment Kernel (GAK) between (possibly
    multidimensional) time series and return it.

    It is not required that both time series share the same size, but they
    must be the same dimension. GAK was originally presented in [1]_.
    This is a normalized version that ensures that :math:`k(x,x)=1` for all
    :math:`x` and :math:`k(x,y) \in [0, 1]` for all :math:`x, y`.

    Parameters
    ----------
    s1
        A time series
    s2
        Another time series
    sigma : float (default 1.)
        Bandwidth of the internal gaussian kernel used for GAK

    Returns
    -------
    float
        Kernel value

    Examples
    --------
    >>> gak([1, 2, 3], [1., 2., 2., 3.], sigma=2.)  # doctest: +ELLIPSIS
    0.839...
    >>> gak([1, 2, 3], [1., 2., 2., 3., 4.])  # doctest: +ELLIPSIS
    0.273...

    See Also
    --------
    cdist_gak : Compute cross-similarity matrix using Global Alignment kernel

    References
    ----------
    .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011.
    """
    # NOTE: the docstring is a raw string so that ``\in`` is not parsed as an
    # (invalid) escape sequence.
    s1 = to_time_series(s1)
    s2 = to_time_series(s2)
    return cynormalized_gak(s1, s2, sigma)
def lb_envelope(ts, radius=1):
    r"""Return the lower and upper envelopes of a series for LB_Keogh.

    LB_Keogh was originally presented in [1]_.

    Parameters
    ----------
    ts : array-like
        Time-series for which the envelope should be computed.
    radius : int (default: 1)
        Radius to be used for the envelope generation (the envelope at time
        index i will be generated based on all observations from the time
        series at indices comprised between i-radius and i+radius).

    Returns
    -------
    array-like
        Lower-side of the envelope.
    array-like
        Upper-side of the envelope.

    Examples
    --------
    >>> ts1 = [1, 2, 3, 2, 1]
    >>> env_low, env_up = lb_envelope(ts1, radius=1)
    >>> env_low
    array([[1.],
           [1.],
           [2.],
           [1.],
           [1.]])
    >>> env_up
    array([[2.],
           [3.],
           [3.],
           [3.],
           [2.]])

    See also
    --------
    lb_keogh : Compute LB_Keogh similarity

    References
    ----------
    .. [1] Keogh, E. Exact indexing of dynamic time warping. In International
       Conference on Very Large Data Bases, 2002. pp 406-417.
    """
    formatted_ts = to_time_series(ts)
    return njit_lb_envelope(formatted_ts, radius=radius)
def lb_envelope(ts, radius=1):
    """Return the lower and upper envelopes of a series for LB_Keogh.

    LB_Keogh was originally presented in [1]_ and the multivariate extension
    in [2]_.

    Parameters
    ----------
    ts : array-like
        Time-series for which the envelope should be computed.
    radius : int (default: 1)
        Radius to be used for the envelope generation (the envelope at time
        index i will be generated based on all observations from the time
        series at indices comprised between i-radius and i+radius).

    Returns
    -------
    array-like
        Lower-side of the envelope.
    array-like
        Upper-side of the envelope.

    Examples
    --------
    >>> ts1 = [1, 2, 3, 2, 1]
    >>> env_low, env_up = lb_envelope(ts1, radius=1)
    >>> env_low
    array([[ 1.],
           [ 1.],
           [ 2.],
           [ 1.],
           [ 1.]])
    >>> env_up
    array([[ 2.],
           [ 3.],
           [ 3.],
           [ 3.],
           [ 2.]])

    See also
    --------
    lb_keogh : Compute LB_Keogh similarity

    References
    ----------
    .. [1] Keogh, E. Exact indexing of dynamic time warping. In International
       Conference on Very Large Data Bases, 2002. pp 406-417.
    .. [2] Rath, T. M., & Manmatha, R. Lower-bounding of dynamic time warping
       distances for multivariate time series. University of Massachusetts
       Amherst Technical Report MM, 40, 2002.
    """
    formatted_ts = to_time_series(ts)
    return cylb_envelope(formatted_ts, radius=radius)
def test_shapelets():
    """Exercise ShapeletModel: cross-validation, shapelet views, weights.

    Skipped when tensorflow is not installed.
    """
    pytest.importorskip('tensorflow')
    from tslearn.shapelets import ShapeletModel
    import tensorflow as tf

    def make_clf(**extra):
        # Settings shared by all the small classifiers built in this test.
        return ShapeletModel(n_shapelets_per_size={2: 5},
                             max_iter=1,
                             verbose=0,
                             random_state=0,
                             **extra)

    n_ts, length, dim = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n_ts, length, dim)
    y = rng.randint(2, size=n_ts)

    # The optimizer may be given by name or as an optimizer instance.
    clf = make_clf(optimizer="sgd")
    cross_validate(clf, time_series, y, cv=2)

    clf = make_clf(optimizer=tf.optimizers.Adam(.1))
    cross_validate(clf, time_series, y, cv=2)

    # Both views of the shapelets must hold the same values.
    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    shapelet_pairs = zip(model.shapelets_, model.shapelets_as_time_series_)
    for raw_shp, padded_shp in shapelet_pairs:
        np.testing.assert_allclose(
            raw_shp, to_time_series(padded_shp, remove_nans=True))

    # Test set_weights / get_weights
    clf = make_clf()
    clf.fit(time_series, y)
    preds_before = clf.predict_proba(time_series)
    saved_weights = clf.get_weights()

    # Change number of iterations, then refit, then set weights
    clf.max_iter *= 2
    clf.fit(time_series, y)
    clf.set_weights(saved_weights)
    preds_after = clf.predict_proba(time_series)
    np.testing.assert_allclose(preds_before, preds_after)
def _func(self, Z):
    """Compute the weighted objective value and its gradient at `Z`.

    Parameters
    ----------
    Z : numpy.ndarray
        Candidate barycenter; reshaped to ``self.barycenter_.shape``.

    Returns
    -------
    obj
        Sum over the fitted series of ``weights[i] * SoftDTW value``.
    numpy.ndarray
        Flattened gradient, accumulated with the same weights.
    """
    candidate = Z.reshape(self.barycenter_.shape)
    gradient = numpy.zeros_like(candidate)
    objective = 0
    for idx, series in enumerate(self._X_fit):
        weight = self.weights[idx]
        dist = SquaredEuclidean(
            candidate, to_time_series(series, remove_nans=True))
        sdtw = SoftDTW(dist, gamma=self.gamma)
        # compute() is called before grad(), in that order.
        value = sdtw.compute()
        alignment = sdtw.grad()
        gradient += weight * dist.jacobian_product(alignment)
        objective += weight * value
    return objective, gradient.ravel()
def fit(self, X):
    """Store a resampled version of X as the reference series.

    X is cut at its first non-finite index and resampled to
    ``self.n_samples`` points using the ``self.interp_kind`` interpolation.

    Parameters
    ----------
    X : numpy.ndarray
        A time series.

    Returns
    -------
    DTWSampler
        self
    """
    series = to_time_series(X)
    valid_end = first_non_finite_index(series)
    finite_part = series[:valid_end]
    self.reference_series_ = _resampled(finite_part,
                                        n_samples=self.n_samples,
                                        kind=self.interp_kind)
    return self
def to_timeseries_set(df, var, n_samples=10):
    """Build a time series dataset from a random sample of study ids.

    For each sampled id, the matching rows are selected with `select_data`;
    when more than one row is available, a curve is obtained from
    `get_gaussian` and its second output is kept as a time series.

    Progress information (number of ids, dataset shape, number of sampled
    ids) is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding at least a ``'studyID'`` column.
    var
        Key into the module-level `variables` mapping; also forwarded to
        `select_data` and `get_gaussian`.
    n_samples : int (default: 10)
        Number of ids to draw. Capped to the number of distinct ids, so that
        small inputs no longer make `random.sample` raise a ValueError.

    Returns
    -------
    dat
        Dataset built by `to_time_series_dataset` from the kept series.
    int
        Always 0 in the current implementation (see the note in the body).
    """
    ids = list(df['studyID'].unique())
    # random.sample raises ValueError when asked for more items than exist.
    selected_ids = random.sample(ids, min(n_samples, len(ids)))
    print(len(ids))

    ts = []
    # NOTE(review): `empty` is returned but never incremented; presumably it
    # was meant to count the ids skipped below - confirm before relying on it.
    empty = 0
    for Id in selected_ids:
        sub_df, stop = select_data(df, Id, var)
        if len(sub_df.index) <= 1:
            # Not enough observations for this id.
            continue
        # Filter the rows once and read both columns from the result.
        rows = sub_df.loc[sub_df['value_name'] == variables[var]['Name']]
        _, y_pred, _ = get_gaussian(rows['Reltime'], rows['value'],
                                    stop, var)
        ts.append(to_time_series(y_pred))

    dat = to_time_series_dataset(ts)
    print(dat.shape)
    print("nr ids: ", len(selected_ids))
    return dat, empty
def dtw_path_limited_warping_length(s1, s2, max_length):
    r"""Return the DTW path and similarity cost of two (possibly
    multidimensional) time series under an upper bound on the path length.

    DTW is computed as the Euclidean distance between aligned time series,
    i.e., if :math:`\pi` is the optimal alignment path:

    .. math::

        DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} \|X_{i} - Y_{j}\|^2}

    Note that this formula is still valid for the multivariate case.

    The two series need not have the same size, but they must be the same
    dimension. DTW was originally presented in [1]_.
    This constrained-length variant was introduced in [2]_.
    Both variants are discussed in more details in our
    :ref:`dedicated user-guide page <dtw>`

    Parameters
    ----------
    s1
        A time series.
    s2
        Another time series.
    max_length : int
        Maximum allowed warping path length.
        If greater than len(s1) + len(s2), then it is equivalent to
        unconstrained DTW.
        If lower than max(len(s1), len(s2)), no path can be found and a
        ValueError is raised.

    Returns
    -------
    list of integer pairs
        Optimal path
    float
        Similarity score

    Examples
    --------
    >>> path, cost = dtw_path_limited_warping_length([1, 2, 3],
    ...                                              [1., 2., 2., 3.], 5)
    >>> cost
    0.0
    >>> path
    [(0, 0), (1, 1), (1, 2), (2, 3)]
    >>> path, cost = dtw_path_limited_warping_length([1, 2, 3],
    ...                                              [1., 2., 2., 3., 4.], 5)
    >>> cost
    1.0
    >>> path
    [(0, 0), (1, 1), (1, 2), (2, 3), (2, 4)]

    See Also
    --------
    dtw_limited_warping_length : Get the similarity score for DTW with limited
        warping path length
    dtw_path : Get both the matching path and the similarity score for DTW

    References
    ----------
    .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for
           spoken word recognition," IEEE Transactions on Acoustics, Speech and
           Signal Processing, vol. 26(1), pp. 43--49, 1978.
    .. [2] Z. Zhang, R. Tavenard, A. Bailly, X. Tang, P. Tang, T. Corpetti
           Dynamic time warping under limited warping path length.
           Information Sciences, vol. 393, pp. 91--107, 2017.
    """
    ts1 = to_time_series(s1, remove_nans=True)
    ts2 = to_time_series(s2, remove_nans=True)
    len1, len2 = ts1.shape[0], ts2.shape[0]

    if max_length < max(len1, len2):
        raise ValueError("Cannot find a path of length {} to align given "
                         "time series.".format(max_length))

    costs = _limited_warping_length_cost(ts1, ts2, max_length)
    last_cell = (len1 - 1, len2 - 1)

    # Among all path lengths reaching the last cell, keep the first one with
    # the strictly lowest accumulated cost.
    best_length, best_cost = -1, numpy.inf
    for length, cost in costs[last_cell].items():
        if cost < best_cost:
            best_length, best_cost = length, cost

    path = _return_path_limited_warping_length(costs, last_cell, best_length)
    return path, numpy.sqrt(best_cost)
# Scale the flow observations with the scaler fitted on the training flow.
normalized_flow = scaler_flow_train.transform(flow)
normalized_flow
# Convert the scaled array to a plain list.
normalized_flow = normalized_flow.tolist()
len(normalized_flow)
from toolz.itertoolz import sliding_window, partition
# For every day of the train set, store the flow observations: the list is
# cut into consecutive groups of 240 observations.
# NOTE(review): toolz `partition` presumably drops a trailing incomplete
# group - confirm that losing those observations is intended.
day_flow = list(partition(240, normalized_flow))
day_flow
len(day_flow)
# Convert the list of daily groups to a multidimensional array.
day_flow = np.asarray(day_flow)
day_flow
from tslearn.utils import to_time_series, to_time_series_dataset
# Create the series for the normalized flow observations.
# NOTE(review): `day_flow` holds one group per day, so
# `to_time_series_dataset` may be what is intended here - confirm.
first_time_series = to_time_series(day_flow)
print(first_time_series.shape)
# Treatment of the density variable.
density = df.loc[:, 'Density']
# Normalization/standardization of train data: turn the column into a
# single-column 2-D array first.
density = np.array(density)
density = density.reshape((len(density), 1))
# Fit the scaler on the train data.
# NOTE(review): this refits the shared `scaler` object; if
# `scaler_flow_train` was obtained from the same object it may now be
# refitted too - verify.
scaler_density_train = scaler.fit(density)
print('Min: %f, Max: %f' % (scaler_density_train.data_min_, scaler_density_train.data_max_))
# Scale the train data.
normalized_density = scaler_density_train.transform(density)
normalized_density
# From array to list (the conversion itself follows this excerpt).
documents[["year", "month"]]) # names of all years and months without duplicates months_grouped = documents[["year", "month"]].drop_duplicates() # making an DataFrame to store words in column names and dates in indexes. tfidf_monthly_dataframe = pd.DataFrame( tfidf_avg_monthly.toarray(), columns=vocabulary["word"], index=pd.to_datetime({ "year": months_grouped.year, "month": months_grouped.month, "day": 1 }), ) # time series - each row(word) is one time serie # each time series is an array of 72 months. time_series = to_time_series(tfidf_monthly_dataframe.values.transpose()) N_clusters = 7 model = TimeSeriesKMeans(N_clusters) vocabulary["cluster"] = model.fit_predict(time_series) # mapping cluster numbers to colors colors = pd.DataFrame(pl.cm.jet(np.linspace(0, 1, N_clusters))) vocabulary.sort_values(["cluster", "relevance"], inplace=True, ascending=False) # getting the most relevant words for each topic topics = (vocabulary[["cluster", "word"]].groupby("cluster").agg({ "word": lambda words: ", ".join(words[:15]), })).reset_index().rename({'Index': 'cluster'}) clusters_centers = pd.DataFrame( model.cluster_centers_.reshape((N_clusters, -1)).transpose(),
# Minimal example: format a plain Python list with tslearn's `to_time_series`
# and print the shape of the result.
from tslearn.utils import to_time_series
my_first_time_series = [1, 3, 4, 2]
formatted_time_series = to_time_series(my_first_time_series)
print(formatted_time_series.shape)
def sklearn_metric(x, y):
    """Apply `metric` to two flattened series after restoring their shape.

    Each input is reshaped to ``(sz, d)`` and formatted with NaNs removed
    before being handed to `metric`.
    """
    first = to_time_series(x.reshape((sz, d)), remove_nans=True)
    second = to_time_series(y.reshape((sz, d)), remove_nans=True)
    return metric(first, second)
# Scale the flow observations with the scaler fitted on the training flow.
normalized_flow = scaler_flow_train.transform(flow)
normalized_flow
# Convert the scaled array to a plain list.
normalized_flow = normalized_flow.tolist()
len(normalized_flow)
from toolz.itertoolz import sliding_window, partition
# For every day of the train set, store the flow observations: the list is
# cut into consecutive groups of 48 observations.
# NOTE(review): toolz `partition` presumably drops a trailing incomplete
# group - confirm that losing those observations is intended.
day_flow = list(partition(48, normalized_flow))
day_flow
len(day_flow)
# Convert the list of daily groups to a multidimensional array.
day_flow = np.asarray(day_flow)
day_flow
from tslearn.utils import to_time_series, to_time_series_dataset
# Create the series for the normalized flow observations.
# NOTE(review): `day_flow` holds one group per day, so
# `to_time_series_dataset` may be what is intended here - confirm.
first_time_series = to_time_series(day_flow)
print(first_time_series.shape)
# Treatment of the speed variable.
speed = df.loc[:, 'Speed']
# Normalization/standardization of train data: turn the column into a
# single-column 2-D array first.
speed = np.array(speed)
speed = speed.reshape((len(speed), 1))
# Fit the scaler on the train data.
# NOTE(review): this refits the shared `scaler` object; if
# `scaler_flow_train` was obtained from the same object it may now be
# refitted too - verify.
scaler_speed_train = scaler.fit(speed)
print('Min: %f, Max: %f' % (scaler_speed_train.data_min_, scaler_speed_train.data_max_))
# Scale the train data.
normalized_speed = scaler_speed_train.transform(speed)
normalized_speed
# From array to list (the conversion itself follows this excerpt).
def softdtw_barycenter(X, gamma=1.0, weights=None, method="L-BFGS-B", tol=1e-3,
                       max_iter=50, init=None):
    """Compute barycenter (time series averaging) under the soft-DTW [1]
    geometry.

    Soft-DTW was originally presented in [1]_.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.
    gamma: float
        Regularization parameter.
        Lower is less smoothed (closer to true DTW).
    weights: None or array
        Weights of each X[i]. Must be the same size as len(X).
        If None, uniform weights are used.
    method: string
        Optimization method, passed to `scipy.optimize.minimize`.
        Default: L-BFGS.
    tol: float
        Tolerance of the method used.
    max_iter: int
        Maximum number of iterations. If not positive, the initial
        barycenter is returned without any optimization.
    init: array or None (default: None)
        Initial barycenter to start from for the optimization process.
        If `None`, euclidean barycenter is used as a starting point.

    Returns
    -------
    numpy.array of shape (bsz, d) where `bsz` is the size of the `init` array \
            if provided or `sz` otherwise
        Soft-DTW barycenter of the provided time series dataset.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5)
    array([[1.25161574],
           [2.03821705],
           [3.5101956 ],
           [4.36140605]])
    >>> time_series = [[1, 2, 3, 4], [1, 2, 3, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5)
    array([[1.21349933],
           [1.8932251 ],
           [2.67573269],
           [3.51057026],
           [4.33645802]])

    References
    ----------
    .. [1] M. Cuturi, M. Blondel "Soft-DTW: a Differentiable Loss Function for
       Time-Series," ICML 2017.
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    if init is None:
        if check_equal_size(X_):
            barycenter = euclidean_barycenter(X_, weights)
        else:
            # Series of different sizes are resampled to a common size
            # before being averaged.
            resampled_X = TimeSeriesResampler(sz=X_.shape[1]).fit_transform(X_)
            barycenter = euclidean_barycenter(resampled_X, weights)
    else:
        barycenter = init

    if max_iter > 0:
        series = [to_time_series(ts, remove_nans=True) for ts in X_]
        if len({ts.shape for ts in series}) <= 1:
            X_ = numpy.array(series)
        else:
            # Series of different lengths cannot be stacked: recent NumPy
            # versions refuse to build ragged arrays implicitly, so the
            # object array is filled explicitly.
            X_ = numpy.empty(len(series), dtype=object)
            for idx, ts in enumerate(series):
                X_[idx] = ts

        def f(Z):
            return _softdtw_func(Z, X_, weights, barycenter, gamma)

        # The function works with vectors so we need to vectorize barycenter.
        res = minimize(f, barycenter.ravel(), method=method, jac=True, tol=tol,
                       options=dict(maxiter=max_iter, disp=False))
        return res.x.reshape(barycenter.shape)
    else:
        return barycenter
def softdtw_barycenter(X, gamma=1.0, weights=None, method="L-BFGS-B", tol=1e-3,
                       max_iter=50, init=None):
    """Compute barycenter (time series averaging) under the soft-DTW geometry.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.
    gamma: float
        Regularization parameter.
        Lower is less smoothed (closer to true DTW).
    weights: None or array
        Weights of each X[i]. Must be the same size as len(X).
    method: string
        Optimization method, passed to `scipy.optimize.minimize`.
        Default: L-BFGS.
    tol: float
        Tolerance of the method used.
    max_iter: int
        Maximum number of iterations. If not positive, the initial
        barycenter is returned without any optimization.
    init: array or None (default: None)
        Initial barycenter to start from for the optimization process.
        If `None`, euclidean barycenter is used as a starting point.

    Returns
    -------
    numpy.array
        Barycenter, with the same shape as the initial barycenter.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> euc_bar = euclidean_barycenter(time_series)
    >>> stdw_bar = softdtw_barycenter(time_series, max_iter=0)
    >>> stdw_bar.shape
    (4, 1)
    >>> bool(numpy.all(numpy.abs(euc_bar - stdw_bar) < 1e-9))  # No iteration
    True
    >>> softdtw_barycenter(time_series, max_iter=5).shape
    (4, 1)
    >>> time_series = [[1, 2, 3, 4], [1, 2, 3, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5).shape
    (5, 1)
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    if init is None:
        if check_equal_size(X_):
            barycenter = euclidean_barycenter(X_, weights)
        else:
            # Series of different sizes are resampled to a common size
            # before being averaged.
            resampled_X = TimeSeriesResampler(sz=X_.shape[1]).fit_transform(X_)
            barycenter = euclidean_barycenter(resampled_X, weights)
    else:
        barycenter = init

    if max_iter > 0:
        series = [to_time_series(ts, remove_nans=True) for ts in X_]
        if len({ts.shape for ts in series}) <= 1:
            X_ = numpy.array(series)
        else:
            # Series of different lengths cannot be stacked: recent NumPy
            # versions refuse to build ragged arrays implicitly, so the
            # object array is filled explicitly.
            X_ = numpy.empty(len(series), dtype=object)
            for idx, ts in enumerate(series):
                X_[idx] = ts

        def f(Z):
            return _softdtw_func(Z, X_, weights, barycenter, gamma)

        # The function works with vectors so we need to vectorize barycenter.
        res = minimize(f, barycenter.ravel(), method=method, jac=True, tol=tol,
                       options=dict(maxiter=max_iter, disp=False))
        return res.x.reshape(barycenter.shape)
    else:
        return barycenter
def silhouette_score(X, labels, metric=None, sample_size=None,
                     metric_params=None, random_state=None, **kwds):
    """Compute the mean Silhouette Coefficient of all samples (cf. [1]_ and
    [2]_).

    Read more in the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient>`_.

    Parameters
    ----------
    X : array [n_ts, n_ts] if metric == "precomputed", or, \
             [n_ts, sz, d] otherwise
        Array of pairwise distances between time series, or a time series
        dataset.
    labels : array, shape = [n_ts]
        Predicted labels for each time series.
    metric : string, or callable
        The metric to use when calculating distance between time series.
        Should be one of {'dtw', 'softdtw', 'euclidean'} or a callable
        distance function. If None, `dtw` is used as a callable.
        If X is the distance array itself, use ``metric="precomputed"``.
    sample_size : int or None
        The size of the sample to use when computing the Silhouette
        Coefficient on a random subset of the data.
        If ``sample_size is None``, no sampling is used.
    metric_params : dict or None
        Parameter values for the chosen metric. Value associated to the
        `"gamma_sdtw"` key corresponds to the gamma parameter in Soft-DTW.
    random_state : int, RandomState instance or None, optional (default=None)
        The generator used to randomly select a subset of samples. If int,
        random_state is the seed used by the random number generator; If
        RandomState instance, random_state is the random number generator; If
        None, the random number generator is the RandomState instance used by
        `np.random`. Used when ``sample_size is not None``.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
           <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from tslearn.generators import random_walks
    >>> from tslearn.metrics import cdist_dtw
    >>> X = random_walks(n_ts=50, sz=32, d=1)
    >>> labels = numpy.random.randint(2, size=50)
    >>> s_sc = silhouette_score(X, labels, metric="dtw")
    >>> s_sc2 = silhouette_score(X, labels, metric="softdtw")
    >>> s_sc2b = silhouette_score(X, labels, metric="softdtw",
    ...                           metric_params={"gamma_sdtw": 2.})
    >>> s_sc3 = silhouette_score(cdist_dtw(X), labels, metric="precomputed")
    """
    sklearn_metric = None
    if metric_params is None:
        metric_params = {}

    if metric == "precomputed":
        sklearn_X = X
    elif metric == "dtw":
        sklearn_X = cdist_dtw(X)
    elif metric == "softdtw":
        gamma = metric_params.get("gamma_sdtw", None)
        if gamma is not None:
            sklearn_X = cdist_soft_dtw(X, gamma=gamma)
        else:
            sklearn_X = cdist_soft_dtw(X)
    elif metric == "euclidean":
        # `cdist` only accepts 2-D inputs: flatten each series to one row
        # (the result is unchanged when X is already 2-D).
        X_ = to_time_series_dataset(X)
        X_flat = X_.reshape((X_.shape[0], -1))
        sklearn_X = cdist(X_flat, X_flat, metric="euclidean")
    else:
        # Callable metric: scikit-learn hands over flattened rows, which are
        # turned back into (sz, d) series before calling the metric.
        X_ = to_time_series_dataset(X)
        n, sz, d = X_.shape
        sklearn_X = X_.reshape((n, -1))
        if metric is None:
            metric = dtw

        def sklearn_metric(x, y):
            return metric(to_time_series(x.reshape((sz, d)),
                                         remove_nans=True),
                          to_time_series(y.reshape((sz, d)),
                                         remove_nans=True))

    return sklearn_silhouette_score(
        X=sklearn_X,
        labels=labels,
        metric="precomputed" if sklearn_metric is None else sklearn_metric,
        sample_size=sample_size,
        random_state=random_state,
        **kwds)