Example #1
def lr_dtw(s1, s2, gamma=0.):
    """Compute Locally Regularized DTW (LR-DTW) similarity measure between (possibly multidimensional) time series and
    return it.

    It is not required that both time series share the same size, but they must be the same dimension.

    Parameters
    ----------
    s1
        A time series
    s2
        Another time series
    gamma : float (default: 0.)
        Regularization parameter

    Returns
    -------
    float
        Similarity score

    See Also
    --------
    lr_dtw_path : Get both the matching path and the similarity score for LR-DTW
    cdist_lr_dtw : Cross similarity matrix between time series datasets
    dtw : Dynamic Time Warping score
    dtw_path : Get both the matching path and the similarity score for DTW
    """
    s1 = to_time_series(s1)
    s2 = to_time_series(s2)
    return cylr_dtw(s1, s2, gamma=gamma)[0]
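The docstring above carries no usage example, so here is a minimal sketch; the import path is an assumption (early tslearn releases exposed lr_dtw from tslearn.metrics):

# Minimal usage sketch; the import location is an assumption.
from tslearn.metrics import lr_dtw

# gamma is the regularization strength; the default 0. means no regularization.
score = lr_dtw([1, 2, 3], [1., 2., 2., 3.], gamma=1.)
print(score)  # a single float similarity score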
Example #2
def dtw_path(s1, s2, global_constraint=None, sakoe_chiba_radius=1):
    """Compute Dynamic Time Warping (DTW) similarity measure between (possibly multidimensional) time series and
    return both the path and the similarity.

    It is not required that both time series share the same size, but they must be the same dimension.
    DTW was originally presented in [1]_.

    Parameters
    ----------
    s1
        A time series.
    s2
        Another time series.
    global_constraint : {"itakura", "sakoe_chiba"} or None (default: None)
        Global constraint to restrict admissible paths for DTW.
    sakoe_chiba_radius : int (default: 1)
        Radius to be used for Sakoe-Chiba band global constraint. Used only if global_constraint is "sakoe_chiba".

    Returns
    -------
    list of integer pairs
        Matching path represented as a list of index pairs. In each pair, the first index corresponds to s1 and the
        second one corresponds to s2
    float
        Similarity score

    Examples
    --------
    >>> path, dist = dtw_path([1, 2, 3], [1., 2., 2., 3.])
    >>> path
    [(0, 0), (1, 1), (1, 2), (2, 3)]
    >>> dist
    0.0
    >>> dtw_path([1, 2, 3], [1., 2., 2., 3., 4.])[1]
    1.0

    See Also
    --------
    dtw : Get only the similarity score for DTW
    cdist_dtw : Cross similarity matrix between time series datasets

    References
    ----------
    .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for spoken word recognition,"
       IEEE Transactions on Acoustics, Speech and Signal Processing, vol. 26(1), pp. 43--49, 1978.
    """
    s1 = to_time_series(s1)
    s2 = to_time_series(s2)
    sz1 = s1.shape[0]
    sz2 = s2.shape[0]
    if global_constraint == "sakoe_chiba":
        return cydtw_path(s1,
                          s2,
                          mask=sakoe_chiba_mask(sz1,
                                                sz2,
                                                radius=sakoe_chiba_radius))
    elif global_constraint == "itakura":
        return cydtw_path(s1, s2, mask=itakura_mask(sz1, sz2))
    return cydtw_path(s1, s2, mask=numpy.zeros((sz1, sz2)))
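A short sketch of the two global constraints accepted by the signature above:

from tslearn.metrics import dtw_path

# Sakoe-Chiba band: admissible cells stay within `radius` of the diagonal.
path, dist = dtw_path([1, 2, 3, 4], [1., 2., 2., 3., 4.],
                      global_constraint="sakoe_chiba", sakoe_chiba_radius=1)

# Itakura parallelogram: fixed mask shape, no radius parameter.
path, dist = dtw_path([1, 2, 3, 4], [1., 2., 2., 3., 4.],
                      global_constraint="itakura")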
Example #3
def dtw(s1, s2, global_constraint=None, sakoe_chiba_radius=1):
    """Compute Dynamic Time Warping (DTW) similarity measure between (possibly
    multidimensional) time series and return it.

    DTW is computed as the Euclidean distance between aligned time series, i.e.,
    if :math:`P` is the alignment path:

    .. math::
        DTW(X, Y) = \\sqrt{\\sum_{(i, j) \\in P} \\|X_{i} - Y_{j}\\|_2^2}

    It is not required that both time series share the same size, but they must
    be the same dimension.
    DTW was originally presented in [1]_.

    Parameters
    ----------
    s1
        A time series.
    s2
        Another time series.
    global_constraint : {"itakura", "sakoe_chiba"} or None (default: None)
        Global constraint to restrict admissible paths for DTW.
    sakoe_chiba_radius : int (default: 1)
        Radius to be used for Sakoe-Chiba band global constraint. Used only if
        global_constraint is "sakoe_chiba".

    Returns
    -------
    float
        Similarity score

    Examples
    --------
    >>> dtw([1, 2, 3], [1., 2., 2., 3.])
    0.0
    >>> dtw([1, 2, 3], [1., 2., 2., 3., 4.])
    1.0

    See Also
    --------
    dtw_path : Get both the matching path and the similarity score for DTW
    cdist_dtw : Cross similarity matrix between time series datasets

    References
    ----------
    .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for spoken word recognition,"
       IEEE Transactions on Acoustics, Speech and Signal Processing, vol. 26(1), pp. 43--49, 1978.
    """
    s1 = to_time_series(s1)
    s2 = to_time_series(s2)
    sz1 = s1.shape[0]
    sz2 = s2.shape[0]
    if global_constraint == "sakoe_chiba":
        return cydtw(s1,
                     s2,
                     mask=sakoe_chiba_mask(sz1, sz2,
                                           radius=sakoe_chiba_radius))
    elif global_constraint == "itakura":
        return cydtw(s1, s2, mask=itakura_mask(sz1, sz2))
    return cydtw(s1, s2, mask=numpy.zeros((sz1, sz2)))
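To make the formula above concrete, the score returned by dtw can be recomputed by summing squared differences along the path from dtw_path (a sanity-check sketch for the univariate case):

import numpy
from tslearn.metrics import dtw, dtw_path

s1, s2 = [1, 2, 3], [1., 2., 2., 3., 4.]
path, dist = dtw_path(s1, s2)
# Rebuild the score from the alignment path, following the formula above.
manual = numpy.sqrt(sum((s1[i] - s2[j]) ** 2 for i, j in path))
assert numpy.isclose(manual, dist) and numpy.isclose(dist, dtw(s1, s2))  # both 1.0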
Example #4
def lr_dtw_path(s1, s2, gamma=0.):
    """Compute Locally Regularized DTW (LR-DTW) similarity measure between (possibly multidimensional) time series and
    return both the (probabilistic) path and the similarity.

    It is not required that both time series share the same size, but they must be the same dimension.

    Parameters
    ----------
    s1
        A time series
    s2
        Another time series
    gamma : float (default: 0.)
        Regularization parameter

    Returns
    -------
    numpy.ndarray of shape (s1.shape[0], s2.shape[0])
        Matching path represented as a probability map
    float
        Similarity score

    See Also
    --------
    lr_dtw : LR-DTW score
    dtw : Dynamic Time Warping (DTW) score
    dtw_path : Get both the matching path and the similarity score for DTW
    """
    s1 = to_time_series(s1)
    s2 = to_time_series(s2)
    sim, probas = cylr_dtw(s1, s2, gamma=gamma)
    path = cylr_dtw_backtrace(probas)
    return path, sim
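In contrast to dtw_path, the returned path here is soft: a probability map rather than a list of index pairs. A minimal sketch (import path assumed, as in Example #1):

from tslearn.metrics import lr_dtw_path  # assumption: same module as lr_dtw

proba_map, score = lr_dtw_path([1, 2, 3], [1., 2., 2., 3.], gamma=1.)
# proba_map.shape == (3, 4): one row per index of s1, one column per index of s2;
# cell values act as soft match probabilities instead of a hard alignment.
print(proba_map.shape, score)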
Example #5
def dtw_subsequence_path(subseq, longseq):
    r"""Compute sub-sequence Dynamic Time Warping (DTW) similarity measure
    between a (possibly multidimensional) query and a long time series and
    return both the path and the similarity.

    DTW is computed as the Euclidean distance between aligned time series,
    i.e., if :math:`\pi` is the alignment path:

    .. math::

        DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} \|X_{i} - Y_{j}\|^2}

    Compared to traditional DTW, here, border constraints on admissible paths
    :math:`\pi` are relaxed such that :math:`\pi_0 = (0, ?)` and
    :math:`\pi_L = (N-1, ?)` where :math:`L` is the length of the considered
    path and :math:`N` is the length of the subsequence time series.

    It is not required that both time series share the same size, but they must
    be the same dimension. This implementation finds the best matching starting
    and ending positions for `subseq` inside `longseq`.

    Parameters
    ----------
    subseq : array, shape = (sz1, d)
        A query time series.
    longseq : array, shape = (sz2, d)
        A reference (supposed to be longer than `subseq`) time series.

    Returns
    -------
    list of integer pairs
        Matching path represented as a list of index pairs. In each pair, the
        first index corresponds to `subseq` and the second one corresponds to
        `longseq`.
    float
        Similarity score

    Examples
    --------
    >>> path, dist = dtw_subsequence_path([2., 3.], [1., 2., 2., 3., 4.])
    >>> path
    [(0, 2), (1, 3)]
    >>> dist
    0.0

    See Also
    --------
    dtw : Get the similarity score for DTW
    subsequence_cost_matrix: Calculate the required cost matrix
    subsequence_path: Calculate a matching path manually
    """
    subseq = to_time_series(subseq)
    longseq = to_time_series(longseq)
    acc_cost_mat = subsequence_cost_matrix(subseq=subseq,
                                           longseq=longseq)
    global_optimal_match = numpy.argmin(acc_cost_mat[-1, :])
    path = subsequence_path(acc_cost_mat, global_optimal_match)
    return path, numpy.sqrt(acc_cost_mat[-1, :][global_optimal_match])
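One common use of the relaxed border constraints is query localization: the matched span inside `longseq` can be read directly off the endpoints of the returned path.

path, dist = dtw_subsequence_path([2., 3.], [1., 2., 2., 3., 4.])
start, end = path[0][1], path[-1][1]  # indices in longseq, here 2 and 3
print(start, end, dist)  # 2 3 0.0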
Example #6
def lb_keogh(ts_query, ts_candidate=None, radius=1, envelope_candidate=None):
    """Compute LB_Keogh.

    LB_Keogh was originally presented in [1]_.

    Parameters
    ----------
    ts_query : array-like
        Query time-series to compare to the envelope of the candidate.
    ts_candidate : array-like or None (default: None)
        Candidate time-series. None means the envelope is provided via the `envelope_candidate` parameter and hence
        does not need to be computed again.
    radius : int (default: 1)
        Radius to be used for the envelope generation (the envelope at time index i will be generated based on
        all observations from the candidate time series at indices comprised between i-radius and i+radius). Not used
        if `ts_candidate` is None.
    envelope_candidate: pair of array-like (envelope_down, envelope_up) or None (default: None)
        Pre-computed envelope of the candidate time series. If set to None, it is computed based on `ts_candidate`.

    Returns
    -------
    float
        Distance between the query time series and the envelope of the candidate time series.

    Examples
    --------
    >>> ts1 = [1, 2, 3, 2, 1]
    >>> ts2 = [0, 0, 0, 0, 0]
    >>> env_low, env_up = lb_envelope(ts1, radius=1)
    >>> lb_keogh(ts_query=ts2, envelope_candidate=(env_low, env_up))  # doctest: +ELLIPSIS
    2.8284...
    >>> lb_keogh(ts_query=ts2, ts_candidate=ts1, radius=1)  # doctest: +ELLIPSIS
    2.8284...

    See also
    --------
    lb_envelope : Compute LB_Keogh-related envelope

    References
    ----------
    .. [1] Keogh, E. Exact indexing of dynamic time warping. In International Conference on Very Large Data Bases, 2002.
       pp 406-417.
    """
    if ts_candidate is None:
        envelope_down, envelope_up = envelope_candidate
    else:
        ts_candidate = to_time_series(ts_candidate)
        assert ts_candidate.shape[1] == 1, \
            "LB_Keogh is available only for monodimensional time series"
        envelope_down, envelope_up = lb_envelope(ts_candidate, radius)
    ts_query = to_time_series(ts_query)
    assert ts_query.shape[1] == 1, \
        "LB_Keogh is available only for monodimensional time series"
    indices_up = ts_query[:, 0] > envelope_up[:, 0]
    indices_down = ts_query[:, 0] < envelope_down[:, 0]
    return numpy.sqrt(
        numpy.linalg.norm(ts_query[indices_up, 0] - envelope_up[indices_up, 0]) ** 2 +
        numpy.linalg.norm(ts_query[indices_down, 0] - envelope_down[indices_down, 0]) ** 2)
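A sketch of the classic use of LB_Keogh: pruning candidates in nearest-neighbour search. `query` and `candidates` are hypothetical placeholders; note that the bound is valid for DTW restricted to a Sakoe-Chiba band of the same radius.

from tslearn.metrics import dtw, lb_keogh

best = float("inf")
for candidate in candidates:  # hypothetical iterable of monodimensional series
    # Cheap lower bound first: if it already exceeds the current best,
    # band-constrained DTW cannot do better, so skip the full computation.
    if lb_keogh(ts_query=query, ts_candidate=candidate, radius=1) >= best:
        continue
    best = min(best, dtw(query, candidate,
                         global_constraint="sakoe_chiba", sakoe_chiba_radius=1))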
Example #7
def test_dtw_subseq_path():
    subseq, longseq = [1, 4], [1., 2., 2., 3., 4.]
    subseq = to_time_series(subseq)
    longseq = to_time_series(longseq)
    cost_matrix = tslearn.metrics.subsequence_cost_matrix(subseq, longseq)

    path = tslearn.metrics.subsequence_path(cost_matrix, 3)
    np.testing.assert_equal(path, [(0, 2), (1, 3)])

    path = tslearn.metrics.subsequence_path(cost_matrix, 1)
    np.testing.assert_equal(path, [(0, 0), (1, 1)])
Example #8
def unnormalized_gak(s1, s2, sigma=1.):
    r"""Compute Global Alignment Kernel (GAK) between (possibly
    multidimensional) time series and return it.

    It is not required that both time series share the same size, but they must
    be the same dimension. GAK was
    originally presented in [1]_.
    This is an unnormalized version.

    Parameters
    ----------
    s1
        A time series
    s2
        Another time series
    sigma : float (default 1.)
        Bandwidth of the internal Gaussian kernel used for GAK

    Returns
    -------
    float
        Kernel value

    Examples
    --------
    >>> unnormalized_gak([1, 2, 3],
    ...                  [1., 2., 2., 3.],
    ...                  sigma=2.)  # doctest: +ELLIPSIS
    15.358...
    >>> unnormalized_gak([1, 2, 3],
    ...                  [1., 2., 2., 3., 4.])  # doctest: +ELLIPSIS
    3.166...

    See Also
    --------
    gak : normalized version of GAK that ensures that k(x,x) = 1
    cdist_gak : Compute cross-similarity matrix using Global Alignment kernel

    References
    ----------
    .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011.
    """
    s1 = to_time_series(s1, remove_nans=True)
    s2 = to_time_series(s2, remove_nans=True)

    gram = _gak_gram(s1, s2, sigma=sigma)

    gak_val = njit_gak(s1, s2, gram)
    return gak_val
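The normalized variant listed under See Also can be recovered from three unnormalized values, assuming the usual kernel normalization k(x, y) / sqrt(k(x, x) * k(y, y)) implied by "k(x,x) = 1":

import numpy

s1, s2 = [1, 2, 3], [1., 2., 2., 3.]
k12 = unnormalized_gak(s1, s2, sigma=2.)
k11 = unnormalized_gak(s1, s1, sigma=2.)
k22 = unnormalized_gak(s2, s2, sigma=2.)
# Should match gak(s1, s2, sigma=2.), i.e. roughly 0.839 (see Example #15).
print(k12 / numpy.sqrt(k11 * k22))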
Example #9
    def transform(
            self,
            series: np.ndarray,
            sample_period: int = 6) -> Union[np.ndarray, Iterable, int, float]:
        if isinstance(self.transformer,
                      approximation.DiscreteFourierTransform):
            n_coefs = self.transformer.n_coefs
            series = tsutils.to_time_series(series)
            series = np.reshape(series, (1, -1))
            n_samples, n_timestamps = series.shape
            self.transformer.drop_sum = True
            X_dft = self.transformer.fit_transform(series)

            # Compute the inverse transformation
            if n_coefs % 2 == 0:
                real_idx = np.arange(1, n_coefs, 2)
                imag_idx = np.arange(2, n_coefs, 2)
                X_dft_new = np.c_[X_dft[:, :1], X_dft[:, real_idx] +
                                  1j * np.c_[X_dft[:, imag_idx],
                                             np.zeros((n_samples, ))]]
            else:
                real_idx = np.arange(1, n_coefs, 2)
                imag_idx = np.arange(2, n_coefs + 1, 2)
                X_dft_new = np.c_[X_dft[:, :1],
                                  X_dft[:, real_idx] + 1j * X_dft[:, imag_idx]]
            X_irfft = np.fft.irfft(X_dft_new, n_timestamps)
            debug('PytsTransformerWrapper ts_representation shape {}'.format(
                np.shape(X_irfft)))

            return np.ravel(X_irfft)
        else:
            raise NotImplementedError("Pyts doesn't support transform")
Example #10
def test_shapelets():
    pytest.importorskip('keras')
    from tslearn.shapelets import ShapeletModel

    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)
    clf.fit(time_series, y)
    np.testing.assert_allclose(clf.shapelets_[0],
                               np.array([[0.56373, 0.494684],
                                         [1.235707, 1.119235]]),
                               atol=1e-2)
    np.testing.assert_allclose(
        clf.predict(time_series),
        np.array([0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0]))

    cross_validate(clf, time_series, y, cv=2)

    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    for shp, shp_bis in zip(model.shapelets_, model.shapelets_as_time_series_):
        np.testing.assert_allclose(shp,
                                   to_time_series(shp_bis, remove_nans=True))
Example #11
def dataToSeries(dataset):
    # Map each record of the dataset to a row, then stack all rows
    # into a single (possibly multidimensional) time series.
    rowArray = [mapper(record) for record in dataset]
    return to_time_series(rowArray)
Example #12
    def shapelets_(self):
        total_n_shp = sum(self.n_shapelets_per_size.values())
        shapelets = numpy.empty((total_n_shp, ), dtype=object)
        idx = 0
        for i in range(self._n_shapelet_sizes):
            for shp in self.model.get_layer("shapelets_%d" % i).get_weights()[0]:
                shapelets[idx] = to_time_series(shp)
                idx += 1
        assert idx == total_n_shp
        return shapelets
Example #13
    def __init__(self, X, Y):
        """
        Parameters
        ----------
        X: array, shape = [m, d]
            First time series.
        Y: array, shape = [n, d]
            Second time series.

        Examples
        --------
        >>> SquaredEuclidean([1, 2, 2, 3], [1, 2, 3, 4]).compute()
        array([[ 0.,  1.,  4.,  9.],
               [ 1.,  0.,  1.,  4.],
               [ 1.,  0.,  1.,  4.],
               [ 4.,  1.,  0.,  1.]])
        """
        self.X = to_time_series(X).astype(numpy.float64)
        self.Y = to_time_series(Y).astype(numpy.float64)
Example #14
def dtw_subsequence_path(subseq, longseq):
    """Compute sub-sequence Dynamic Time Warping (DTW) similarity measure between a (possibly multidimensional)
    query and a long time series and return both the path and the similarity.

    DTW is computed as the Euclidean distance between aligned time series, i.e., if :math:`P` is the alignment path:
    .. math::
        DTW(X, Y) = \\sqrt{\\sum_{(i, j) \\in P} \\|X_{i} - Y_{j}\\|^2}

    It is not required that both time series share the same size, but they must be the same dimension.
    This implementation finds the best matching starting and ending positions for `subseq` inside `longseq`.

    Parameters
    ----------
    subseq
        A query time series.
    longseq
        A reference (supposed to be longer than `subseq`) time series.

    Returns
    -------
    list of integer pairs
        Matching path represented as a list of index pairs. In each pair, the first index corresponds to `subseq` and
        the second one corresponds to `longseq`
    float
        Similarity score

    Examples
    --------
    >>> path, dist = dtw_subsequence_path([2, 3], [1., 2., 2., 3., 4.])
    >>> path
    [(0, 2), (1, 3)]
    >>> dist
    0.0

    See Also
    --------
    dtw : Get the similarity score for DTW
    """
    subseq = to_time_series(subseq)
    longseq = to_time_series(longseq)
    return cydtw_subsequence_path(subseq=subseq, longseq=longseq)
Example #15
def gak(s1, s2, sigma=1.):
    """Compute Global Alignment Kernel (GAK) between (possibly multidimensional) time series and return it.

    It is not required that both time series share the same size, but they must be the same dimension. GAK was
    originally presented in [1]_.
    This is a normalized version that ensures that :math:`k(x,x) = 1` for all :math:`x` and
    :math:`k(x,y) \\in [0, 1]` for all :math:`x, y`.

    Parameters
    ----------
    s1
        A time series
    s2
        Another time series
    sigma : float (default 1.)
        Bandwidth of the internal Gaussian kernel used for GAK

    Returns
    -------
    float
        Kernel value

    Examples
    --------
    >>> gak([1, 2, 3], [1., 2., 2., 3.], sigma=2.)  # doctest: +ELLIPSIS
    0.839...
    >>> gak([1, 2, 3], [1., 2., 2., 3., 4.])  # doctest: +ELLIPSIS
    0.273...

    See Also
    --------
    cdist_gak : Compute cross-similarity matrix using Global Alignment kernel

    References
    ----------
    .. [1] M. Cuturi, "Fast global alignment kernels," ICML 2011.
    """
    s1 = to_time_series(s1)
    s2 = to_time_series(s2)
    return cynormalized_gak(s1, s2, sigma)
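For a whole dataset, the cdist_gak helper listed under See Also builds the kernel Gram matrix; a short sketch:

from tslearn.metrics import cdist_gak

dataset = [[1, 2, 3], [1., 2., 2., 3.], [1., 2., 2., 3., 4.]]
K = cdist_gak(dataset, sigma=2.)
# K is a symmetric (3, 3) matrix with ones on the diagonal,
# since the normalized kernel guarantees k(x, x) = 1.
print(K.shape)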
Example #16
def lb_envelope(ts, radius=1):
    r"""Compute time-series envelope as required by LB_Keogh.

    LB_Keogh was originally presented in [1]_.

    Parameters
    ----------
    ts : array-like
        Time-series for which the envelope should be computed.
    radius : int (default: 1)
        Radius to be used for the envelope generation (the envelope at time
        index i will be generated based on
        all observations from the time series at indices comprised between
        i-radius and i+radius).

    Returns
    -------
    array-like
        Lower-side of the envelope.
    array-like
        Upper-side of the envelope.

    Examples
    --------
    >>> ts1 = [1, 2, 3, 2, 1]
    >>> env_low, env_up = lb_envelope(ts1, radius=1)
    >>> env_low
    array([[1.],
           [1.],
           [2.],
           [1.],
           [1.]])
    >>> env_up
    array([[2.],
           [3.],
           [3.],
           [3.],
           [2.]])

    See also
    --------
    lb_keogh : Compute LB_Keogh similarity

    References
    ----------
    .. [1] Keogh, E. Exact indexing of dynamic time warping. In International
       Conference on Very Large Data Bases, 2002. pp 406-417.
    """
    return njit_lb_envelope(to_time_series(ts), radius=radius)
Example #17
def lb_envelope(ts, radius=1):
    """Compute time-series envelope as required by LB_Keogh.

    LB_Keogh was originally presented in [1]_ and the multivariate extension in [2]_.

    Parameters
    ----------
    ts : array-like
        Time-series for which the envelope should be computed.
    radius : int (default: 1)
        Radius to be used for the envelope generation (the envelope at time index i will be generated based on
        all observations from the time series at indices comprised between i-radius and i+radius).

    Returns
    -------
    array-like
        Lower-side of the envelope.
    array-like
        Upper-side of the envelope.

    Examples
    --------
    >>> ts1 = [1, 2, 3, 2, 1]
    >>> env_low, env_up = lb_envelope(ts1, radius=1)
    >>> env_low
    array([[ 1.],
           [ 1.],
           [ 2.],
           [ 1.],
           [ 1.]])
    >>> env_up
    array([[ 2.],
           [ 3.],
           [ 3.],
           [ 3.],
           [ 2.]])

    See also
    --------
    lb_keogh : Compute LB_Keogh similarity

    References
    ----------
    .. [1] Keogh, E. Exact indexing of dynamic time warping. In International Conference on Very Large Data Bases, 2002.
       pp 406-417.
    .. [2] Rath, T. M., & Manmatha, R. Lower-bounding of dynamic time warping distances for multivariate time series. University of Massachusetts Amherst Technical Report MM, 40, 2002.
    """
    return cylb_envelope(to_time_series(ts), radius=radius)
Example #18
def test_shapelets():
    pytest.importorskip('tensorflow')
    from tslearn.shapelets import ShapeletModel
    import tensorflow as tf

    n, sz, d = 15, 10, 2
    rng = np.random.RandomState(0)
    time_series = rng.randn(n, sz, d)
    y = rng.randint(2, size=n)
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer="sgd",
                        random_state=0)

    cross_validate(clf, time_series, y, cv=2)

    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        optimizer=tf.optimizers.Adam(.1),
                        random_state=0)
    cross_validate(clf, time_series, y, cv=2)

    model = ShapeletModel(n_shapelets_per_size={3: 2, 4: 1}, max_iter=1)
    model.fit(time_series, y)
    for shp, shp_bis in zip(model.shapelets_,
                            model.shapelets_as_time_series_):
        np.testing.assert_allclose(shp,
                                   to_time_series(shp_bis, remove_nans=True))

    # Test set_weights / get_weights
    clf = ShapeletModel(n_shapelets_per_size={2: 5},
                        max_iter=1,
                        verbose=0,
                        random_state=0)
    clf.fit(time_series, y)
    preds_before = clf.predict_proba(time_series)
    weights = clf.get_weights()
    # Change number of iterations, then refit, then set weights
    clf.max_iter *= 2
    clf.fit(time_series, y)
    clf.set_weights(weights)
    np.testing.assert_allclose(preds_before,
                               clf.predict_proba(time_series))
Example #19
    def _func(self, Z):
        # Compute objective value and grad at Z.

        Z = Z.reshape(self.barycenter_.shape)

        G = numpy.zeros_like(Z)

        obj = 0

        for i in range(len(self._X_fit)):
            D = SquaredEuclidean(
                Z, to_time_series(self._X_fit[i], remove_nans=True))
            sdtw = SoftDTW(D, gamma=self.gamma)
            value = sdtw.compute()
            E = sdtw.grad()
            G_tmp = D.jacobian_product(E)
            G += self.weights[i] * G_tmp
            obj += self.weights[i] * value

        return obj, G.ravel()
Example #20
    def fit(self, X):
        """Register X as the reference series and interpolate it to get a series of size nsamples.

        Parameters
        ----------
        X : numpy.ndarray
            A time series.

        Returns
        -------
        DTWSampler
            self
        """
        X = to_time_series(X)

        end = first_non_finite_index(X)
        self.reference_series_ = _resampled(X[:end],
                                            n_samples=self.n_samples,
                                            kind=self.interp_kind)
        return self
Example #21
def to_timeseries_set(df, var):
    ids = list(df['studyID'].unique())
    selected_ids = random.sample(ids, 10)
    print(len(ids))
    ts = []
    empty = 0
    count = 0
    for Id in selected_ids:
        count += 1
        sub_df, stop = select_data(df, Id, var)
        if len(sub_df.index) > 1:
            x = sub_df.loc[sub_df['value_name'] == variables[var]
                           ['Name']]['Reltime']
            y = sub_df.loc[sub_df['value_name'] == variables[var]
                           ['Name']]['value']
            _, y_pred, _ = get_gaussian(x, y, stop, var)
            y_pred = to_time_series(y_pred)
            ts.append(y_pred)

    dat = to_time_series_dataset(ts)
    print(dat.shape)
    print("nr ids: ", count)
    return dat, empty
Example #22
def dtw_path_limited_warping_length(s1, s2, max_length):
    r"""Compute Dynamic Time Warping (DTW) similarity measure between
    (possibly multidimensional) time series under an upper bound constraint on
    the resulting path length and return the path as well as the similarity
    cost.

    DTW is computed as the Euclidean distance between aligned time series,
    i.e., if :math:`\pi` is the optimal alignment path:

    .. math::

        DTW(X, Y) = \sqrt{\sum_{(i, j) \in \pi} \|X_{i} - Y_{j}\|^2}

    Note that this formula is still valid for the multivariate case.

    It is not required that both time series share the same size, but they must
    be the same dimension. DTW was originally presented in [1]_.
    This constrained-length variant was introduced in [2]_.
    Both variants are
    discussed in more detail in our :ref:`dedicated user-guide page <dtw>`.

    Parameters
    ----------
    s1
        A time series.

    s2
        Another time series.

    max_length : int
        Maximum allowed warping path length.
        If greater than len(s1) + len(s2), then it is equivalent to
        unconstrained DTW.
        If lower than max(len(s1), len(s2)), no path can be found and a
        ValueError is raised.

    Returns
    -------
    list of integer pairs
        Optimal path

    float
        Similarity score

    Examples
    --------
    >>> path, cost = dtw_path_limited_warping_length([1, 2, 3],
    ...                                              [1., 2., 2., 3.], 5)
    >>> cost
    0.0
    >>> path
    [(0, 0), (1, 1), (1, 2), (2, 3)]
    >>> path, cost = dtw_path_limited_warping_length([1, 2, 3],
    ...                                              [1., 2., 2., 3., 4.], 5)
    >>> cost
    1.0
    >>> path
    [(0, 0), (1, 1), (1, 2), (2, 3), (2, 4)]

    See Also
    --------
    dtw_limited_warping_length : Get the similarity score for DTW with limited
        warping path length
    dtw_path : Get both the matching path and the similarity score for DTW

    References
    ----------
    .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for
           spoken word recognition," IEEE Transactions on Acoustics, Speech and
           Signal Processing, vol. 26(1), pp. 43--49, 1978.
    .. [2] Z. Zhang, R. Tavenard, A. Bailly, X. Tang, P. Tang, T. Corpetti
           Dynamic time warping under limited warping path length.
           Information Sciences, vol. 393, pp. 91--107, 2017.
    """
    s1 = to_time_series(s1, remove_nans=True)
    s2 = to_time_series(s2, remove_nans=True)

    if max_length < max(s1.shape[0], s2.shape[0]):
        raise ValueError("Cannot find a path of length {} to align given "
                         "time series.".format(max_length))

    accumulated_costs = _limited_warping_length_cost(s1, s2, max_length)
    idx_pair = (s1.shape[0] - 1, s2.shape[0] - 1)
    optimal_length = -1
    optimal_cost = numpy.inf
    for k, v in accumulated_costs[idx_pair].items():
        if v < optimal_cost:
            optimal_cost = v
            optimal_length = k
    path = _return_path_limited_warping_length(accumulated_costs,
                                               idx_pair,
                                               optimal_length)
    return path, numpy.sqrt(optimal_cost)
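The ValueError branch documented above is easy to trigger: any max_length below max(len(s1), len(s2)) leaves no admissible path.

try:
    dtw_path_limited_warping_length([1, 2, 3], [1., 2., 2., 3., 4.],
                                    max_length=4)  # max(3, 5) == 5 > 4
except ValueError as e:
    print(e)  # Cannot find a path of length 4 to align given time series.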
Example #23
normalized_flow = scaler_flow_train.transform(flow)
normalized_flow
#from array to list
normalized_flow = normalized_flow.tolist()
len(normalized_flow)
from toolz.itertoolz import sliding_window, partition
#for every day of the train set store the flow observations
day_flow = list(partition(240, normalized_flow))
day_flow
len(day_flow)
#from list to multidimensional array
day_flow = np.asarray(day_flow)
day_flow
from tslearn.utils import to_time_series, to_time_series_dataset
#create univariate series for normalized flow_observation
first_time_series = to_time_series(day_flow)
print(first_time_series.shape)

#treatment of density variable
density = df.loc[:, 'Density']
#normalization/standardization of train data
density = np.array(density)
density = density.reshape((len(density), 1))
#fit train data
scaler_density_train = scaler.fit(density)
print('Min: %f, Max: %f' %
      (scaler_density_train.data_min_, scaler_density_train.data_max_))
#scale train data
normalized_density = scaler_density_train.transform(density)
normalized_density
#from array to list
Example #24
                                          documents[["year", "month"]])
 # names of all years and months without duplicates
 months_grouped = documents[["year", "month"]].drop_duplicates()
 # making a DataFrame with words as column names and dates as the index.
 tfidf_monthly_dataframe = pd.DataFrame(
     tfidf_avg_monthly.toarray(),
     columns=vocabulary["word"],
     index=pd.to_datetime({
         "year": months_grouped.year,
         "month": months_grouped.month,
         "day": 1
     }),
 )
 # time series - each row (word) is one time series
 # each time series is an array of 72 monthly values.
 time_series = to_time_series(tfidf_monthly_dataframe.values.transpose())
 N_clusters = 7
 model = TimeSeriesKMeans(N_clusters)
 vocabulary["cluster"] = model.fit_predict(time_series)
 # mapping cluster numbers to colors
 colors = pd.DataFrame(pl.cm.jet(np.linspace(0, 1, N_clusters)))
 vocabulary.sort_values(["cluster", "relevance"],
                        inplace=True,
                        ascending=False)
 # getting the most relevant words for each topic
 topics = (vocabulary[["cluster", "word"]].groupby("cluster").agg({
     "word":
     lambda words: ", ".join(words[:15]),
 })).reset_index().rename({'Index': 'cluster'})
 clusters_centers = pd.DataFrame(
     model.cluster_centers_.reshape((N_clusters, -1)).transpose(),
Example #25
from tslearn.utils import to_time_series
my_first_time_series = [1, 3, 4, 2]
formatted_time_series = to_time_series(my_first_time_series)
print(formatted_time_series.shape)
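# Expected output: (4, 1); to_time_series turns a univariate list into a 2-D array of shape (sz, 1).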
Example #26
    def sklearn_metric(x, y):
        return metric(to_time_series(x.reshape((sz, d)), remove_nans=True),
                      to_time_series(y.reshape((sz, d)), remove_nans=True))
Example #27
normalized_flow = scaler_flow_train.transform(flow)
normalized_flow
#from array to list
normalized_flow = normalized_flow.tolist()
len(normalized_flow)
from toolz.itertoolz import sliding_window, partition
#for every day of the train set store the flow observations
day_flow = list(partition(48, normalized_flow))
day_flow
len(day_flow)
#from list to multidimensional array
day_flow = np.asarray(day_flow)
day_flow
from tslearn.utils import to_time_series, to_time_series_dataset
#create univariate series for normalized flow_observation
first_time_series = to_time_series(day_flow)
print(first_time_series.shape)

#treatment of speed variable
speed = df.loc[:, 'Speed']
#normalization/standardization of train data
speed = np.array(speed)
speed = speed.reshape((len(speed), 1))
#fit train data
scaler_speed_train = scaler.fit(speed)
print('Min: %f, Max: %f' %
      (scaler_speed_train.data_min_, scaler_speed_train.data_max_))
#scale train data
normalized_speed = scaler_speed_train.transform(speed)
normalized_speed
#from array to list
Example #28
def softdtw_barycenter(X,
                       gamma=1.0,
                       weights=None,
                       method="L-BFGS-B",
                       tol=1e-3,
                       max_iter=50,
                       init=None):
    """Compute barycenter (time series averaging) under the soft-DTW [1]
    geometry.

    Soft-DTW was originally presented in [1]_.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.
    gamma: float
        Regularization parameter.
        Lower is less smoothed (closer to true DTW).
    weights: None or array
        Weights of each X[i]. Must be of the same length as X.
        If None, uniform weights are used.
    method: string
        Optimization method, passed to `scipy.optimize.minimize`.
        Default: L-BFGS.
    tol: float
        Tolerance of the method used.
    max_iter: int
        Maximum number of iterations.
    init: array or None (default: None)
        Initial barycenter to start from for the optimization process.
        If `None`, euclidean barycenter is used as a starting point.

    Returns
    -------
    numpy.array of shape (bsz, d) where `bsz` is the size of the `init` array \
            if provided or `sz` otherwise
        Soft-DTW barycenter of the provided time series dataset.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5)
    array([[1.25161574],
           [2.03821705],
           [3.5101956 ],
           [4.36140605]])
    >>> time_series = [[1, 2, 3, 4], [1, 2, 3, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5)
    array([[1.21349933],
           [1.8932251 ],
           [2.67573269],
           [3.51057026],
           [4.33645802]])

    References
    ----------
    .. [1] M. Cuturi, M. Blondel "Soft-DTW: a Differentiable Loss Function for
       Time-Series," ICML 2017.
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    if init is None:
        if check_equal_size(X_):
            barycenter = euclidean_barycenter(X_, weights)
        else:
            resampled_X = TimeSeriesResampler(sz=X_.shape[1]).fit_transform(X_)
            barycenter = euclidean_barycenter(resampled_X, weights)
    else:
        barycenter = init

    if max_iter > 0:
        X_ = numpy.array([to_time_series(d, remove_nans=True) for d in X_])

        def f(Z):
            return _softdtw_func(Z, X_, weights, barycenter, gamma)

        # The function works with vectors so we need to vectorize barycenter.
        res = minimize(f,
                       barycenter.ravel(),
                       method=method,
                       jac=True,
                       tol=tol,
                       options=dict(maxiter=max_iter, disp=False))
        return res.x.reshape(barycenter.shape)
    else:
        return barycenter
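A small sketch of the weights parameter documented above (values purely illustrative):

time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
# One weight per series in X; here the first series counts three times as much.
bar = softdtw_barycenter(time_series, gamma=1., weights=[0.75, 0.25], max_iter=5)
print(bar.shape)  # (4, 1)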
Example #29
def softdtw_barycenter(X,
                       gamma=1.0,
                       weights=None,
                       method="L-BFGS-B",
                       tol=1e-3,
                       max_iter=50,
                       init=None):
    """Compute barycenter (time series averaging) under the soft-DTW geometry.

    Parameters
    ----------
    X : array-like, shape=(n_ts, sz, d)
        Time series dataset.
    gamma: float
        Regularization parameter.
        Lower is less smoothed (closer to true DTW).
    weights: None or array
        Weights of each X[i]. Must be of the same length as X.
    method: string
        Optimization method, passed to `scipy.optimize.minimize`.
        Default: L-BFGS.
    tol: float
        Tolerance of the method used.
    max_iter: int
        Maximum number of iterations.
    init: array or None (default: None)
        Initial barycenter to start from for the optimization process.
        If `None`, euclidean barycenter is used as a starting point.

    Examples
    --------
    >>> time_series = [[1, 2, 3, 4], [1, 2, 4, 5]]
    >>> euc_bar = euclidean_barycenter(time_series)
    >>> stdw_bar = softdtw_barycenter(time_series, max_iter=0)
    >>> stdw_bar.shape
    (4, 1)
    >>> numpy.alltrue(numpy.abs(euc_bar - stdw_bar) < 1e-9)  # Because 0 iterations were performed
    True
    >>> softdtw_barycenter(time_series, max_iter=5).shape
    (4, 1)
    >>> time_series = [[1, 2, 3, 4], [1, 2, 3, 4, 5]]
    >>> softdtw_barycenter(time_series, max_iter=5).shape
    (5, 1)
    """
    X_ = to_time_series_dataset(X)
    weights = _set_weights(weights, X_.shape[0])
    if init is None:
        if check_equal_size(X_):
            barycenter = euclidean_barycenter(X_, weights)
        else:
            resampled_X = TimeSeriesResampler(sz=X_.shape[1]).fit_transform(X_)
            barycenter = euclidean_barycenter(resampled_X, weights)
    else:
        barycenter = init

    if max_iter > 0:
        X_ = numpy.array([to_time_series(d, remove_nans=True) for d in X_])
        f = lambda Z: _softdtw_func(Z, X_, weights, barycenter, gamma)
        # The function works with vectors so we need to vectorize barycenter.
        res = minimize(f,
                       barycenter.ravel(),
                       method=method,
                       jac=True,
                       tol=tol,
                       options=dict(maxiter=max_iter, disp=False))
        return res.x.reshape(barycenter.shape)
    else:
        return barycenter
Example #30
def silhouette_score(X,
                     labels,
                     metric=None,
                     sample_size=None,
                     metric_params=None,
                     random_state=None,
                     **kwds):
    """Compute the mean Silhouette Coefficient of all samples (cf.  [1]_ and  [2]_).

    Read more in the `scikit-learn documentation
    <http://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient>`_.

    Parameters
    ----------
    X : array [n_ts, n_ts] if metric == "precomputed", or, \
             [n_ts, sz, d] otherwise
        Array of pairwise distances between time series, or a time series dataset.
    labels : array, shape = [n_ts]
         Predicted labels for each time series.
    metric : string, or callable
        The metric to use when calculating distance between time series.
        Should be one of {'dtw', 'softdtw', 'euclidean'} or a callable distance
        function.
        If X is the distance array itself, use ``metric="precomputed"``.
    sample_size : int or None
        The size of the sample to use when computing the Silhouette Coefficient
        on a random subset of the data.
        If ``sample_size is None``, no sampling is used.
    metric_params : dict or None
        Parameter values for the chosen metric. Value associated to the `"gamma_sdtw"` key corresponds to the gamma
        parameter in Soft-DTW.
    random_state : int, RandomState instance or None, optional (default=None)
        The generator used to randomly select a subset of samples.  If int,
        random_state is the seed used by the random number generator; If
        RandomState instance, random_state is the random number generator; If
        None, the random number generator is the RandomState instance used by
        `np.random`. Used when ``sample_size is not None``.
    **kwds : optional keyword parameters
        Any further parameters are passed directly to the distance function.

    Returns
    -------
    silhouette : float
        Mean Silhouette Coefficient for all samples.

    References
    ----------
    .. [1] `Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
       Interpretation and Validation of Cluster Analysis". Computational
       and Applied Mathematics 20: 53-65.
       <http://www.sciencedirect.com/science/article/pii/0377042787901257>`_
    .. [2] `Wikipedia entry on the Silhouette Coefficient
           <https://en.wikipedia.org/wiki/Silhouette_(clustering)>`_

    Examples
    --------
    >>> from tslearn.generators import random_walks
    >>> from tslearn.metrics import cdist_dtw
    >>> X = random_walks(n_ts=50, sz=32, d=1)
    >>> labels = numpy.random.randint(2, size=50)
    >>> s_sc = silhouette_score(X, labels, metric="dtw")
    >>> s_sc2 = silhouette_score(X, labels, metric="softdtw")
    >>> s_sc2b = silhouette_score(X, labels, metric="softdtw", metric_params={"gamma_sdtw": 2.})
    >>> s_sc3 = silhouette_score(cdist_dtw(X), labels, metric="precomputed")
    """
    sklearn_metric = None
    if metric_params is None:
        metric_params = {}
    if metric == "precomputed":
        sklearn_X = X
    elif metric == "dtw":
        sklearn_X = cdist_dtw(X)
    elif metric == "softdtw":
        gamma = metric_params.get("gamma_sdtw", None)
        if gamma is not None:
            sklearn_X = cdist_soft_dtw(X, gamma=gamma)
        else:
            sklearn_X = cdist_soft_dtw(X)
    elif metric == "euclidean":
        sklearn_X = cdist(X, X, metric="euclidean")
    else:
        X_ = to_time_series_dataset(X)
        n, sz, d = X_.shape
        sklearn_X = X_.reshape((n, -1))
        if metric is None:
            metric = dtw
        sklearn_metric = lambda x, y: metric(
            to_time_series(x.reshape((sz, d)), remove_nans=True),
            to_time_series(y.reshape((sz, d)), remove_nans=True))
    return sklearn_silhouette_score(
        X=sklearn_X,
        labels=labels,
        metric="precomputed" if sklearn_metric is None else sklearn_metric,
        sample_size=sample_size,
        random_state=random_state,
        **kwds)
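The final else-branch also accepts a user-supplied callable operating on two (sz, d) series; a sketch using tslearn's soft_dtw as that callable (illustrative, not the only choice):

import numpy
from tslearn.generators import random_walks
from tslearn.metrics import soft_dtw

X = random_walks(n_ts=20, sz=16, d=1)
labels = numpy.random.randint(2, size=20)
# Each series is reshaped to (sz, d) by the sklearn_metric wrapper before the call.
score = silhouette_score(X, labels, metric=lambda x, y: soft_dtw(x, y, gamma=1.))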