def prepare(self, m, series, query=None):
        """
        Prepares a bound z-normalised Euclidean distance calculator for fixed (non-streaming) data.

        :param m: subsequence length, passed on to BoundZNormEuclidean
        :param series: one-dimensional array (exposing ``ndim`` and ``shape``) with the series data
        :param query: optional one-dimensional array with the query data; when None a self-join is
            prepared, in which the series buffer is also used as query buffer
        :return: a BoundZNormEuclidean to which the series (and the query, if given) were appended
        :raises RuntimeError: if series or query is not one-dimensional
        """
        if series.ndim != 1:
            raise RuntimeError("Series should be 1D")
        if query is not None and query.ndim != 1:
            raise RuntimeError("Query should be 1D")

        # The builtin float is used instead of np.float: the latter was only an alias of the
        # builtin and has been removed from NumPy (1.24), where it raises an AttributeError.
        series_buffer = RingBuffer(None,
                                   shape=series.shape,
                                   dtype=float,
                                   scaling_factor=1)

        self_join = query is None
        if self_join:
            # Self-join: series and query share one and the same buffer.
            query_buffer = series_buffer
        else:
            query_buffer = RingBuffer(None,
                                      shape=query.shape,
                                      dtype=float,
                                      scaling_factor=1)

        result = BoundZNormEuclidean(m, series_buffer, query_buffer, self_join,
                                     self.noise_std, 1)

        # The buffers start out empty, the actual data is fed through the bound calculator.
        result.append_series(series)
        if not self_join:
            result.append_query(query)

        return result
Example #2
0
    def calc_column(self, column):
        """
        Calculates the distances for one column of the distance matrix.

        If the previous call handled ``column - 1``, the squared distances cached from that call are
        updated incrementally. Otherwise (or for column 0) the column is calculated from scratch.
        The squared distances and the column index are cached for the next call.

        :param column: index of the column to calculate
        :return: the square root of the squared distances of the requested column
        """
        if self.prev_calc_column_index != column - 1 or column == 0:
            # Previous column not cached or data for incremental calculation not available: full calculation
            sq_dist = _euclidean_distance_squared(self.query.view, self.series[column:column + self.m])
        else:
            # Previous column cached, reuse it
            if self.first_row is None:
                # Lazily build the cache holding the first row of the (squared) distance matrix:
                # the full series against the first query subsequence.
                self.first_row = RingBuffer(_euclidean_distance_squared(self.series.view, self.query[0: self.m]),
                                            shape=(self.series.max_shape[0] - self.m + 1,))
                self.first_row_backlog = 0
            elif self.first_row_backlog > 0:
                # Series has been updated since last calculation of first_row
                # Only the trailing part of the series is needed: the backlog plus the m - 1 preceding
                # points (presumably yielding exactly first_row_backlog new entries - TODO confirm
                # against _euclidean_distance_squared).
                elems_to_recalc = self.first_row_backlog + self.m - 1
                self.first_row.push(_euclidean_distance_squared(self.series[-elems_to_recalc:], self.query[0: self.m]))
                self.first_row_backlog = 0

            # NOTE: sq_dist is the very same object as self.prev_calc_column_sq_dist, so the cached
            # values of the previous column are overwritten by the assignments below.
            sq_dist = self.prev_calc_column_sq_dist  # work in same array
            # Entry i (i >= 1) follows from entry i - 1 of the previous column: subtract the squared
            # difference of the points leaving the window and add that of the points entering it.
            sq_dist[1:] = (self.prev_calc_column_sq_dist[:-1]
                           - np.square(self.series[column - 1] - self.query[:len(self.query.view)-self.m])
                           + np.square(self.series[column + self.m - 1] - self.query[self.m:]))
            # Entry 0 has no predecessor on its diagonal, it is read from the first-row cache.
            sq_dist[0] = self.first_row[column]

        self.prev_calc_column_sq_dist = sq_dist
        self.prev_calc_column_index = column

        return np.sqrt(sq_dist)
    def initialise(self, dims, query_subseq, series_subseq):
        """
        Runs the parent initialisation and then wraps the resulting range, profile and index
        attributes in RingBuffers that use the configured scaling factor.

        :param dims: forwarded unchanged to the parent initialise
        :param query_subseq: forwarded unchanged to the parent initialise
        :param series_subseq: forwarded unchanged to the parent initialise
        """
        super().initialise(dims, query_subseq, series_subseq)

        # Attributes set up by the parent, in the order in which they get wrapped.
        wrapped_attributes = (
            "_range",
            "_matrix_profile_left",
            "_profile_index_left",
            "_matrix_profile_right",
            "_profile_index_right",
        )
        for attribute in wrapped_attributes:
            initial_data = getattr(self, attribute)
            setattr(self, attribute,
                    RingBuffer(initial_data, scaling_factor=self._rb_scale_factor))
    def test_one_dimensional(self):
        """Pushing onto a full one-dimensional buffer keeps only the most recent elements."""
        initial = [0, 1, 2, 3, 4]
        buffer = RingBuffer(initial)
        npt.assert_equal(buffer.view, np.array(initial))
        npt.assert_equal(buffer.max_shape, (5, ))

        # Pushing nothing leaves the content untouched and reports a falsy result.
        self.assertFalse(buffer.push([]))
        npt.assert_equal(buffer.view, np.array(initial))
        self.assertEqual(buffer[0], 0)

        # Each step: the values pushed one after another, and the content expected afterwards.
        steps = (
            ((5, ), [1, 2, 3, 4, 5]),
            (([6], [7]), [3, 4, 5, 6, 7]),
            (([8, 9, 10], ), [6, 7, 8, 9, 10]),
            (([11, 12, 13, 14], ), [10, 11, 12, 13, 14]),
            (([15, 16, 17, 18, 19], ), [15, 16, 17, 18, 19]),
            (([20, 21, 22, 23, 24, 25], ), [21, 22, 23, 24, 25]),
        )
        for pushes, expected in steps:
            for pushed in pushes:
                self.assertTrue(buffer.push(pushed))
            npt.assert_equal(buffer.view, np.array(expected))
            self.assertEqual(buffer[0], expected[0])
Example #5
0
    def prepare_streaming(self, m, series_window, query_window=None):
        """
        Prepares a bound Euclidean distance calculator for streaming data, backed by empty buffers.

        :param m: subsequence length, passed on to BoundStreamingEuclidean
        :param series_window: number of data points the series buffer can hold
        :param query_window: number of data points the query buffer can hold; when None a
            self-join is prepared, in which the series buffer is also used as query buffer
        :return: a BoundStreamingEuclidean operating on the newly created buffer(s)
        """
        # The builtin float is used instead of np.float: the latter was only an alias of the
        # builtin and has been removed from NumPy (1.24), where it raises an AttributeError.
        series = RingBuffer(None, (series_window,), dtype=float, scaling_factor=self._rb_scale_factor)

        self_join = query_window is None
        if self_join:
            query = series
        else:
            query = RingBuffer(None, (query_window,), dtype=float, scaling_factor=self._rb_scale_factor)

        return BoundStreamingEuclidean(m, series, query, self_join)
    def test_oversized_initialization(self):
        """Initial data that exceeds the buffer shape: only the most recent elements are kept."""
        # The builtin int replaces np.int, an alias that was removed from NumPy (1.24).
        buffer = RingBuffer([1, 2, 3, 4, 5, 6], shape=(5, ), dtype=int)
        npt.assert_equal(buffer.max_shape, (5, ))

        # The oldest element (1) did not fit.
        npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6]))
        self.assertEqual(buffer[0], 2)

        self.assertTrue(buffer.push([7]))
        npt.assert_equal(buffer.view, np.array([3, 4, 5, 6, 7]))
        self.assertEqual(buffer[0], 3)

        self.assertTrue(buffer.push([8, 9, 10]))
        npt.assert_equal(buffer.view, np.array([6, 7, 8, 9, 10]))
        self.assertEqual(buffer[0], 6)
    def test_partial_intialization(self):
        """Initial data smaller than the buffer shape: the buffer fills up before dropping data."""
        # The builtin int replaces np.int, an alias that was removed from NumPy (1.24).
        buffer = RingBuffer([1, 2], shape=(5, ), dtype=int)
        npt.assert_equal(buffer.max_shape, (5, ))

        npt.assert_equal(buffer.view, np.array([1, 2]))
        self.assertEqual(buffer[0], 1)

        # There is still room: nothing is dropped and push reports a falsy result.
        self.assertFalse(buffer.push([3]))
        npt.assert_equal(buffer.view, np.array([1, 2, 3]))
        self.assertEqual(buffer[0], 1)

        # Six values for five places: the oldest value is dropped.
        self.assertTrue(buffer.push([4, 5, 6]))
        npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6]))
        self.assertEqual(buffer[0], 2)
Example #8
0
    def prepare(self, m, series, query=None):
        """
        Prepares a bound Euclidean distance calculator for fixed (non-streaming) data.

        :param m: subsequence length, passed on to BoundStreamingEuclidean
        :param series: one-dimensional array with the series data
        :param query: optional one-dimensional array with the query data; when None a self-join is
            prepared, in which the series buffer is also used as query buffer
        :return: a BoundStreamingEuclidean operating on buffers that hold the given data
        :raises RuntimeError: if series or query is not one-dimensional
        """
        if series.ndim != 1:
            raise RuntimeError("Series should be 1D")
        if query is not None and query.ndim != 1:
            raise RuntimeError("Query should be 1D")

        # The builtin float is used instead of np.float: the latter was only an alias of the
        # builtin and has been removed from NumPy (1.24), where it raises an AttributeError.
        series = RingBuffer(series, dtype=float, scaling_factor=1)

        self_join = query is None
        if self_join:
            query = series
        else:
            query = RingBuffer(query, dtype=float, scaling_factor=1)

        return BoundStreamingEuclidean(m, series, query, self_join)
Example #9
0
    def prepare_streaming(self, m, series_window, query_window=None):
        """
        Prepares a bound z-normalised Euclidean distance calculator for streaming data.

        Besides the data buffers, empty buffers are created for the mean, the standard deviation
        and a "standard deviation is non-zero" flag of every subsequence.

        :param m: subsequence length
        :param series_window: number of data points the series buffer can hold
        :param query_window: number of data points the query buffer can hold; when None a
            self-join is prepared, in which all series buffers are also used for the query
        :return: a BoundZNormEuclidean operating on the newly created buffers
        """
        scale = self._rb_scale_factor

        def statistics_buffers(num_subseq):
            # One entry per subsequence: mean, standard deviation and non-zero flag.
            # The builtins float and bool replace np.float and np.bool, aliases that were
            # removed from NumPy (1.24).
            mu = RingBuffer(None, shape=(num_subseq,), dtype=float, scaling_factor=scale)
            std = RingBuffer(None, shape=(num_subseq,), dtype=float, scaling_factor=scale)
            std_nonzero = RingBuffer(None, shape=(num_subseq,), dtype=bool, scaling_factor=scale)
            return mu, std, std_nonzero

        series = RingBuffer(None, (series_window,), dtype=float, scaling_factor=scale)

        self_join = query_window is None
        if self_join:
            query = series
        else:
            query = RingBuffer(None, (query_window,), dtype=float, scaling_factor=scale)

        # A buffer of n points contains n - m + 1 subsequences of length m.
        mu_s, std_s, std_s_nonzero = statistics_buffers(series.max_shape[-1] - m + 1)

        if self_join:
            mu_q, std_q, std_q_nonzero = mu_s, std_s, std_s_nonzero
        else:
            mu_q, std_q, std_q_nonzero = statistics_buffers(query.max_shape[-1] - m + 1)

        return BoundZNormEuclidean(m, series, query, self_join, self.noise_std,
                                   mu_s, std_s, std_s_nonzero, mu_q, std_q, std_q_nonzero)
Example #10
0
    def prepare(self, m, series, query=None):
        """
        Prepares a bound z-normalised Euclidean distance calculator for fixed (non-streaming) data.

        Empty buffers sized after the data are created (data, and per-subsequence mean, standard
        deviation and "standard deviation is non-zero" flag); the data itself is then appended
        through the bound calculator.

        :param m: subsequence length
        :param series: one-dimensional array with the series data
        :param query: optional one-dimensional array with the query data; when None a self-join is
            prepared, in which all series buffers are also used for the query
        :return: a BoundZNormEuclidean to which the series (and the query, if given) were appended
        :raises RuntimeError: if series or query is not one-dimensional
        """
        if series.ndim != 1:
            raise RuntimeError("Series should be 1D")
        if query is not None and query.ndim != 1:
            raise RuntimeError("Query should be 1D")

        def buffers_for(data):
            # A sequence of n points contains n - m + 1 subsequences of length m.
            # The builtins float and bool replace np.float and np.bool, aliases that were
            # removed from NumPy (1.24).
            num_subseq = data.shape[-1] - m + 1
            data_buffer = RingBuffer(None, shape=data.shape, dtype=float, scaling_factor=1)
            mu = RingBuffer(None, shape=(num_subseq,), dtype=float, scaling_factor=1)
            std = RingBuffer(None, shape=(num_subseq,), dtype=float, scaling_factor=1)
            std_nonzero = RingBuffer(None, shape=(num_subseq,), dtype=bool, scaling_factor=1)
            return data_buffer, mu, std, std_nonzero

        series_buffer, mu_s, std_s, std_s_nonzero = buffers_for(series)

        self_join = query is None
        if self_join:
            query_buffer, mu_q, std_q, std_q_nonzero = series_buffer, mu_s, std_s, std_s_nonzero
        else:
            query_buffer, mu_q, std_q, std_q_nonzero = buffers_for(query)

        result = BoundZNormEuclidean(m, series_buffer, query_buffer, self_join, self.noise_std,
                                     mu_s, std_s, std_s_nonzero, mu_q, std_q, std_q_nonzero)

        result.append_series(series)
        if not self_join:
            result.append_query(query)

        return result
    def test_empty_intialization(self):
        """A buffer created without data starts empty; push returns the number of dropped values."""
        # The builtin int replaces np.int, an alias that was removed from NumPy (1.24).
        buffer = RingBuffer(None, shape=(5, ), dtype=int)
        npt.assert_equal(buffer.max_shape, (5, ))

        npt.assert_equal(buffer.view, np.array([]))

        self.assertEqual(buffer.push([1]), 0)
        npt.assert_equal(buffer.view, np.array([1]))
        self.assertEqual(buffer[0], 1)

        self.assertEqual(buffer.push([2, 3]), 0)
        npt.assert_equal(buffer.view, np.array([1, 2, 3]))
        self.assertEqual(buffer[0], 1)

        # Six values for five places: one value is dropped.
        self.assertEqual(buffer.push([4, 5, 6]), 1)
        npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6]))
        self.assertEqual(buffer[0], 2)
    def test_multi_dimensional(self):
        """Pushing onto a full two-row buffer shifts along the last axis; push returns the shift."""

        def with_negated_row(top_row):
            # The second row always holds the negated values of the first row.
            return [list(top_row), [-value for value in top_row]]

        buffer = RingBuffer(with_negated_row([0, 1, 2, 3, 4]))
        npt.assert_equal(buffer.view, np.array(with_negated_row([0, 1, 2, 3, 4])))
        npt.assert_equal(buffer.max_shape, (2, 5))

        # Each step: first row of the pushed data, and first row of the content expected afterwards.
        # The buffer is full from the start, so every pushed column drops one column.
        steps = (
            ([], [0, 1, 2, 3, 4]),
            ([5], [1, 2, 3, 4, 5]),
            ([6, 7], [3, 4, 5, 6, 7]),
            ([8, 9, 10], [6, 7, 8, 9, 10]),
            ([11, 12, 13, 14], [10, 11, 12, 13, 14]),
            ([15, 16, 17, 18, 19], [15, 16, 17, 18, 19]),
            ([20, 21, 22, 23, 24, 25], [21, 22, 23, 24, 25]),
        )
        for pushed_top, expected_top in steps:
            self.assertEqual(buffer.push(with_negated_row(pushed_top)), len(pushed_top))
            npt.assert_equal(buffer.view, np.array(with_negated_row(expected_top)))
            npt.assert_equal(buffer[:, 0], [expected_top[0], -expected_top[0]])
    def initialise(self, dims, query_subseq, series_subseq):
        """
        Stores the number of subsequences and allocates the result buffers, one entry per
        (query context, series context) pair as reported by the context manager.

        :param dims: not used by this method
        :param query_subseq: number of subsequences on the query axis
        :param series_subseq: number of subsequences on the series axis
        """
        self._num_series_subseq = series_subseq
        self._num_query_subseq = query_subseq
        # The builtins int/float and np.inf replace np.int, np.float and np.Inf: those aliases
        # have been removed from NumPy (1.24 and 2.0 respectively).
        self._range = np.arange(0, max(series_subseq, query_subseq), dtype=int)

        matrix_shape = self._contexts.context_matrix_shape()
        num_query_contexts, num_series_contexts = matrix_shape

        # Distances start at infinity and match indices at -1, meaning "no match found yet".
        self._distance_matrix = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), np.inf, dtype=float),
            scaling_factor=self._rb_scale_factor)
        self._match_index_series = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), -1, dtype=int),
            scaling_factor=self._rb_scale_factor)
        self._match_index_query = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), -1, dtype=int),
            scaling_factor=self._rb_scale_factor)
    def __init__(self, series, m) -> None:
        """
        Sets up tracking of a data stream together with its moving mean and moving standard
        deviation, both computed over a window of length m. The stream has the same dimensions
        as the given series.

        :param series: Starting data of the data stream
        :param m: window size for mean and variance
        :raises RuntimeError: if m exceeds the length of the last axis of series
        """
        series_length = series.shape[-1]
        if m > series_length:
            raise RuntimeError("M should be <= series.shape[-1].")

        self._data_buffer = RingBuffer(series)
        self._m = m

        # Statistics of the starting data seed the mean and standard deviation streams.
        initial_means, initial_stds = sliding_mean_std(series, m)
        self._mean_buffer = RingBuffer(initial_means)
        self._std_buffer = RingBuffer(initial_stds)
    def __init__(self, generator, m, num_s_subseq, num_q_subseq,
                 invalid_data_function, rb_scale_factor):
        """
        Creates a new generator by wrapping another generator.

        :param generator: the generator whose results and input data will be filtered
        :param m: subsequence length; the data-level buffers hold (number of subsequences + m - 1) flags
        :param num_s_subseq: number of subsequences the series buffers are sized for
        :param num_q_subseq: number of subsequences the query buffers are sized for, or None for a
           self-join, in which case the series buffers are reused for the query
        :param invalid_data_function: optional - a function that takes in the original data (series or query) and
           subsequence length and returns a boolean array of the same size that has a True value for any invalid values.
           These values will be replaced by zeros before reaching the wrapped generator. Any distance values
           that were calculated using invalid data points will be positive infinite values.
        :param rb_scale_factor: scaling factor passed to every RingBuffer created here
        """

        self._invalid_data_function = invalid_data_function

        def new_flag_buffer(length):
            # The builtin bool replaces np.bool, an alias that was removed from NumPy (1.24).
            return RingBuffer(None,
                              shape=(length, ),
                              dtype=bool,
                              scaling_factor=rb_scale_factor)

        invalid_s_subseq_buffer = new_flag_buffer(num_s_subseq)
        self.invalid_series = new_flag_buffer(num_s_subseq + m - 1)

        if num_q_subseq is None:
            # Self-join: query and series share all buffers.
            self.self_join = True
            invalid_q_subseq_buffer = invalid_s_subseq_buffer
            num_q_subseq = num_s_subseq
            self.invalid_query = self.invalid_series
        else:
            self.self_join = False
            invalid_q_subseq_buffer = new_flag_buffer(num_q_subseq)
            self.invalid_query = new_flag_buffer(num_q_subseq + m - 1)

        super().__init__(generator, m, num_q_subseq, invalid_s_subseq_buffer,
                         invalid_q_subseq_buffer)
class ContextualMatrixProfile(AbstractStreamingConsumer):
    """
    A consumer that constructs the contextual matrix profile. The contextual matrix profile is formed by
    taking the minimum of rectangles across the full distance matrix (where the matrix profile takes the
    minimum across columns).

    This consumer supports streaming if the provided context manager does.
    """
    def __init__(self,
                 context_manager: AbstractContextManager,
                 rb_scale_factor=2.):
        """
        Creates a new consumer that calculates a contextual matrix profile,
        according to the contexts defined by the manager.

        :param context_manager: object responsible for defining the spans of each context over the query and series axis
        :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data (should be >= 1),
            this allows choosing a balance between less memory (low values) and reduced data copying (higher values)
        :raises ValueError: if rb_scale_factor is smaller than 1
        """
        if rb_scale_factor < 1.:
            raise ValueError("rb_scale_factor should be >= 1, it was: " +
                             str(rb_scale_factor))

        self._num_series_subseq = None
        self._num_query_subseq = None
        self._range = None

        self._contexts = context_manager
        # Total amount by which query/series were shifted so far; added to every stored match index.
        self._query_shift = 0
        self._series_shift = 0

        self._distance_matrix = None
        self._match_index_series = None
        self._match_index_query = None

        self._rb_scale_factor = rb_scale_factor

    def initialise(self, dims, query_subseq, series_subseq):
        """
        Stores the number of subsequences and allocates the result buffers, one entry per
        (query context, series context) pair as reported by the context manager.

        :param dims: not used by this consumer
        :param query_subseq: number of subsequences on the query axis
        :param series_subseq: number of subsequences on the series axis
        """
        self._num_series_subseq = series_subseq
        self._num_query_subseq = query_subseq
        # The builtins int/float and np.inf are used throughout this class instead of np.int,
        # np.float and np.Inf: those aliases have been removed from NumPy (1.24 and 2.0).
        self._range = np.arange(0, max(series_subseq, query_subseq), dtype=int)

        num_query_contexts, num_series_contexts = self._contexts.context_matrix_shape()

        # Distances start at infinity and match indices at -1, meaning "no match found yet".
        self._distance_matrix = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), np.inf, dtype=float),
            scaling_factor=self._rb_scale_factor)
        self._match_index_series = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), -1, dtype=int),
            scaling_factor=self._rb_scale_factor)
        self._match_index_query = RingBuffer(
            np.full((num_query_contexts, num_series_contexts), -1, dtype=int),
            scaling_factor=self._rb_scale_factor)

    def process_diagonal(self, diag, values):
        """
        Updates the contextual matrix profile with the distances of one diagonal.

        :param diag: index of the diagonal; a value at query index i lies at series index i + diag
        :param values: distances on the diagonal; only the first row (``values[0]``) is used
        """
        values = values[0]
        num_values = len(values)

        # values_idx1_start is the series index that corresponds to values[0].
        if diag >= 0:
            values_idx1_start = diag
            context0_idxs = self._contexts.query_contexts(0, num_values)
        else:
            values_idx1_start = 0
            context0_idxs = self._contexts.query_contexts(
                -diag, self._num_query_subseq)

        for c0_start, c0_end, c0_identifier in context0_idxs:
            # We now have a sub-sequence (ss) defined by the first context on the query axis
            # In absolute coordinates, start/end of this subsequence on 2nd axis (series axis)
            ss1_start = min(max(0, c0_start + diag), self._num_series_subseq)
            ss1_end = min(self._num_series_subseq,
                          min(self._num_query_subseq, c0_end) + diag)

            if ss1_start == ss1_end:
                continue

            context1_idxs = self._contexts.series_contexts(ss1_start, ss1_end)

            for c1_start, c1_end, c1_identifier in context1_idxs:
                # In absolute coordinates, start/end of the subsequence on 2nd axis defined by both contexts
                sss1_start = max(ss1_start, c1_start)
                sss1_end = min(ss1_end, c1_end)

                # Values that belong to both contexts
                sss_values = values[sss1_start - values_idx1_start:
                                    sss1_end - values_idx1_start]

                # Compare if better than current
                min_sss_value = np.min(sss_values)
                is_better = min_sss_value < self._distance_matrix[c0_identifier, c1_identifier]

                if is_better:
                    self._distance_matrix[c0_identifier, c1_identifier] = min_sss_value
                    rel_indices = np.argmin(sss_values)
                    sss0_start = sss1_start - diag
                    self._match_index_query[c0_identifier, c1_identifier] = \
                        rel_indices + sss0_start + self._query_shift
                    self._match_index_series[c0_identifier, c1_identifier] = \
                        rel_indices + sss1_start + self._series_shift

    def process_column(self, column_index, values):
        """
        Updates the contextual matrix profile with the distances of one column.

        :param column_index: index of the column on the series axis
        :param values: distances of the column; only the first row (``values[0]``) is used
        """
        values = values[0]
        context1_idxs = self._contexts.series_contexts(column_index,
                                                       column_index + 1)

        for _, _, c1_identifier in context1_idxs:
            query_contexts = self._contexts.query_contexts(
                0, self._num_query_subseq)

            for c0_start, c0_end, c0_identifier in query_contexts:
                subseq = values[c0_start:c0_end]
                best_value = np.min(subseq)

                if best_value < self._distance_matrix[c0_identifier, c1_identifier]:
                    self._distance_matrix[c0_identifier, c1_identifier] = best_value
                    self._match_index_query[c0_identifier, c1_identifier] = \
                        np.argmin(subseq) + c0_start + self._query_shift
                    self._match_index_series[c0_identifier, c1_identifier] = \
                        column_index + self._series_shift

    def shift_series(self, amount):
        """
        Registers a shift of the series and, if the context manager reports that contexts shifted
        as well, pushes that many empty columns onto the result buffers.

        :param amount: amount by which the series was shifted
        """
        context_shift = self._contexts.shift_series(amount)
        self._series_shift += amount

        if context_shift > 0:
            height = self._distance_matrix.max_shape[0]
            self._distance_matrix.push(
                np.full((height, context_shift), np.inf, dtype=float))
            self._match_index_series.push(
                np.full((height, context_shift), -1, dtype=int))
            self._match_index_query.push(
                np.full((height, context_shift), -1, dtype=int))

    def shift_query(self, amount):
        """
        Registers a shift of the query and, if the context manager reports that contexts shifted
        as well, rolls the rows of the result buffers and resets the last rows.

        :param amount: amount by which the query was shifted
        """
        context_shift = self._contexts.shift_query(amount)
        self._query_shift += amount

        if context_shift > 0:
            # Note: This could be more efficient using a 2D Ringbuffer.
            # NOTE(review): np.roll with a positive shift moves rows towards higher indices and wraps
            # the last rows around to the top, after which the last rows are reset. Confirm that this
            # is the intended direction for dropping the oldest query contexts.
            height = min(context_shift, self._distance_matrix.max_shape[0])
            self._distance_matrix.view = np.roll(self._distance_matrix.view,
                                                 context_shift,
                                                 axis=0)
            self._distance_matrix[-height:, :] = np.inf
            self._match_index_series.view = np.roll(
                self._match_index_series.view, context_shift, axis=0)
            self._match_index_series[-height:, :] = -1
            self._match_index_query.view = np.roll(
                self._match_index_query.view, context_shift, axis=0)
            self._match_index_query[-height:, :] = -1

    @property
    def match_index_query(self):
        """Query index of the best match per (query context, series context) pair, -1 if none."""
        return self._match_index_query.view

    @property
    def match_index_series(self):
        """Series index of the best match per (query context, series context) pair, -1 if none."""
        return self._match_index_series.view

    @property
    def distance_matrix(self):
        """Best distance per (query context, series context) pair, infinity if none."""
        return self._distance_matrix.view
Example #17
0
class MultidimensionalMatrixProfileLR(AbstractStreamingConsumer):
    """
    A consumer that builds the multidimensional matrix profile. This consumer takes in distance measures from
    multiple channels (dimensions) at the same time and tracks the best distance, the index of this match and
    the dimensions used in this match.
    More specifically, if the input has N data channels, this consumer will select for each number of channels
    (1, 2, ..., N), the channels containing the best match, index and dimensions. It will not track matches for
    any possible combination of channels.

    This consumer keeps track of the left and right multidimensional profile, and can be used to create the
    (normal) multidimensional profile from it. The left profile, index and dimensions
    at index i contain information about a match whose index is less than or equal to i, while the right
    profile, index and dimensions track information about a match whose index is larger than i.

    The profile is an array with shape (num_dimensions, num_distances). The value at row i, j contains the best averaged
    distances encountered at index j for any i+1 dimensions. The index is similar, but tracks the index of the query
    series that had the best match.

    The dimensions being tracked is a list of length num_dimensions. Entry i of this list contains an
    (i+1, num_distances) array that lists the indices of the dimensions that contained the best match.

    This consumer supports streaming.
    """
    def __init__(self, rb_scale_factor=2.):
        """
        Sets up an uninitialised consumer for the left and right multidimensional matrix profile,
        the matching indices and the dimensions used, over multiple data channels.

        :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data (should be >= 1),
            this allows choosing a balance between less memory (low values) and reduced data copying (higher values)
        :raises ValueError: if rb_scale_factor is smaller than 1
        """
        if rb_scale_factor < 1.:
            message = "rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor)
            raise ValueError(message)

        self._rb_scale_factor = rb_scale_factor

        # Nothing was shifted yet.
        self._query_shift = 0
        self._series_shift = 0

        # Sizes and result containers stay unset until they are assigned elsewhere.
        self._n_dim = None
        self._range = None
        self._num_subseq = None

        for side in ("left", "right"):
            setattr(self, "_md_matrix_profile_" + side, None)
            setattr(self, "_md_profile_index_" + side, None)
            setattr(self, "_md_profile_dimension_" + side, None)

    def initialise(self, dims, query_subseq, series_subseq):
        """
        Allocates the left and right profile, index and dimension buffers.

        :param dims: number of data channels; profile and index buffers get one row per channel
        :param query_subseq: not used by this method
        :param series_subseq: number of subsequences on the series axis, the length of every buffer
        """
        self._n_dim = dims
        self._num_subseq = series_subseq
        scale = self._rb_scale_factor

        # The builtins int and float replace np.int and np.float, aliases that were removed
        # from NumPy (1.24).
        self._range = RingBuffer(np.arange(0, self._num_subseq, dtype=int),
                                 scaling_factor=scale)

        def new_profile():
            # Distances start at infinity: any real distance is an improvement.
            return RingBuffer(np.full((dims, self._num_subseq), np.inf, dtype=float),
                              scaling_factor=scale)

        def new_index():
            # -1 marks "no match found yet".
            return RingBuffer(np.full((dims, self._num_subseq), -1, dtype=int),
                              scaling_factor=scale)

        def new_dimensions():
            # Entry i lists the i + 1 dimensions used by the best match over i + 1 dimensions.
            return [RingBuffer(np.full((i + 1, self._num_subseq), -1, dtype=int),
                               scaling_factor=scale) for i in range(dims)]

        self._md_matrix_profile_left = new_profile()
        self._md_profile_index_left = new_index()
        self._md_profile_dimension_left = new_dimensions()

        self._md_matrix_profile_right = new_profile()
        self._md_profile_index_right = new_index()
        self._md_profile_dimension_right = new_dimensions()

    def process_diagonal(self, diag, values):
        """
        Updates the left or the right multidimensional matrix profile with one diagonal of distances.

        For every k in 1..n_dim, the k smallest distances across the dimensions are averaged per
        position; where that average improves on the stored profile, the profile, the matching
        index and the dimensions used are replaced.

        The diagonal updates exactly one of the two profiles: the left one when
        ``diag + (series shift - query shift) >= 0``, the right one otherwise.

        :param diag: index of the diagonal; a value at query index i lies at series index i + diag
        :param values: array of shape (n_dim, num_values) with the distances per dimension
        """
        n_dim, num_values = values.shape
        shift_diff = self._series_shift - self._query_shift

        values_sort_order = np.argsort(values, axis=0)
        values_sorted = np.sort(values, axis=0)
        values_cumsum = np.zeros(num_values)

        # Left or right is decided with the shifts taken into account. A former second pass that
        # decided on diag alone was removed: it reused the already fully accumulated cumulative
        # sum, and could therefore write inflated distances into the other profile whenever
        # series and query had been shifted by different amounts.
        if diag + shift_diff >= 0:
            profile = self._md_matrix_profile_left
            profile_index = self._md_profile_index_left
            profile_dimensions = self._md_profile_dimension_left
        else:
            profile = self._md_matrix_profile_right
            profile_index = self._md_profile_index_right
            profile_dimensions = self._md_profile_dimension_right

        # Position of the diagonal inside the buffers depends on diag only.
        if diag >= 0:
            query_span = slice(None, num_values)
            series_span = slice(diag, diag + num_values)
        else:
            query_span = slice(-diag, -diag + num_values)
            series_span = slice(None, num_values)

        # One match index per value on the diagonal. This is always a slice: the right profile
        # with diag >= 0 used to read the single element self._range[num_values] instead.
        match_indices = self._range[query_span]

        for dim in range(n_dim):
            # Running sum of the (dim + 1) smallest distances at every position.
            values_cumsum += values_sorted[dim, :]
            values_mean_over_dim = values_cumsum / (dim + 1)

            self._update_matrix_profile(
                values_mean_over_dim,
                match_indices,
                values_sort_order[:dim + 1, :],
                profile[dim, series_span],
                profile_index[dim, series_span],
                profile_dimensions[dim][:, series_span])

    def _update_matrix_profile(self, new_distances, new_distance_indices,
                               new_distance_dimensions, matrix_profile,
                               matrix_profile_index, matrix_profile_dims):
        update_pos = new_distances < matrix_profile
        matrix_profile[update_pos] = new_distances[update_pos]
        matrix_profile_index[update_pos] = new_distance_indices[update_pos]
        matrix_profile_dims[:,
                            update_pos] = new_distance_dimensions[:,
                                                                  update_pos]

    def process_column(self, column_index, values):
        """
        Updates the left and right multidimensional profiles at one column of the distance matrix.

        For every k in [1..n_dim], the k smallest per-dimension distances of each row are averaged.
        The best row in [0, border) is offered to the left profile, the best row in [border, end)
        to the right profile. Stored values are only replaced by strictly smaller ones.

        :param column_index: position in the profile buffers to update
        :param values: 2D array of shape (n_dim, num_values) with the per-dimension distances
        """
        n_dim, num_values = values.shape
        shift_diff = self._series_shift - self._query_shift

        # Rows before the border go to the left profile, the remaining rows to the right profile.
        border = max(0, column_index + 1 + shift_diff)

        # Row k holds, per distance-matrix row, the sum of its (k + 1) smallest dimension distances.
        running_sums = np.cumsum(np.sort(values, axis=0), axis=0, dtype=np.float64)

        # Each entry: (rows to inspect, offset of those rows, profile, profile index, profile dimensions)
        targets = []
        if border > 0:
            targets.append((slice(None, border), 0,
                            self._md_matrix_profile_left,
                            self._md_profile_index_left,
                            self._md_profile_dimension_left))
        if num_values > border:
            # The column crosses into the lower triangle of the distance matrix.
            targets.append((slice(border, None), border,
                            self._md_matrix_profile_right,
                            self._md_profile_index_right,
                            self._md_profile_dimension_right))

        for dim in range(n_dim):
            sums = running_sums[dim]

            for rows, offset, profile, profile_index, profile_dims in targets:
                best_row = np.argmin(sums[rows]) + offset
                best_value = sums[best_row] / (dim + 1)

                # In case of shifting, a lower value could already be present
                if best_value < profile[dim, column_index]:
                    profile[dim, column_index] = best_value
                    profile_index[dim, column_index] = best_row + self._query_shift
                    profile_dims[dim][:, column_index] = \
                        np.argsort(values[:, best_row])[:dim + 1]

    def shift_query(self, amount):
        """
        Registers that the query moved forward by the given number of positions.

        The query shift counter is increased and the index range is extended with `amount`
        consecutive values continuing after its current last entry.

        :param amount: number of positions shifted, zero is a no-op
        """
        if amount != 0:
            self._query_shift += amount

            first_new_index = self._range[-1] + 1
            self._range.push(np.arange(first_new_index, first_new_index + amount))

    def shift_series(self, amount):
        """
        Registers that the series moved forward by the given number of positions.

        The series shift counter is increased and every profile buffer receives `amount` new
        columns: positive infinity for the distances, -1 for the indices and dimensions.

        :param amount: number of positions shifted, zero is a no-op
        """
        if amount == 0:
            return

        self._series_shift += amount
        block_shape = (self._n_dim, amount)

        unknown_distances = np.full(block_shape, np.inf)
        for profile in (self._md_matrix_profile_left, self._md_matrix_profile_right):
            profile.push(unknown_distances)

        # Same (floating point) dtype as the distance block, filled with the "no match" marker.
        unknown_indices = np.full(block_shape, -1.0)
        for profile_index in (self._md_profile_index_left, self._md_profile_index_right):
            profile_index.push(unknown_indices)

        # The buffer for dimension k tracks k + 1 rows.
        for dim in range(self._n_dim):
            rows = unknown_indices[:dim + 1, :]
            self._md_profile_dimension_left[dim].push(rows)
            self._md_profile_dimension_right[dim].push(rows)

    def md_matrix_profile(self):
        """
        Combines the left and right multidimensional matrix profile into a single profile by keeping,
        per entry, the left value when it is strictly smaller and the right value otherwise.

        :return: ndarray of shape (num_dimensions, num_subsequences)
        """
        left = self._md_matrix_profile_left.view
        right = self._md_matrix_profile_right.view
        return np.where(left < right, left, right)

    def md_profile_index(self):
        """
        Combines the left and right multidimensional matrix profile index into a single index by taking,
        per entry, the left index when the left profile value is strictly smaller and the right index
        otherwise.

        :return: ndarray of shape (num_dimensions, num_subsequences)
        """
        use_left = self._md_matrix_profile_left.view < self._md_matrix_profile_right.view
        left_index = self._md_profile_index_left.view
        right_index = self._md_profile_index_right.view
        return np.where(use_left, left_index, right_index)

    def md_profile_dimensions(self):
        """
        Merges the left and right dimensions, to create the dimensions for the multidimensional matrix profile.

        Per subsequence, the left dimensions are used when the left profile value is strictly smaller
        than the right one, the right dimensions otherwise.

        :return: list of length num_dimensions, where the entry at index i is an ndarray of shape
        (i+1, num_subsequences).
        """
        # No placeholder arrays are allocated up front: every entry is produced by np.where below.
        # (The former placeholders relied on the np.int alias, which no longer exists in NumPy >= 1.24.)
        profile_dimension = []

        for dim in range(self._n_dim):
            left_best = self._md_matrix_profile_left[
                dim, :] < self._md_matrix_profile_right[dim, :]
            # The 1D mask broadcasts over the (dim + 1) rows of the dimension arrays.
            profile_dimension.append(
                np.where(left_best,
                         self._md_profile_dimension_left[dim].view,
                         self._md_profile_dimension_right[dim].view))

        return profile_dimension

    @property
    def md_matrix_profile_left(self):
        """Current view of the buffer holding the left multidimensional matrix profile."""
        left_profile = self._md_matrix_profile_left
        return left_profile.view

    @property
    def md_matrix_profile_right(self):
        """Current view of the buffer holding the right multidimensional matrix profile."""
        right_profile = self._md_matrix_profile_right
        return right_profile.view

    @property
    def md_profile_index_left(self):
        """Current view of the buffer holding the left multidimensional profile index."""
        left_index = self._md_profile_index_left
        return left_index.view

    @property
    def md_profile_index_right(self):
        """Current view of the buffer holding the right multidimensional profile index."""
        right_index = self._md_profile_index_right
        return right_index.view

    @property
    def md_profile_dimension_left(self):
        """List with, per tracked buffer of left profile dimensions, its current view."""
        views = []
        for dimension_buffer in self._md_profile_dimension_left:
            views.append(dimension_buffer.view)
        return views

    @property
    def md_profile_dimension_right(self):
        """List with, per tracked buffer of right profile dimensions, its current view."""
        views = []
        for dimension_buffer in self._md_profile_dimension_right:
            views.append(dimension_buffer.view)
        return views
Example #18
0
    def initialise(self, dims, query_subseq, series_subseq):
        """
        Allocates all buffers tracked by this consumer.

        Distances start at positive infinity, indices and dimensions at -1, so any real result
        replaces them.

        :param dims: number of dimensions of the data
        :param query_subseq: number of query subsequences (not used by this method)
        :param series_subseq: number of series subsequences, the length of every profile buffer
        """
        self._n_dim = dims
        self._num_subseq = series_subseq
        scale = self._rb_scale_factor

        # The builtin int/float are used as dtypes: the np.int/np.float aliases were deprecated in
        # NumPy 1.20 and removed in NumPy 1.24. They select exactly the same dtypes.
        def filled_buffer(num_rows, fill_value, dtype):
            return RingBuffer(
                np.full((num_rows, self._num_subseq), fill_value, dtype=dtype),
                scaling_factor=scale)

        self._range = RingBuffer(np.arange(0, self._num_subseq, dtype=int),
                                 scaling_factor=scale)

        self._md_matrix_profile_left = filled_buffer(dims, np.inf, float)
        self._md_profile_index_left = filled_buffer(dims, -1, int)
        # The entry for dimension i tracks (i + 1) rows.
        self._md_profile_dimension_left = \
            [filled_buffer(i + 1, -1, int) for i in range(dims)]

        self._md_matrix_profile_right = filled_buffer(dims, np.inf, float)
        self._md_profile_index_right = filled_buffer(dims, -1, int)
        self._md_profile_dimension_right = \
            [filled_buffer(i + 1, -1, int) for i in range(dims)]
class StreamingStats(object):
    """
    Class that tracks a data stream and corresponding mean and standard deviation of a window over this data.

    The data stream has to be updated by the user, after which the mean/std stream will be updated automatically.

    This class uses RingBuffers internally, so any old view (data, mean, std) should be considered unreliable
    after new data was pushed to this class.
    """
    def __init__(self, series, m) -> None:
        """
        Creates a new instance. This instance will keep track of a data stream (with dimensions matching those of
        series) and a stream of moving mean and standard deviation using a window of length m.

        :param series: Starting data of the data stream
        :param m: window size for mean and variance
        :raises RuntimeError: if m is larger than the last dimension of series
        """
        if m > series.shape[-1]:
            raise RuntimeError("M should be <= series.shape[-1].")

        self._data_buffer = RingBuffer(series)
        self._m = m

        sliding_avg, sliding_std = sliding_mean_std(series, m)
        self._mean_buffer = RingBuffer(sliding_avg)
        self._std_buffer = RingBuffer(sliding_std)

    def append(self, data):
        """
        Appends data to the stream and pushes the mean/std of every window that contains new data.

        :param data: new data points, the last axis is the time axis; empty data is ignored
        """
        data_length = data.shape[-1]

        if data_length == 0:
            return

        self._data_buffer.push(data)

        # Only windows containing at least one new point change: those are fully covered by the
        # last (data_length + m - 1) points. Slicing further back than the start of the view simply
        # yields the entire view. (The previous slice start, max(-m - 1 - data_length, 0), always
        # evaluated to 0 so the statistics of the whole buffer were recalculated on every append.)
        num_affected = data_length + self._m - 1
        new_means, new_stds = sliding_mean_std(
            self._data_buffer.view[..., -num_affected:], self._m)
        self._mean_buffer.push(new_means)
        self._std_buffer.push(new_stds)

        # Design note: an earlier implementation updated mean and variance incrementally (Welford-style
        # rolling updates), which might still be interesting if the current approach proves to be too slow
        # in practice. It was replaced because a mid-signal constant window did not result in a variance of
        # exactly 0 (the numerical stability test gives a use case where that method fails). One approach
        # might be to explicitly check for constant signals, a starting point might be:
        # https://stackoverflow.com/questions/1066758/find-length-of-sequences-of-identical-values-in-a-numpy-array-run-length-encodi?rq=1
        # Sliding variance formula: http://jonisalonen.com/2014/efficient-and-accurate-rolling-standard-deviation/
        # First steps of derivation: http://jonisalonen.com/2013/deriving-welfords-method-for-computing-variance/

    @property
    def data(self):
        """Current view of the tracked data."""
        return self._data_buffer.view

    @property
    def mean(self):
        """Current view of the moving mean of the tracked data."""
        return self._mean_buffer.view

    @property
    def std(self):
        """Current view of the moving standard deviation of the tracked data."""
        return self._std_buffer.view
Example #20
0
class BoundStreamingEuclidean(AbstractBoundStreamingGenerator):
    """
    Streaming generator of Euclidean distances between all length-m subsequences of a series and a query.

    Distances can be calculated per diagonal or per column of the distance matrix. Consecutive column
    calculations reuse the previously calculated column.
    """

    def __init__(self, m, series, query, self_join):
        """
        :param m: subsequence length to consider for distance calculations
        :param series: ring buffer for the series data
        :param query: ring buffer for the query data (presumably the same buffer as series for a
          self-join - TODO confirm with caller)
        :param self_join: whether or not a self-join is done
        """
        self.m = m
        self.series = series
        self.query = query
        self.self_join = self_join

        self.first_row = None
        self.first_row_backlog = 0  # The number of values not yet processed for the first row cache
        self.prev_calc_column_index = None
        self.prev_calc_column_sq_dist = None

    def append_series(self, values):
        """
        Adds values to the series and updates the caches accordingly.

        :param values: 1D array of new series values, empty input is ignored
        """
        if len(values) == 0:
            return

        # The free space has to be measured BEFORE pushing: afterwards the view has already grown, which
        # made the number of dropped values too high whenever the buffer was not yet full.
        free_space = self.series.max_shape[0] - self.series.view.shape[0]
        data_dropped = self.series.push(values)
        num_dropped = len(values) - free_space
        self.first_row_backlog += len(values)

        # Dropped values shift all column indices, keep the cached column index aligned.
        if self.prev_calc_column_index is not None and num_dropped > 0:
            self.prev_calc_column_index -= num_dropped

        if self.self_join:
            # For a self-join the query changed as well, which invalidates the cached column.
            if data_dropped:
                self.first_row = None  # The first row was dropped by new data
            self.prev_calc_column_index = None

    def append_query(self, values):
        """
        Adds values to the query and invalidates the cached column.

        :param values: 1D array of new query values, empty input is ignored
        :raises RuntimeError: when called on a self-join
        """
        if self.self_join:
            raise RuntimeError("Cannot append query data in case of a self join.")

        if len(values) == 0:
            return

        if self.query.push(values):
            self.first_row = None  # The first row was dropped by new data
        self.prev_calc_column_index = None

    def calc_diagonal(self, diag):
        """
        Calculates the distances on one diagonal of the distance matrix.

        :param diag: index of the diagonal, zero being the main diagonal
        :return: 1D array of distances along the diagonal
        """
        dl = diag_length(len(self.query.view), len(self.series.view), diag)
        # Builtin float: the np.float alias was removed in NumPy 1.24.
        cumsum = np.zeros(dl + 1, dtype=float)

        if diag >= 0:
            # Eg: for diag = 2:
            # D = (y0 - x2)², (y1 - x3)², (y2 - x4)²...
            # cumsum = 0, D0, D0+D1, D0+D1+D2, ...
            cumsum[1:] = np.cumsum(np.square(self.query[:dl] - self.series[diag: diag + dl]))
        else:
            # Eg: for diag = -2:
            # D = (y2 - x0)², (y3 - x1)², (y4 - x2)²...
            # cumsum = 0, D0, D0+D1, D0+D1+D2, ...
            cumsum[1:] = np.cumsum(np.square(self.query[-diag: -diag + dl] - self.series[:dl]))

        # Difference of cumulative sums m apart = sum of squared differences over a window of length m.
        return np.sqrt(cumsum[self.m:] - cumsum[:len(cumsum) - self.m])

    def calc_column(self, column):
        """
        Calculates one column of the distance matrix: the distances between the series subsequence
        starting at the given index and every query subsequence.

        :param column: index of the series subsequence
        :return: 1D array of distances
        """
        if self.prev_calc_column_index != column - 1 or column == 0:
            # Previous column not cached or data for incremental calculation not available: full calculation
            sq_dist = _euclidean_distance_squared(self.query.view, self.series[column:column + self.m])
        else:
            # Previous column cached, reuse it
            if self.first_row is None:
                self.first_row = RingBuffer(_euclidean_distance_squared(self.series.view, self.query[0: self.m]),
                                            shape=(self.series.max_shape[0] - self.m + 1,))
                self.first_row_backlog = 0
            elif self.first_row_backlog > 0:
                # Series has been updated since last calculation of first_row
                elems_to_recalc = self.first_row_backlog + self.m - 1
                self.first_row.push(_euclidean_distance_squared(self.series[-elems_to_recalc:], self.query[0: self.m]))
                self.first_row_backlog = 0

            sq_dist = self.prev_calc_column_sq_dist  # work in same array
            # Slide every window one position: remove the point that left, add the point that entered.
            # The right hand side is fully evaluated before being written back into the shared array.
            sq_dist[1:] = (self.prev_calc_column_sq_dist[:-1]
                           - np.square(self.series[column - 1] - self.query[:len(self.query.view)-self.m])
                           + np.square(self.series[column + self.m - 1] - self.query[self.m:]))
            sq_dist[0] = self.first_row[column]

        self.prev_calc_column_sq_dist = sq_dist
        self.prev_calc_column_index = column

        return np.sqrt(sq_dist)
Example #21
0
def find_consensus_motif(series_list, m: int) -> CMResult:
    """
    Finds the top-1 consensus motif and corresponding distance for the given collection of series.
    The consensus motif is the subsequence (extracted from one of the series),
    that has a match to a subsequence from each other series within a certain distance,
    where that distance is minimal.

    This method implements the Ostinato algorithm, described in
    "Matrix Profile XV: Exploiting Time Series Consensus  Motifs to Find Structure in Time Series Sets"
    by K. Kamgar, S. Gharghabi and E. Keogh.

    :param series_list: list of 1-dimensional arrays
    :param m: length of the consensus motif
    :return: tuple containing radius, series index and subsequence index of the consensus motif
    :raises RuntimeError: if fewer than 2 series are given, if m < 3, or if any series is not
        one dimensional or is shorter than m
    """
    if len(series_list) < 2:
        raise RuntimeError("At least 2 series are required.")
    if m < 3:
        raise RuntimeError("Motif length should be >= 3.")
    for series in series_list:
        series = np.asarray(series)
        # Dimensionality is checked first: len() is undefined for 0-dimensional input.
        if series.ndim != 1:
            raise RuntimeError("One or more series are not one dimensional.")
        if len(series) < m:
            raise RuntimeError(
                "One or more series are shorter than the desired motif length."
            )

    best_result = CMResult(np.inf, -1, -1)
    num_series = len(series_list)

    # Create a distance calculator for each series pair, but reuse mu/std calculations per series.
    # Step 1: mu/std calculation
    cached_generators = {}
    mus = []
    stds = []
    stdsz = []
    for series in series_list:
        mu, std = sliding_mean_std(series, m)
        mus.append(mu)
        stds.append(std)
        stdsz.append(std != 0.)

    # Step 2: create the distance calculator
    for i, series1 in enumerate(series_list):
        for j, series2 in enumerate(series_list):
            if i == j:
                continue
            gen = BoundZNormEuclidean(m, RingBuffer(series1,
                                                    scaling_factor=1.),
                                      RingBuffer(series2, scaling_factor=1.),
                                      False, 0., mus[i], stds[i], stdsz[i],
                                      mus[j], stds[j], stdsz[j])
            cached_generators[i, j] = gen

    # Look for the consensus motif: iterator over all series
    for series_idx in range(num_series):
        next_series_idx = (series_idx + 1) % num_series
        active_series = series_list[series_idx]

        # Calculate a full matrix profile between the series and the next series
        dist_calc = cached_generators[(series_idx, next_series_idx)]
        num_subseq = len(active_series) - m + 1
        # Builtin float: the np.float alias was removed in NumPy 1.24.
        mp = np.empty(num_subseq, dtype=float)
        for col in range(num_subseq):
            mp[col] = np.min(dist_calc.calc_column(col))

        # Order the subsequences of the series from lowest to highest distances (as given by the Matrix Profile)
        candidates = np.argsort(mp)

        # Iterate over all candidate subsequences, starting from those that had the best match to next_series.
        for subseq_idx in candidates:
            candidate_radius = mp[subseq_idx]
            aborted = False

            # Abort if the distance (to next_series) is worse than best result so far.
            # Candidates are sorted, so no later candidate of this series can do better.
            if candidate_radius >= best_result.radius:
                break

            # Check distance of the candidate subsequence to all other series.
            for other_series_idx in range(num_series):
                # Skip the current and next_series, as we already considered those.
                if other_series_idx in [series_idx, next_series_idx]:
                    continue

                # Calculates the distance from the candidate subsequence to all subsequences in other_series.
                other_gen = cached_generators[(series_idx, other_series_idx)]
                distances = other_gen.calc_column(subseq_idx)
                min_distance = np.min(distances)
                candidate_radius = max(candidate_radius, min_distance)

                # Abort search if distance is greater than best so far.
                if candidate_radius >= best_result.radius:
                    aborted = True
                    break

            # Store the current candidate as best result so far.
            if not aborted and candidate_radius < best_result.radius:
                best_result = CMResult(candidate_radius, series_idx,
                                       subseq_idx)

    return best_result
class ShiftingMatrixProfileLR(MatrixProfileLR, AbstractStreamingConsumer):
    """
    Streaming-capable extension of MatrixProfileLR.

    The profile indices kept by this consumer are positions in the complete query series, not in the
    current window. For example: if the original query held 10 sequences and has since shifted by 5,
    a profile index can be any value in [0..15], or -1 when no matrix profile value exists.
    Subtracting query_shift converts such an index to one local to the current window, but note that
    indices in the left matrix profile may point to positions that already left the window.

    Left and right matrix profiles are mainly meaningful when query and series shift together
    (distances calculated over a self-join). When that is not the case the values remain correct:
    the left matrix profile stores values on or above the (unshifted) main diagonal, the right matrix
    profile stores values below it. (As the diagonal shifts away when only the series shifts,
    eventually only the left matrix profile will be used.)
    """

    def __init__(self, rb_scale_factor=2.):
        """
        Creates a new instance.

        :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data (should be >= 1),
            this allows choosing a balance between less memory (low values) and reduced data copying (higher values)
        :raises ValueError: if rb_scale_factor is below 1
        """
        if rb_scale_factor < 1.:
            raise ValueError("rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor))

        super().__init__()
        self.series_shift = 0
        self.query_shift = 0
        self._rb_scale_factor = rb_scale_factor

    def initialise(self, dims, query_subseq, series_subseq):
        """
        Lets the parent class allocate its state, then wraps each tracked array in a RingBuffer.

        :param dims: forwarded to the parent class
        :param query_subseq: forwarded to the parent class
        :param series_subseq: forwarded to the parent class
        """
        super().initialise(dims, query_subseq, series_subseq)

        tracked = ("_range",
                   "_matrix_profile_left", "_profile_index_left",
                   "_matrix_profile_right", "_profile_index_right")
        for attribute in tracked:
            wrapped = RingBuffer(getattr(self, attribute), scaling_factor=self._rb_scale_factor)
            setattr(self, attribute, wrapped)

    def process_diagonal(self, diag, values):
        """
        Offers the distances of one diagonal to the left or right profile, depending on which side of
        the (unshifted) main diagonal it lies.

        :param diag: index of the diagonal
        :param values: 2D array of distances, only the first row is used
        """
        values = values[0]
        num_values = len(values)
        shift_diff = self.series_shift - self.query_shift

        # Pick the profile: left for diagonals on/above the unshifted main diagonal, right otherwise.
        if diag + shift_diff >= 0:
            profile, profile_index = self._matrix_profile_left, self._profile_index_left
        else:
            profile, profile_index = self._matrix_profile_right, self._profile_index_right

        # Pick the rows (query positions) and columns (profile positions) covered by the diagonal.
        if diag >= 0:
            rows = slice(None, num_values)
            columns = slice(diag, diag + num_values)
        else:
            rows = slice(-diag, -diag + num_values)
            columns = slice(None, num_values)

        self._update_matrix_profile(values, self._range[rows], profile[columns], profile_index[columns])

    def process_column(self, column_index, values):
        """
        Offers the distances of one column to the profiles: rows before the border go to the left
        profile, the remaining rows to the right profile.

        :param column_index: position in the profiles to update
        :param values: 2D array of distances, only the first row is used
        """
        values = values[0]
        shift_diff = self.series_shift - self.query_shift
        border = max(0, column_index + 1 + shift_diff)

        if border > 0:
            upper = values[:border]
            upper_min = np.min(upper)

            # In case of shifting, a lower value could already be present
            if upper_min < self._matrix_profile_left[column_index]:
                self._matrix_profile_left[column_index] = upper_min
                self._profile_index_left[column_index] = np.argmin(upper) + self.query_shift

        if len(values) > border:
            lower = values[border:]
            lower_min = np.min(lower)

            # In case of shifting, a lower value could already be present
            if lower_min < self._matrix_profile_right[column_index]:
                self._matrix_profile_right[column_index] = lower_min
                self._profile_index_right[column_index] = np.argmin(lower) + border + self.query_shift

    def shift_query(self, amount):
        """
        Registers a shift of the query: the index range is extended with `amount` consecutive values.

        :param amount: number of positions shifted, zero is a no-op
        """
        if amount != 0:
            self.query_shift += amount

            first_new_index = self._range[-1] + 1
            self._range.push(np.arange(first_new_index, first_new_index + amount))

    def shift_series(self, amount):
        """
        Registers a shift of the series: the profiles receive `amount` new entries, positive infinity
        for the distances and -1 for the indices.

        :param amount: number of positions shifted, zero is a no-op
        """
        if amount == 0:
            return

        self.series_shift += amount

        unknown_distances = np.full(amount, np.inf)
        for profile in (self._matrix_profile_left, self._matrix_profile_right):
            profile.push(unknown_distances)

        # Same (floating point) dtype as the distance block, filled with the "no match" marker.
        unknown_indices = np.full(amount, -1.0)
        for profile_index in (self._profile_index_left, self._profile_index_right):
            profile_index.push(unknown_indices)

    @property
    def matrix_profile_left(self):
        """Current view of the left matrix profile."""
        return self._matrix_profile_left.view

    @property
    def matrix_profile_right(self):
        """Current view of the right matrix profile."""
        return self._matrix_profile_right.view

    @property
    def profile_index_left(self):
        """Current view of the left matrix profile index."""
        return self._profile_index_left.view

    @property
    def profile_index_right(self):
        """Current view of the right matrix profile index."""
        return self._profile_index_right.view
    def __init__(self, m, series, query, self_join, noise_std,
                 rb_scale_factor):
        """
        :param m: subsequence length to consider for distance calculations
        :param series: empty ringbuffer, properly sized to contain the desired window for series
        :param query: empty ringbuffer, properly sized to contain the desired window for query, or the same buffer
          as series in case of a self-join
        :param self_join: whether or not a self-join should be done
        :param noise_std: standard deviation of noise on series/query, zero to disable noise cancellation
        :param rb_scale_factor: scaling factor used for internal RingBuffers, for speed/memory tradeoff
        """

        def stats_buffer(num_subseq):
            # One entry per subsequence. The builtin float replaces the np.float alias, which was
            # removed in NumPy 1.24; both select the same dtype.
            return RingBuffer(None,
                              shape=(num_subseq, ),
                              dtype=float,
                              scaling_factor=rb_scale_factor)

        # Core values
        self.m = m
        self.series = series
        self.query = query
        self.noise_std = noise_std
        self.self_join = self_join

        # Derived values
        num_subseq_s = series.max_shape[-1] - m + 1
        self.mu_s = stats_buffer(num_subseq_s)
        self.std_s = stats_buffer(num_subseq_s)
        # NOTE(review): allocated with a float dtype, as before, although the name suggests a boolean mask - confirm.
        self.std_s_nonzero = stats_buffer(num_subseq_s)

        if not self_join:
            num_subseq_q = query.max_shape[-1] - m + 1
            self.mu_q = stats_buffer(num_subseq_q)
            self.std_q = stats_buffer(num_subseq_q)
            self.std_q_nonzero = stats_buffer(num_subseq_q)
        else:
            # Self-join: the query statistics are the series statistics, share the buffers.
            self.mu_q = self.mu_s
            self.std_q = self.std_s
            self.std_q_nonzero = self.std_s_nonzero

        # Caching
        self.first_row = None
        self.first_row_backlog = 0
        self.prev_calc_column_index = None
        self.prev_calc_column_dot_prod = None
class BoundStreamingFilterGenerator(BoundFilterGenerator,
                                    AbstractBoundStreamingGenerator):
    """
    Wrapper around other generators that will replace values in the distance matrix marked as invalid
    by positive infinity. It can also perform a data pre-processing step before data reaches the wrapped generator,
    by setting values marked as invalid to zero, this can be useful for example to remove nan values for a generator
    that does not support nan values.
    """
    def __init__(self, generator, m, num_s_subseq, num_q_subseq,
                 invalid_data_function, rb_scale_factor):
        """
        Creates a new generator by wrapping another generator.

        :param generator: the generator whose results and input data will be filtered
        :param m: subsequence length
        :param num_s_subseq: number of series subsequences to track
        :param num_q_subseq: number of query subsequences to track, or None for a self-join
        :param invalid_data_function: optional - a function that takes in the original data (series or query) and
           subsequence length and returns a boolean array of the same size that has a True value for any invalid values.
           These values will be replaced by zeros before reaching the wrapped generator. Any distance values
           that were calculated using invalid data points will be positive infinite values.
        :param rb_scale_factor: scaling factor used for the internal RingBuffers
        """

        self._invalid_data_function = invalid_data_function

        def bool_buffer(length):
            # The builtin bool replaces the np.bool alias, which was removed in NumPy 1.24.
            return RingBuffer(None,
                              shape=(length, ),
                              dtype=bool,
                              scaling_factor=rb_scale_factor)

        # Per subsequence: does it contain an invalid point?
        invalid_s_subseq_buffer = bool_buffer(num_s_subseq)
        # Per data point: is it invalid? (num_subseq + m - 1 data points make up num_subseq subsequences)
        self.invalid_series = bool_buffer(num_s_subseq + m - 1)

        if num_q_subseq is None:
            # Self-join: query and series share all bookkeeping.
            self.self_join = True
            invalid_q_subseq_buffer = invalid_s_subseq_buffer
            num_q_subseq = num_s_subseq
            self.invalid_query = self.invalid_series
        else:
            self.self_join = False

            invalid_q_subseq_buffer = bool_buffer(num_q_subseq)
            self.invalid_query = bool_buffer(num_q_subseq + m - 1)

        super().__init__(generator, m, num_q_subseq, invalid_s_subseq_buffer,
                         invalid_q_subseq_buffer)

    def append_series(self, values):
        """
        Appends series data: invalid points are recorded and zeroed before the data is passed on
        to the wrapped generator.

        :param values: new series values, empty input is ignored
        """
        if len(values) == 0:
            # Nothing to record. Continuing would also build a window slice from m - 1 points
            # (or, for m == 1, from the entire buffer), which is not a valid update.
            return

        invalid_points = _apply_data_validation(values, self.m,
                                                self._invalid_data_function)
        self.invalid_series.push(invalid_points)

        if np.any(invalid_points):
            # Copy, so the data of the caller is left untouched.
            values = values.copy()
            values[invalid_points] = 0

        if len(self.invalid_series.view) >= self.m:
            # Only subsequences containing a new point need (re)evaluation.
            rel_values = self.invalid_series[-(len(values) + self.m - 1):]
            self.invalid_series_subseq.push(
                np.any(sliding_window_view(rel_values, (self.m, )), axis=-1))

        self.generator.append_series(values)

    def append_query(self, values):
        """
        Appends query data: invalid points are recorded and zeroed before the data is passed on
        to the wrapped generator.

        :param values: new query values, empty input is ignored
        :raises RuntimeError: when called on a self-join
        """
        if self.self_join:
            raise RuntimeError("Cannot append to query for a self-join.")

        if len(values) == 0:
            # See append_series: an empty update would produce an invalid window slice.
            return

        invalid_points = _apply_data_validation(values, self.m,
                                                self._invalid_data_function)
        self.invalid_query.push(invalid_points)

        if np.any(invalid_points):
            # Copy, so the data of the caller is left untouched.
            values = values.copy()
            values[invalid_points] = 0

        if len(self.invalid_query.view) >= self.m:
            # Only subsequences containing a new point need (re)evaluation.
            rel_values = self.invalid_query[-(len(values) + self.m - 1):]
            self.invalid_query_subseq.push(
                np.any(sliding_window_view(rel_values, (self.m, )), axis=-1))

        self.generator.append_query(values)

    def calc_column(self, column):
        """
        Calculates one column of the distance matrix, with positive infinity for every distance
        that involves an invalid subsequence.

        :param column: index of the series subsequence
        :return: 1D array of distances
        """
        # np.inf replaces the np.Inf alias, which was removed in NumPy 2.0.
        if self.invalid_series_subseq[column]:
            # The series subsequence itself is invalid: the wrapped generator is not consulted.
            return np.full(len(self.invalid_query_subseq.view), np.inf)

        distances = self.generator.calc_column(column)
        distances[self.invalid_query_subseq.view] = np.inf

        return distances
Example #25
0
class BoundZNormEuclidean(AbstractBoundStreamingGenerator):
    """
    Z-normalised Euclidean distance generator bound to a series buffer and a query buffer.

    Data is added through append_series/append_query, which also push the sliding mean and standard
    deviation of the newly completed subsequences to the statistics buffers. Distances can be requested
    per diagonal, per column or for a single cell of the distance matrix.
    """

    def __init__(self, m, series, query, self_join, noise_std, series_mu, series_std, series_std_nz,
                 query_mu, query_std, query_std_nz):
        """
        :param m: subsequence length to consider for distance calculations
        :param series: empty ringbuffer, properly sized to contain the desired window for series
        :param query: empty ringbuffer, properly sized to contain the desired window for query, or the same buffer
          as series in case of a self-join
        :param self_join: whether or not a self-join should be done
        :param noise_std: standard deviation of noise on series/query, zero to disable noise cancellation
        :param series_mu: buffer that receives the sliding mean of the series subsequences
        :param series_std: buffer that receives the sliding standard deviation of the series subsequences
        :param series_std_nz: buffer that receives, per series subsequence, whether its standard deviation
          differs from zero
        :param query_mu: buffer that receives the sliding mean of the query subsequences
        :param query_std: buffer that receives the sliding standard deviation of the query subsequences
        :param query_std_nz: buffer that receives, per query subsequence, whether its standard deviation
          differs from zero
        """

        # Core values
        self.m = m
        self.series = series
        self.query = query
        self.noise_std = noise_std
        self.self_join = self_join

        # Derived values
        self.mu_s = series_mu
        self.std_s = series_std
        self.std_s_nonzero = series_std_nz

        self.mu_q = query_mu
        self.std_q = query_std
        self.std_q_nonzero = query_std_nz

        # Caching
        self.first_row = None  # Dot products of the first query subsequence with all series subsequences
        self.first_row_backlog = 0  # Number of series points appended since first_row was last updated
        self.prev_calc_column_index = None  # Column handled by the most recent calc_column call
        self.prev_calc_column_dot_prod = None  # Dot products calculated by that call

    def append_series(self, values):
        """
        Appends data points to the series and updates the statistics and caches that depend on it.

        :param values: 1D array of new data points, an empty array is ignored
        """
        if len(values) == 0:
            return

        # The free capacity has to be measured before pushing: afterwards the view already contains
        # the new values, which would make non-dropping appends look like they dropped data.
        free_slots = self.series.max_shape[0] - self.series.view.shape[0]
        num_dropped = max(0, len(values) - free_slots)

        data_dropped = self.series.push(values)
        self.first_row_backlog += len(values)

        if len(self.series.view) >= self.m:
            # The new points complete (at most) len(values) subsequences, spanning this many points.
            num_affected = len(values) + self.m - 1
            new_mu, new_std = sliding_mean_std(self.series[-num_affected:], self.m)
            self.mu_s.push(new_mu)
            self.std_s.push(new_std)
            self.std_s_nonzero.push(new_std != 0.)

        if self.prev_calc_column_index is not None and num_dropped > 0:
            # Column indices shift along with the data that left the buffer.
            self.prev_calc_column_index -= num_dropped

        if self.self_join:
            if data_dropped:
                self.first_row = None  # The first row was dropped by new data
            self.prev_calc_column_index = None

    def append_query(self, values):
        """
        Appends data points to the query and updates the statistics and caches that depend on it.

        :param values: 1D array of new data points, an empty array is ignored
        :raises RuntimeError: in case of a self-join, where the query is the series
        """
        if self.self_join:
            raise RuntimeError("Cannot append query data in case of a self join.")

        if len(values) == 0:
            return

        if self.query.push(values):
            self.first_row = None  # The first row was dropped by new data
        # The cached column no longer matches the new set of query subsequences.
        self.prev_calc_column_index = None

        if len(self.query.view) >= self.m:
            num_affected = len(values) + self.m - 1
            new_mu, new_std = sliding_mean_std(self.query[-num_affected:], self.m)
            self.mu_q.push(new_mu)
            self.std_q.push(new_std)
            self.std_q_nonzero.push(new_std != 0.)

    def calc_diagonal(self, diag):
        """
        Calculates all distances on one diagonal of the distance matrix.

        :param diag: index of the diagonal; zero or positive values start in the first row (query index 0),
          negative values start in the first column (series index 0)
        :return: 1D array with the distances on the diagonal
        """
        dl = diag_length(len(self.query.view), len(self.series.view), diag)  # Number of affected data points
        dlr = dl - self.m + 1  # Number of entries in diagonal
        cumsum = np.zeros(dl + 1, dtype=np.float64)

        if diag >= 0:
            # Eg: for diag = 2:
            # D = (y0 * x2), (y1 * x3), (y2 * x4)...
            # cumsum = 0, D0, D0+D1, D0+D1+D2, ...
            cumsum[1:] = np.cumsum(self.query[:dl] * self.series[diag: diag + dl])
            q_range = slice(0, dlr)
            s_range = slice(diag, diag + dlr)
        else:
            # Eg: for diag = -2:
            # D = (y2 * x0), (y3 * x1), (y4 * x2)...
            # cumsum = 0, D0, D0+D1, D0+D1+D2, ...
            cumsum[1:] = np.cumsum(self.query[-diag: -diag + dl] * self.series[:dl])
            s_range = slice(0, dlr)
            q_range = slice(-diag, -diag + dlr)

        mean_q = self.mu_q[q_range]
        mean_s = self.mu_s[s_range]
        std_q = self.std_q[q_range]
        std_s = self.std_s[s_range]

        # Dot product of each pair of subsequences, as the difference of two cumulative sums
        dot_prod = cumsum[self.m:] - cumsum[:dlr]

        dist_sq = np.zeros(dlr, dtype=np.float64)
        non_zero_std_q = self.std_q_nonzero[q_range]
        non_zero_std_s = self.std_s_nonzero[s_range]

        # For subsequences where both signals are stable (std = 0), we define the distance as zero.
        # This is covered by the initialization of the dist array.
        # For subsequences where exactly one signal is stable, the distance is sqrt(m) by definition.
        dist_sq[np.logical_xor(non_zero_std_q, non_zero_std_s)] = self.m

        # Formula for regular (non-stable) subsequences
        mask = np.logical_and(non_zero_std_q, non_zero_std_s)
        dist_sq[mask] = (2 * (self.m - (dot_prod[mask] - self.m * mean_q[mask] * mean_s[mask]) /
                              (std_q[mask] * std_s[mask])))

        # Noise correction - See paper "Eliminating noise in the matrix profile"
        if self.noise_std != 0.:
            mask = np.logical_or(non_zero_std_q, non_zero_std_s)
            dist_sq[mask] -= (2 * (self.m + 1) * np.square(self.noise_std) /
                              np.square(np.maximum(std_s[mask], std_q[mask])))

        # Before the noise correction, small negative values are possible due to rounding.
        # After the noise, larger negative values are also possible.
        # Correct all negative values to zero.
        dist_sq[dist_sq < _EPS] = 0

        return np.sqrt(dist_sq)

    def calc_column(self, column):
        """
        Calculates one column of the distance matrix: the distance between every query subsequence
        and the series subsequence starting at the given index.

        When the previous call handled the column right before this one, its dot products are reused
        instead of being recalculated from scratch.

        :param column: index of the subsequence in the data series
        :return: 1D array with one distance per query subsequence
        """
        dist_sq = np.zeros(len(self.query.view) - self.m + 1, dtype=np.float64)
        series_subseq = self.series[column: column + self.m]

        if self.prev_calc_column_index != column - 1 or column == 0:
            # Previous column not cached or data for incremental calculation not available: full calculation
            dot_prod = fftconvolve(self.query.view, series_subseq[::-1], 'valid')
        else:
            # Previous column cached, reuse it
            if self.first_row is None:
                first_query = self.query[0:self.m]
                self.first_row = RingBuffer(fftconvolve(self.series.view, first_query[::-1], 'valid'),
                                            shape=(self.series.max_shape[0] - self.m + 1,))
                self.first_row_backlog = 0
            elif self.first_row_backlog > 0:
                # Series has been updated since last calculation of first_row
                elems_to_recalc = self.first_row_backlog + self.m - 1
                first_query = self.query[0:self.m]
                self.first_row.push(fftconvolve(self.series[-elems_to_recalc:], first_query[::-1], 'valid'))
                self.first_row_backlog = 0

            dot_prod = self.prev_calc_column_dot_prod  # work in same array
            # Each entry follows from its upper-left neighbour: drop the product of the points leaving
            # both subsequences and add the product of the points entering them.
            dot_prod[1:] = (self.prev_calc_column_dot_prod[:-1]
                            - self.series[column - 1] * self.query[:len(self.query.view) - self.m]
                            + self.series[column + self.m - 1] * self.query[self.m:])
            # The first entry has no upper-left neighbour, it is taken from the cached first row.
            dot_prod[0] = self.first_row[column]

        self.prev_calc_column_dot_prod = dot_prod
        self.prev_calc_column_index = column

        if self.std_s[column] != 0:
            q_valid = self.std_q.view != 0

            # Series subsequence is not stable, if query subsequence is stable, the distance is sqrt(m) by definition.
            dist_sq[~q_valid] = self.m

            dist_sq[q_valid] = 2 * (self.m - (dot_prod[q_valid] - self.m * self.mu_q[q_valid] * self.mu_s[column]) /
                                    (self.std_q[q_valid] * self.std_s[column]))
        else:
            # Series subsequence is stable, results are either sqrt(m) or 0, depending on whether or not
            # query subsequences are stable as well.

            dist_sq[self.std_q.view != 0] = self.m
            # Entries with a stable query subsequence stay zero, as set by the array initialization.

        # Noise correction - See paper "Eliminating noise in the matrix profile"
        if self.noise_std != 0.:
            if self.std_s[column] != 0:
                mask = slice(None)
            else:
                # Compare the underlying view (as for the masks above) so the mask is a plain boolean array.
                mask = self.std_q.view != 0

            dist_sq[mask] -= (2 * (self.m + 1) * np.square(self.noise_std) /
                              np.square(np.maximum(self.std_s[column], self.std_q[mask])))

        # Before the noise correction, small negative values are possible due to rounding.
        # After the noise, larger negative values are also possible.
        # Correct all negative values to zero.
        dist_sq[dist_sq < _EPS] = 0

        return np.sqrt(dist_sq)

    def calc_single(self, row, column, dot_prod=None):
        """
        Calculates a single point of the distance matrix.

        The result matches the corresponding entry of calc_diagonal and calc_column: zero when both
        subsequences are stable (std = 0), and based on a squared distance of m when exactly one is.

        :param row: index of the subsequence in the query series
        :param column: index of the subsequence in the data series
        :param dot_prod: the dotproduct of the subsequences, if provided, this method can run in constant time
        :return: z-normalised distance of the 2 subsequences
        """
        std_q = self.std_q[row]
        std_s = self.std_s[column]

        if std_q == 0. and std_s == 0.:
            return 0.

        if std_q == 0. or std_s == 0.:
            # Exactly one stable subsequence: the distance is sqrt(m) by definition,
            # so the squared distance is m. The noise correction below still applies.
            dist_sq = self.m
        else:
            # Test against None: a provided dot product of zero is a valid value.
            if dot_prod is None:
                dot_prod = np.sum(self.query[row: row+self.m] * self.series[column: column+self.m])
            mean_q = self.mu_q[row]
            mean_s = self.mu_s[column]

            dist_sq = 2 * (self.m - (dot_prod - self.m * mean_q * mean_s) / (std_q * std_s))

        # Noise correction - See paper "Eliminating noise in the matrix profile"
        if self.noise_std != 0.:
            dist_sq -= (2 * (self.m + 1) * np.square(self.noise_std) / np.square(np.maximum(std_s, std_q)))

        # Rounding and the noise correction can result in negative values, which are corrected to zero.
        if dist_sq < _EPS:
            return 0.

        return np.sqrt(dist_sq)
Example #26
0
    def calc_column(self, column):
        """
        Calculates one column of the distance matrix: the z-normalised distance between every query
        subsequence and the series subsequence starting at the given index.

        When the previous call handled the column right before this one, its dot products are reused
        instead of being recalculated from scratch.

        :param column: index of the subsequence in the data series
        :return: 1D array with one distance per query subsequence
        """
        # np.float64 replaces the np.float alias, which was removed from NumPy.
        dist_sq = np.zeros(len(self.query.view) - self.m + 1, dtype=np.float64)
        series_subseq = self.series[column: column + self.m]

        if self.prev_calc_column_index != column - 1 or column == 0:
            # Previous column not cached or data for incremental calculation not available: full calculation
            dot_prod = fftconvolve(self.query.view, series_subseq[::-1], 'valid')
        else:
            # Previous column cached, reuse it
            if self.first_row is None:
                first_query = self.query[0:self.m]
                self.first_row = RingBuffer(fftconvolve(self.series.view, first_query[::-1], 'valid'),
                                            shape=(self.series.max_shape[0] - self.m + 1,))
                self.first_row_backlog = 0
            elif self.first_row_backlog > 0:
                # Series has been updated since last calculation of first_row
                elems_to_recalc = self.first_row_backlog + self.m - 1
                first_query = self.query[0:self.m]
                self.first_row.push(fftconvolve(self.series[-elems_to_recalc:], first_query[::-1], 'valid'))
                self.first_row_backlog = 0

            dot_prod = self.prev_calc_column_dot_prod  # work in same array
            # Each entry follows from its upper-left neighbour: drop the product of the points leaving
            # both subsequences and add the product of the points entering them.
            dot_prod[1:] = (self.prev_calc_column_dot_prod[:-1]
                            - self.series[column - 1] * self.query[:len(self.query.view) - self.m]
                            + self.series[column + self.m - 1] * self.query[self.m:])
            # The first entry has no upper-left neighbour, it is taken from the cached first row.
            dot_prod[0] = self.first_row[column]

        self.prev_calc_column_dot_prod = dot_prod
        self.prev_calc_column_index = column

        if self.std_s[column] != 0:
            q_valid = self.std_q.view != 0

            # Series subsequence is not stable, if query subsequence is stable, the distance is sqrt(m) by definition.
            dist_sq[~q_valid] = self.m

            dist_sq[q_valid] = 2 * (self.m - (dot_prod[q_valid] - self.m * self.mu_q[q_valid] * self.mu_s[column]) /
                                    (self.std_q[q_valid] * self.std_s[column]))
        else:
            # Series subsequence is stable, results are either sqrt(m) or 0, depending on whether or not
            # query subsequences are stable as well.

            dist_sq[self.std_q.view != 0] = self.m
            # Entries with a stable query subsequence stay zero, as set by the array initialization.

        # Noise correction - See paper "Eliminating noise in the matrix profile"
        if self.noise_std != 0.:
            if self.std_s[column] != 0:
                mask = slice(None)
            else:
                # Compare the underlying view (as for the masks above) so the mask is a plain boolean array.
                mask = self.std_q.view != 0

            dist_sq[mask] -= (2 * (self.m + 1) * np.square(self.noise_std) /
                              np.square(np.maximum(self.std_s[column], self.std_q[mask])))

        # Before the noise correction, small negative values are possible due to rounding.
        # After the noise, larger negative values are also possible.
        # Correct all negative values to zero.
        dist_sq[dist_sq < _EPS] = 0

        return np.sqrt(dist_sq)