def testClusterAndFindSplit_N_Pattern(self):
        # In this test case we're ensuring that permutation testing is finding the
        # local minima for a sub-segment. We're introducing randomness here but
        # seeding well-known inflection points to make it clear that we're able to
        # see those inflection points.
        sequence = (
            # First we have a sequence of numbers in [100..200] with mode = 150.
            [random.triangular(100, 200, 150) for _ in range(49)]

            # Then we see our first inflection point.
            + [300]

            # Then we have a sequence of numbers in [300..350] with mode = 325.
            + [random.triangular(300, 350, 325) for _ in range(49)]

            # Then we see our next inflection point.
            + [400]

            # Then finally we have a sequence of numbers in [400..500] with mode =
            # 450.
            + [random.triangular(400, 500, 450) for _ in range(100)])
        splits = ccd.ClusterAndFindSplit(sequence, self.rand)
        logging.debug('Splits = %s', splits)

        # Instead of asserting that we have specific indices, we're testing that the
        # splits found are within certain ranges, since the inputs are randomized.
        self.assertTrue(any(50 <= c < 100 for c in splits))
 def testClusterAndFindSplit_SpikeAndLevelChange(self):
     # A short spike, the drop back down, and a later level shift should all
     # be reported as change points, each with its plausible index range.
     data = [1] * 50 + [1000] * 10 + [1] * 50 + [500] * 50
     found = ccd.ClusterAndFindSplit(data, self.rand)
     logging.debug('Splits = %s', found)
     expected = [(50, (48, 50)), (60, (60, 60)), (110, (108, 113))]
     self.assertEqual(expected, found)
 def testClusterAndFindSplit_Slope(self):
     # A single intermediate point (800) on the way up to a new level (1000):
     # the step at index 15 should be found along with a plausible range that
     # covers the sloped transition.
     data = [1] * 15 + [800] + [1000] * 20
     found = ccd.ClusterAndFindSplit(data, self.rand)
     self.assertIn((15, (15, 16)), found)
 def testClusterAndFindSplit_N_Pattern(self):
     # In this test case we're ensuring that permutation testing is finding the
     # local minima for a sub-segment.
     #
     # Fix: `range(...) + range(...)` only works on Python 2, where range()
     # returns a list. On Python 3 range() is a lazy sequence and `+` raises
     # TypeError, so wrap each range in list() — equivalent on both versions.
     sequence = list(range(100, 200)) + list(range(200, 100, -1)) + [300] * 10
     splits = ccd.ClusterAndFindSplit(sequence, self.rand)
     logging.debug('Splits = %s', splits)
     self.assertIn(200, splits)
     self.assertTrue(any(c < 200 for c in splits))
 def testClusterAndFindSplit_Spikes(self):
     # An isolated three-point spike is not a significant enough distribution
     # change to pass permutation testing, so the detector must refuse to
     # produce a split and raise InsufficientData instead.
     data = [1] * 15 + [500, 1000, 500] + [1] * 15
     with self.assertRaises(ccd.InsufficientData):
         ccd.ClusterAndFindSplit(data, self.rand)
 def testClusterAndFindSplit_InifiniteLooper(self):
     # Regression test: there is a clear partition point at offset 240, but
     # permutation testing of the segment [0:240] will find more plausible
     # points. The important part is that this does not cause an infinite
     # loop, and both change points are still reported.
     data = [100] * 119 + [150] + [100] * 120 + [200] * 2
     found = ccd.ClusterAndFindSplit(data, self.rand)
     logging.debug('Splits = %s', found)
     for index, value in ((240, 200), (119, 150)):
         self.assertIn(index, found)
         self.assertEqual(data[index], value)
# Example #7 (score: 0)
    def testClusterAndFindSplit_Windowing(self):
        # Slide a fixed-size window across a step-up/step-down sequence and
        # confirm that, across all windows, exactly the two step boundaries
        # are recovered.
        full_series = ([1] * 100) + ([10] * 100) + ([1] * 100)

        def Windows(seq, size, stride):
            # Yield consecutive slices of |size| elements, advancing by
            # |stride| each time, and stop once a full window no longer fits.
            for start in itertools.count(0, stride):
                if start + size > len(seq):
                    return
                yield seq[start:start + size]

        found = set()
        for offset, window in enumerate(Windows(full_series, 50, 10)):
            try:
                # Translate the window-local split back to a global index.
                found.add((offset * 10) +
                          ccd.ClusterAndFindSplit(window, 6, self.rand))
            except ccd.InsufficientData:
                # Windows that lie entirely within one level have no split.
                continue
        self.assertEqual(found, {99, 199})
# Example #8 (score: 0)
 def testClusterAndFindSplit_Steps(self):
     # The first step boundary in an up-then-down sequence is located
     # accurately at index 9 (the last element of the first level).
     data = [1] * 10 + [2] * 10 + [1] * 10
     self.assertEqual(ccd.ClusterAndFindSplit(data, 6, self.rand), 9)
# Example #9 (score: 0)
 def testClusterAndFindSplit_Simple(self):
     # A single clean level change in a contrived series is found at the
     # boundary (index 9, the last element before the jump).
     data = [1] * 10 + [10] * 10
     self.assertEqual(ccd.ClusterAndFindSplit(data, 6, self.rand), 9)
def FindChangePoints(series,
                     max_window_size=_MAX_WINDOW_SIZE,
                     min_segment_size=MIN_SEGMENT_SIZE,
                     min_absolute_change=_MIN_ABSOLUTE_CHANGE,
                     min_relative_change=_MIN_RELATIVE_CHANGE,
                     min_steppiness=_MIN_STEPPINESS,
                     multiple_of_std_dev=_MULTIPLE_OF_STD_DEV):
    """Finds at most one change point in the given series.

  Only the last |max_window_size| points are examined, regardless of
  how many points are passed in. The reason why it might make sense to
  limit the number of points to look at is that if there are multiple
  change-points in the window that's looked at, then this function will
  be less likely to find any of them.

  First, the "most likely" split is found. The most likely split is defined as
  a split that minimizes the sum of the variances on either side. Then the
  split point is checked to see whether it passes the thresholds.

  Args:
    series: A list of (x, y) pairs.
    max_window_size: Number of points to analyze.
    min_segment_size: Min size of segments before or after change point.
    min_absolute_change: Absolute change threshold.
    min_relative_change: Relative change threshold.
    min_steppiness: Threshold for how similar to a step a change point must be.
    multiple_of_std_dev: Threshold for change as multiple of std. deviation.

  Returns:
    A list with one ChangePoint object, or an empty list.
  """
    if len(series) < 2:
        return []  # Not enough points to possibly contain a valid split point.
    series = series[-max_window_size:]
    _, y_values = zip(*series)
    # Primary algorithm: variance-minimising split.
    split_index = _FindSplit(y_values)
    # Study mode: also run the clustering (E-divisive approximation) detector
    # so the two algorithms' answers can be compared in the logs. Its failure
    # must not affect this function's result.
    alternate_split_index = None
    try:
        alternate_split_index = clustering_change_detector.ClusterAndFindSplit(
            y_values, min_segment_size)
        logging.warning(
            'Alternative found an alternate split at index %s compared to %s (%s)',
            alternate_split_index, split_index,
            'SAME' if alternate_split_index == split_index else 'DIFFERENT')
        logging.debug(
            'Revisions found: alternate = %s (index=%s); current = %s (index=%s)',
            series[alternate_split_index][0], alternate_split_index,
            series[split_index][0], split_index)
    except clustering_change_detector.Error as e:
        # TODO(dberris): When this is the default, bail out after logging.
        logging.warning('Pinpoint based comparison failed: %s', e)

    # Only the primary split is thresholded into the actual return value; the
    # alternate's verdict is computed purely for the comparison log below.
    make_change_point = _PassesThresholds(
        y_values,
        split_index,
        min_segment_size=min_segment_size,
        min_absolute_change=min_absolute_change,
        min_relative_change=min_relative_change,
        min_steppiness=min_steppiness,
        multiple_of_std_dev=multiple_of_std_dev)
    alternate_make_change_point = _PassesThresholds(
        y_values,
        alternate_split_index,
        min_segment_size=min_segment_size,
        min_absolute_change=min_absolute_change,
        min_relative_change=min_relative_change,
        min_steppiness=min_steppiness,
        multiple_of_std_dev=multiple_of_std_dev
    ) if alternate_split_index is not None else False
    logging.info(
        'Anomaly detection study: current=%s alternate=%s diff=%s',
        'CHANGE_FOUND' if make_change_point else 'NO_CHANGE',
        'CHANGE_FOUND' if alternate_make_change_point else 'NO_CHANGE',
        'SAME' if alternate_split_index == split_index else 'DIFFERENT')

    # TODO(dberris): Make this dependent on the alternate when we change the
    # default algorithm.
    return [MakeChangePoint(series, split_index)] if make_change_point else []
def FindChangePoints(series,
                     max_window_size=defaults.MAX_WINDOW_SIZE,
                     min_segment_size=defaults.MIN_SEGMENT_SIZE,
                     min_absolute_change=defaults.MIN_ABSOLUTE_CHANGE,
                     min_relative_change=defaults.MIN_RELATIVE_CHANGE,
                     min_steppiness=defaults.MIN_STEPPINESS,
                     multiple_of_std_dev=defaults.MULTIPLE_OF_STD_DEV,
                     rand=None):
    """Finds change points in the given series.

  Only the last |max_window_size| points are examined, regardless of how many
  points are passed in. The reason why it might make sense to limit the number
  of points to look at is that if there are multiple change-points in the window
  that's looked at, then this function will be less likely to find any of them.

  This uses a clustering change detector (an approximation of E-divisive) in the
  `clustering_change_detector` module.

  Args:
    series: A list of (x, y) pairs.
    max_window_size: Number of points to analyze.
    min_segment_size: Min size of segments before or after change point.
    min_absolute_change: Absolute change threshold.
    min_relative_change: Relative change threshold.
    min_steppiness: Threshold for how similar to a step a change point must be.
    multiple_of_std_dev: Threshold for change as multiple of std. deviation.
    rand: Optional source of randomness forwarded to the clustering detector.

  Returns:
    A list with one ChangePoint object, or an empty list.
  """
    if len(series) < 2:
        return []  # Not enough points to possibly contain a valid split point.
    series = series[-max_window_size:]
    _, y_values = zip(*series)

    # candidate_points holds (index, ...) tuples from the detector — the
    # unpacking `for idx, _ in candidate_points` below implies 2-tuples.
    # "No data" is an expected outcome, not an error, so it yields [].
    candidate_points = []
    try:
        candidate_points = clustering_change_detector.ClusterAndFindSplit(
            y_values, rand=rand)
    except clustering_change_detector.InsufficientData:
        pass

    def RevAndIdx(idx):
        # Format a candidate as ('rev:<x value>', 'idx:<offset>') for logging.
        return ('rev:%s' % (series[idx][0], ), 'idx:%s' % (idx, ))

    logging.info('E-Divisive candidate change-points: %s',
                 [RevAndIdx(idx) for idx, _ in candidate_points])
    # Walk candidates from highest index to lowest, keeping only those that
    # pass the configured thresholds.
    change_points = []
    for point in reversed(sorted(candidate_points)):
        passed_filter, reject_reason = _PassesThresholds(
            y_values,
            point[0],
            min_segment_size=min_segment_size,
            min_absolute_change=min_absolute_change,
            min_relative_change=min_relative_change,
            min_steppiness=min_steppiness,
            multiple_of_std_dev=multiple_of_std_dev)
        if passed_filter:
            change_points.append(point)
        else:
            logging.debug('Rejected %s as potential index (%s); reason = %s',
                          point, RevAndIdx(point[0]), reject_reason)

    logging.info('E-Divisive potential change-points: %s',
                 [RevAndIdx(idx) for idx, _ in change_points])
    # NOTE(review): MakeChangePoint receives the whole (index, ...) tuple
    # here, whereas other variants pass a bare index — confirm that
    # MakeChangePoint accepts the tuple form.
    return [MakeChangePoint(series, point) for point in change_points]
 def testClusterAndFindSplit_MinSegmentSizeZero(self):
     # Two equal-length flat segments: the split lands exactly between them.
     data = [1] * 10 + [2] * 10
     found = ccd.ClusterAndFindSplit(data, self.rand)
     logging.debug('Splits = %s', found)
     self.assertEqual([10], found)
 def testClusterAndFindSplit_SpikeAndLevelChange(self):
     # The spike at 100, the drop right after it, and the level change to 2
     # are each reported as a separate change point.
     data = [1] * 100 + [1000, 1] + [2] * 100
     found = ccd.ClusterAndFindSplit(data, self.rand)
     logging.debug('Splits = %s', found)
     self.assertEqual([100, 101, 102], found)
 def testClusterAndFindSplit_Steps(self):
     # The first step boundary is found, together with its exact index range.
     data = [1] * 10 + [2] * 10 + [1] * 10
     found = ccd.ClusterAndFindSplit(data, self.rand)
     self.assertIn((10, (10, 10)), found)
# Example #15 (score: 0)
def FindChangePoints(series,
                     max_window_size=_MAX_WINDOW_SIZE,
                     min_segment_size=MIN_SEGMENT_SIZE,
                     min_absolute_change=_MIN_ABSOLUTE_CHANGE,
                     min_relative_change=_MIN_RELATIVE_CHANGE,
                     min_steppiness=_MIN_STEPPINESS,
                     multiple_of_std_dev=_MULTIPLE_OF_STD_DEV):
    """Finds change points in the given series.

  Only the last |max_window_size| points are examined, regardless of
  how many points are passed in. The reason why it might make sense to
  limit the number of points to look at is that if there are multiple
  change-points in the window that's looked at, then this function will
  be less likely to find any of them.

  This uses two algorithms:

    - A clustering change detector (an approximation of E-divisive) in the
      `clustering_change_detector` module.
    - A variance minimisation change point detection algorithm.

  We run both algorithms, but only use the results from one.

  Args:
    series: A list of (x, y) pairs.
    max_window_size: Number of points to analyze.
    min_segment_size: Min size of segments before or after change point.
    min_absolute_change: Absolute change threshold.
    min_relative_change: Relative change threshold.
    min_steppiness: Threshold for how similar to a step a change point must be.
    multiple_of_std_dev: Threshold for change as multiple of std. deviation.

  Returns:
    A list with one ChangePoint object, or an empty list.
  """
    if len(series) < 2:
        return []  # Not enough points to possibly contain a valid split point.
    series = series[-max_window_size:]
    _, y_values = zip(*series)

    # TODO(dberris): Remove this when we're convinced we no longer need this
    # alternate implementation.
    alternate_split_index = _FindSplit(y_values)
    candidate_indices = []
    split_index = 0

    def RelativeIndexAdjuster(base, offset):
        # Map an index from the re-clustered sub-sequence back into the
        # coordinate space of the full y_values sequence.
        if base == 0:
            return offset

        # Prevent negative indices.
        return max((base + offset) - min_segment_size, 0)

    # We iteratively find all potential change-points in the range: each pass
    # re-clusters the tail starting just before the last split found, and we
    # stop when no new indices appear or the remaining tail is too short.
    while split_index + min_segment_size < len(y_values):
        try:
            # First we get an adjusted set of indices, starting from split_index,
            # filtering out the ones we've already seen.
            potential_candidates_unadjusted = (
                clustering_change_detector.ClusterAndFindSplit(
                    y_values[max(split_index - min_segment_size, 0):],
                    min_segment_size))
            potential_candidates_unfiltered = [
                RelativeIndexAdjuster(split_index, x)
                for x in potential_candidates_unadjusted
            ]
            potential_candidates = [
                x for x in potential_candidates_unfiltered
                if x not in candidate_indices
            ]
            logging.debug('New indices: %s', potential_candidates)

            if potential_candidates:
                candidate_indices.extend(potential_candidates)
                # Resume scanning from the furthest new candidate; this is
                # what guarantees forward progress and loop termination.
                split_index = max(potential_candidates)
            else:
                break
        except clustering_change_detector.Error as e:
            # A failure before any candidate is found means no result at all;
            # after at least one candidate, keep what we have.
            if not candidate_indices:
                logging.warning('Clustering change point detection failed: %s',
                                e)
                return []
            break

    # Threshold the variance-minimisation result purely for the comparison
    # logs below; it does not influence the returned change points.
    alternate_make_change_point, alternate_reason = _PassesThresholds(
        y_values,
        alternate_split_index,
        min_segment_size=min_segment_size,
        min_absolute_change=min_absolute_change,
        min_relative_change=min_relative_change,
        min_steppiness=min_steppiness,
        multiple_of_std_dev=multiple_of_std_dev)
    if not alternate_make_change_point:
        logging.warning(
            'Alternate rejected %s as potential index; reason = %s',
            alternate_split_index, alternate_reason)

    def RevAndIdx(idx):
        # Format a candidate as ('rev:<x value>', 'idx:<offset>') for logging.
        return ('rev:%s' % (series[idx][0], ), 'idx:%s' % (idx, ))

    logging.info('E-Divisive candidate change-points: %s',
                 [RevAndIdx(idx) for idx in candidate_indices])
    # Walk candidates from highest index to lowest, keeping only those that
    # pass the configured thresholds.
    change_points = []
    for potential_index in reversed(sorted(candidate_indices)):
        passed_filter, reject_reason = _PassesThresholds(
            y_values,
            potential_index,
            min_segment_size=min_segment_size,
            min_absolute_change=min_absolute_change,
            min_relative_change=min_relative_change,
            min_steppiness=min_steppiness,
            multiple_of_std_dev=multiple_of_std_dev)
        if passed_filter:
            change_points.append(potential_index)
        else:
            logging.debug('Rejected %s as potential index (%s); reason = %s',
                          potential_index, RevAndIdx(potential_index),
                          reject_reason)

    logging.info('E-Divisive potential change-points: %s',
                 [RevAndIdx(idx) for idx in change_points])
    logging.info(
        'Anomaly detection study: current=%s alternate=%s diff=%s',
        'CHANGE_FOUND' if change_points else 'NO_CHANGE',
        'CHANGE_FOUND' if alternate_make_change_point else 'NO_CHANGE',
        'SAME' if change_points and alternate_split_index == change_points[0]
        else 'DIFFERENT')
    if change_points:
        logging.warning(
            'Alternative found an alternate split at index %s compared to %s (%s)',
            alternate_split_index, change_points[0], 'SAME'
            if alternate_split_index == change_points[0] else 'DIFFERENT')
    # Only the first (highest-index) surviving change point is returned.
    return [MakeChangePoint(series, index) for index in change_points[0:1]]
# Example #16 (score: 0)
 def testClusterAndFindSplit_Spikes(self):
     # A lone spike in an otherwise flat series is not enough evidence for a
     # change point; the detector must raise InsufficientData.
     data = [1] * 100 + [1000] + [1] * 100
     with self.assertRaises(ccd.InsufficientData):
         result = ccd.ClusterAndFindSplit(data, 6, self.rand)
         logging.debug('Split = %s', result)
 def testClusterAndFindSplit_Spikes(self):
     # A lone spike is reported as a change point immediately after it.
     data = [1] * 100 + [1000] + [1] * 100
     found = ccd.ClusterAndFindSplit(data, self.rand)
     logging.debug('Splits = %s', found)
     self.assertEqual([101], found)
 def testClusterAndFindSplit_Simple(self):
     # A clean level change in a contrived series yields a split at index 10
     # with a tight (exact) index range.
     data = [1] * 10 + [10] * 10
     found = ccd.ClusterAndFindSplit(data, self.rand)
     self.assertIn((10, (10, 10)), found)