Example #1
 def test_receive_data(self):
     bucket = DataBucket()
     data_val = list(range(6))
     timestamp_list = create_list_of_timestamps(len(data_val))
     for val in data_val:
         bucket.receive_data(get_pd_dataframe([val], [1523889000000 + val]))
     for idx, row in bucket.data.iterrows():
         self.assertEqual(data_val[idx], row['value'])
         self.assertEqual(timestamp_list[idx], row['timestamp'])
Example #2
 def test_drop_data(self):
     bucket = DataBucket()
     data_val = list(range(10))
     timestamp_list = create_list_of_timestamps(len(data_val))
     bucket.receive_data(get_pd_dataframe(data_val, timestamp_list))
     bucket.drop_data(5)
     expected_data = data_val[5:]
     expected_timestamp = timestamp_list[5:]
     self.assertEqual(expected_data, bucket.data['value'].tolist())
     self.assertEqual(expected_timestamp, bucket.data['timestamp'].tolist())
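The fixtures get_pd_dataframe and create_list_of_timestamps are not shown in these examples; a minimal sketch consistent with the assertions above (the millisecond base and one-unit step are assumptions inferred from Example #1, so the real helpers may differ) could look like:

import pandas as pd

def create_list_of_timestamps(length: int) -> list:
    # Assumed: millisecond timestamps spaced one unit apart, matching Example #1.
    return [1523889000000 + i for i in range(length)]

def get_pd_dataframe(values: list, timestamps: list) -> pd.DataFrame:
    # Assumed: builds the two-column frame that DataBucket.receive_data consumes.
    return pd.DataFrame({'value': values, 'timestamp': timestamps})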
Example #3
 def __init__(self, pattern_type: str, analytic_unit_id: AnalyticUnitId):
     super().__init__(analytic_unit_id)
     self.pattern_type = pattern_type
     self.model = resolve_model_by_pattern(self.pattern_type)
     self.bucket = DataBucket()
Example #4
class PatternDetector(Detector):

    MIN_BUCKET_SIZE = 150
    BUCKET_WINDOW_SIZE_FACTOR = 5
    DEFAULT_WINDOW_SIZE = 1

    def __init__(self, pattern_type: str, analytic_unit_id: AnalyticUnitId):
        super().__init__(analytic_unit_id)
        self.pattern_type = pattern_type
        self.model = resolve_model_by_pattern(self.pattern_type)
        self.bucket = DataBucket()

    def train(self, dataframe: pd.DataFrame, segments: List[Segment],
              cache: Optional[ModelCache]) -> ModelCache:
        # TODO: pass only part of dataframe that has segments

        if not self.contains_labeled_segments(segments):
            msg = f'{self.analytic_unit_id} has no positive labeled segments. Pattern detector needs at least 1 positive labeled segment'
            logger.error(msg)
            raise ValueError(msg)

        self.model.state: models.ModelState = self.model.get_state(cache)
        new_cache: models.ModelState = self.model.fit(dataframe, segments,
                                                      self.analytic_unit_id)

        # time step is optional
        if len(dataframe) > 1:
            new_cache.time_step = utils.find_interval(dataframe)

        new_cache = new_cache.to_json()
        if len(new_cache) == 0:
            logger.warning(
                'new_cache is empty with data: {}, segments: {}, cache: {}, analytic unit: {}'
                .format(dataframe, segments, cache, self.analytic_unit_id))
        return {'cache': new_cache}

    def detect(self, dataframe: pd.DataFrame,
               cache: Optional[ModelCache]) -> DetectionResult:
        logger.debug('Unit {} got {} data points for detection'.format(
            self.analytic_unit_id, len(dataframe)))
        # TODO: split and sleep (https://github.com/hastic/hastic-server/pull/124#discussion_r214085643)

        if cache is None:
            msg = f'{self.analytic_unit_id} detection got invalid cache, skip detection'
            logger.error(msg)
            raise ValueError(msg)

        self.model.state = self.model.get_state(cache)
        window_size = self.model.state.window_size

        if window_size is None:
            message = '{} got cache without window_size for detection'.format(
                self.analytic_unit_id)
            logger.error(message)
            raise ValueError(message)

        if len(dataframe) < window_size * 2:
            message = f'{self.analytic_unit_id} skip detection: dataset length {len(dataframe)} points is less than the minimal length of {window_size * 2} points'
            logger.error(message)
            raise ValueError(message)

        detected = self.model.detect(dataframe, self.analytic_unit_id)

        segments = [
            Segment(segment[0], segment[1]) for segment in detected['segments']
        ]
        new_cache = detected['cache'].to_json()
        last_dataframe_time = dataframe.iloc[-1]['timestamp']
        last_detection_time = convert_pd_timestamp_to_ms(last_dataframe_time)
        return DetectionResult(new_cache, segments, last_detection_time)

    def consume_data(self, data: pd.DataFrame,
                     cache: Optional[ModelCache]) -> Optional[DetectionResult]:
        logger.debug('Start consume_data for analytic unit {}'.format(
            self.analytic_unit_id))

        if cache is None:
            logger.debug(
                f'consume_data got invalid cache {cache} for task {self.analytic_unit_id}, skip'
            )
            return None

        data_without_nan = data.dropna()

        if len(data_without_nan) == 0:
            return None

        # TODO: use ModelState
        window_size = cache['windowSize']
        bucket_max_size = max(window_size * self.BUCKET_WINDOW_SIZE_FACTOR,
                              self.MIN_BUCKET_SIZE)

        self.bucket.set_max_size(bucket_max_size)
        self.bucket.append_data(data_without_nan)

        bucket_size = self.bucket.get_current_size()
        if bucket_size < window_size * 2:
            msg = f'{self.analytic_unit_id} bucket size {bucket_size} is less than twice the window size ({window_size * 2}), skip detection from consume_data'
            logger.debug(msg)
            return None

        res = self.detect(self.bucket.data, cache)

        logger.debug(
            'End consume_data for analytic unit: {} with res: {}'.format(
                self.analytic_unit_id, str(res.to_json())))

        if res:
            return res
        else:
            return None

    def get_window_size(self, cache: Optional[ModelCache]) -> int:
        if cache is None: return self.DEFAULT_WINDOW_SIZE
        # TODO: windowSize -> window_size
        return cache.get('windowSize', self.DEFAULT_WINDOW_SIZE)

    def contains_labeled_segments(self, segments: List[Segment]) -> bool:
        for segment in segments:
            if segment.labeled:
                return True
        return False
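A small worked example of the bucket sizing arithmetic that consume_data uses above (the windowSize value is invented for illustration):

window_size = 20                              # assumed cache['windowSize']
bucket_max_size = max(window_size * PatternDetector.BUCKET_WINDOW_SIZE_FACTOR,
                      PatternDetector.MIN_BUCKET_SIZE)
assert bucket_max_size == 150                 # MIN_BUCKET_SIZE wins until window_size > 30
# detect() is only attempted once the bucket holds at least window_size * 2 = 40 points.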
Example #5
 def __init__(self, analytic_unit_id: AnalyticUnitId):
     super().__init__(analytic_unit_id)
     self.bucket = DataBucket()
Example #6
class AnomalyDetector(ProcessingDetector):

    def __init__(self, analytic_unit_id: AnalyticUnitId):
        super().__init__(analytic_unit_id)
        self.bucket = DataBucket()

    def train(self, dataframe: pd.DataFrame, payload: Union[list, dict], cache: Optional[ModelCache]) -> ModelCache:
        cache = AnomalyCache.from_json(payload)
        cache.time_step = utils.find_interval(dataframe)
        segments = cache.segments

        if len(segments) > 0:
            seasonality = cache.seasonality
            prepared_segments = []

            for segment in segments:
                segment_len = (int(segment.to_timestamp) - int(segment.from_timestamp))
                assert segment_len <= seasonality, \
                    f'seasonality {seasonality} must be greater than or equal to segment length {segment_len}'

                from_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.from_timestamp, unit='ms'))
                to_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.to_timestamp, unit='ms'))
                segment_data = dataframe[from_index : to_index]
                prepared_segments.append(
                    AnomalyDetectorSegment(
                        segment.from_timestamp,
                        segment.to_timestamp,
                        segment_data.value.tolist()
                    )
                )
            cache.set_segments(prepared_segments)

        return {
            'cache': cache.to_json()
        }
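    # Illustration of the guard above (values invented): with seasonality equal to one
    # day in ms (86400000), every labeled segment must span at most one day; each
    # segment's raw values are stored in the cache so detect() and process_data() can
    # replay them at the seasonal positions.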

    # TODO: ModelCache -> DetectorState
    def detect(self, dataframe: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
        if cache is None:
            raise ValueError(f'Analytic unit {self.analytic_unit_id} got empty cache')
        data = dataframe['value']

        cache = AnomalyCache.from_json(cache)
        segments = cache.segments
        enabled_bounds = cache.get_enabled_bounds()

        smoothed_data = utils.exponential_smoothing(data, cache.alpha)

        lower_bound = smoothed_data - cache.confidence
        upper_bound = smoothed_data + cache.confidence

        if len(segments) > 0:
            data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])

            for segment in segments:
                seasonality_index = cache.seasonality // cache.time_step
                seasonality_offset = self.get_seasonality_offset(
                    segment.from_timestamp,
                    cache.seasonality,
                    data_start_time,
                    cache.time_step
                )
                segment_data = pd.Series(segment.data)

                lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)

        detected_segments = list(self.detections_generator(dataframe, upper_bound, lower_bound, enabled_bounds))

        last_dataframe_time = dataframe.iloc[-1]['timestamp']
        last_detection_time = utils.convert_pd_timestamp_to_ms(last_dataframe_time)

        return DetectionResult(cache.to_json(), detected_segments, last_detection_time)
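    # Worked illustration of the band above (numbers invented): with confidence = 2 and
    # smoothed_data = [10, 11, 12], the raw values are compared against
    # lower_bound = [8, 9, 10] and upper_bound = [12, 13, 14]; seasonal segments, if any,
    # widen the band further at their seasonal offsets.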

    def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
        if cache is None:
            msg = f'consume_data got invalid cache {cache} for task {self.analytic_unit_id}'
            logging.debug(msg)
            raise ValueError(msg)

        data_without_nan = data.dropna()

        if len(data_without_nan) == 0:
            return None

        self.bucket.receive_data(data_without_nan)

        if len(self.bucket.data) >= self.get_window_size(cache):
            return self.detect(self.bucket.data, cache)

        return None

    def is_detection_intersected(self) -> bool:
        return False

    def get_window_size(self, cache: Optional[ModelCache]) -> int:
        '''
        get the number of values that will affect the next value
        '''

        if cache is None:
            raise ValueError('anomaly detector got None cache')
        cache = AnomalyCache.from_json(cache)

        for level in range(1, MAX_DEPENDENCY_LEVEL):
            if (1 - cache.alpha) ** level < MIN_DEPENDENCY_FACTOR:
                break

        seasonality = 0
        if len(cache.segments) > 0:
            seasonality = cache.seasonality // cache.time_step
        return max(level, seasonality)
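    # Worked example (alpha and MIN_DEPENDENCY_FACTOR values are illustrative): with
    # cache.alpha = 0.5 and MIN_DEPENDENCY_FACTOR = 0.1 the loop stops at level = 4,
    # since (1 - 0.5) ** 4 = 0.0625 < 0.1, so the window is max(4, seasonality // time_step).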

    def concat_detection_results(self, detections: List[DetectionResult]) -> DetectionResult:
        result = DetectionResult()
        time_step = detections[0].cache['timeStep']
        for detection in detections:
            result.segments.extend(detection.segments)
            result.last_detection_time = detection.last_detection_time
            result.cache = detection.cache
        result.segments = utils.merge_intersecting_segments(result.segments, time_step)
        return result

    # TODO: remove duplication with detect()
    def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> ProcessingResult:
        cache = AnomalyCache.from_json(cache)
        segments = cache.segments
        enabled_bounds = cache.get_enabled_bounds()

        # TODO: exponential_smoothing should return dataframe with related timestamps
        smoothed_data = utils.exponential_smoothing(dataframe['value'], cache.alpha)

        lower_bound = smoothed_data - cache.confidence
        upper_bound = smoothed_data + cache.confidence

        if len(segments) > 0:
            data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])

            for segment in segments:
                seasonality_index = cache.seasonality // cache.time_step
                # TODO: move it to utils and add tests
                seasonality_offset = self.get_seasonality_offset(
                    segment.from_timestamp,
                    cache.seasonality,
                    data_start_time,
                    cache.time_step
                )
                segment_data = pd.Series(segment.data)

                lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)

                # TODO: support multiple segments

        timestamps = utils.convert_series_to_timestamp_list(dataframe.timestamp)
        lower_bound_timeseries = list(zip(timestamps, lower_bound.values.tolist()))
        upper_bound_timeseries = list(zip(timestamps, upper_bound.values.tolist()))

        if enabled_bounds == Bound.ALL:
            return ProcessingResult(lower_bound_timeseries, upper_bound_timeseries)
        elif enabled_bounds == Bound.UPPER:
            return ProcessingResult(upper_bound = upper_bound_timeseries)
        elif enabled_bounds == Bound.LOWER:
            return ProcessingResult(lower_bound = lower_bound_timeseries)
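    # The bounds above are returned as lists of (timestamp, value) pairs, one per input
    # point, e.g. [(1523889000000, 8.0), (1523889000001, 9.0), ...]; the exact timestamp
    # units depend on utils.convert_series_to_timestamp_list.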

    def add_season_to_data(self, data: pd.Series, segment: pd.Series, offset: int, seasonality: int, bound_type: Bound) -> pd.Series:
        # data: smoothed series to which the seasonal segment bound is applied
        # Bound.UPPER adds the segment's upper bound at each seasonal position,
        # Bound.LOWER subtracts the segment's lower bound there
        len_smoothed_data = len(data)
        for idx, _ in enumerate(data):
            if idx - offset < 0:
                #TODO: add seasonality for non empty parts
                continue
            if (idx - offset) % seasonality == 0:
                if bound_type == Bound.UPPER:
                    upper_segment_bound = self.get_bounds_for_segment(segment)[0]
                    data = data.add(pd.Series(upper_segment_bound.values, index = segment.index + idx), fill_value = 0)
                elif bound_type == Bound.LOWER:
                    lower_segment_bound = self.get_bounds_for_segment(segment)[1]
                    data = data.add(pd.Series(lower_segment_bound.values * -1, index = segment.index + idx), fill_value = 0)
                else:
                    raise ValueError(f'unknown bound type: {bound_type.value}')

        return data[:len_smoothed_data]
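    # Illustration: with offset = 3 and seasonality = 10, the segment bound is added at
    # indices 3, 13, 23, ... of the smoothed series; indices before the offset are left
    # untouched (see the TODO above).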

    def get_bounds_for_segment(self, segment: pd.Series) -> Tuple[pd.Series, pd.Series]:
        '''
        segment is divided by the median to determine its top and bottom parts
        parts are smoothed and raised so the segment is between them
        '''
        if len(segment) < 2:
            return segment, segment
        segment = segment - segment.min()
        segment_median = segment.median()
        top_part = []
        bottom_part = []
        for val in segment.values:
            if val > segment_median:
                top_part.append(val)
                bottom_part.append(segment_median)
            else:
                bottom_part.append(val)
                top_part.append(segment_median)
        top_part = pd.Series(top_part, index = segment.index)
        bottom_part = pd.Series(bottom_part, index = segment.index)
        smoothed_top_part = utils.exponential_smoothing(top_part, BASIC_ALPHA)
        smoothed_bottom_part = utils.exponential_smoothing(bottom_part, BASIC_ALPHA)
        top_difference = []
        bottom_difference = []
        for idx, val in enumerate(top_part):
            top_difference.append(abs(val - smoothed_top_part[idx]))
            bottom_difference.append(abs(bottom_part[idx] - smoothed_bottom_part[idx]))
        max_diff_top = max(top_difference)
        max_diff_bot = max(bottom_difference)
        upper_bound = []
        lower_bound = []
        for val in smoothed_top_part.values:
            upper_bound.append(val + max_diff_top)
        for val in smoothed_bottom_part.values:
            lower_bound.append(val + max_diff_bot)
        upper_bound = pd.Series(upper_bound, index = segment.index)
        lower_bound = pd.Series(lower_bound, index = segment.index)
        return upper_bound, lower_bound

    def get_seasonality_offset(self, from_timestamp: int, seasonality: int, data_start_time: int, time_step: int) -> int:
        season_count = math.ceil(abs(from_timestamp - data_start_time) / seasonality)
        start_seasonal_segment = from_timestamp + seasonality * season_count
        seasonality_time_offset = abs(start_seasonal_segment - data_start_time) % seasonality
        seasonality_offset = math.ceil(seasonality_time_offset / time_step)
        return seasonality_offset
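    # Worked example (numbers invented): from_timestamp = 1000, seasonality = 1000,
    # data_start_time = 4600, time_step = 100 gives season_count = ceil(3600 / 1000) = 4,
    # start_seasonal_segment = 1000 + 1000 * 4 = 5000,
    # seasonality_time_offset = abs(5000 - 4600) % 1000 = 400 and offset = ceil(400 / 100) = 4.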

    def detections_generator(
        self,
        dataframe: pd.DataFrame,
        upper_bound: pd.DataFrame,
        lower_bound: pd.DataFrame,
        enabled_bounds: Bound
    ) -> Generator[Segment, None, Segment]:
        in_segment = False
        segment_start = 0
        bound: Optional[Bound] = None
        for idx, val in enumerate(dataframe['value'].values):
            if val > upper_bound.values[idx]:
                if enabled_bounds == Bound.UPPER or enabled_bounds == Bound.ALL:
                    if not in_segment:
                        in_segment = True
                        segment_start = dataframe['timestamp'][idx]
                        bound = Bound.UPPER
                    continue

            if val < lower_bound.values[idx]:
                if enabled_bounds == Bound.LOWER or enabled_bounds == Bound.ALL:
                    if not in_segment:
                        in_segment = True
                        segment_start = dataframe['timestamp'][idx]
                        bound = Bound.LOWER
                    continue

            if in_segment:
                segment_end = dataframe['timestamp'][idx - 1]
                yield Segment(
                    utils.convert_pd_timestamp_to_ms(segment_start),
                    utils.convert_pd_timestamp_to_ms(segment_end),
                    message=f'{val} out of {str(bound.value)} bound'
                )
                in_segment = False
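        # for-else: runs once the loop finishes (there is no break); a segment still open
        # at the end of the data is returned as the generator's return value
        # (StopIteration) rather than yielded.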
        else:
            if in_segment:
                segment_end = dataframe['timestamp'][idx]
                return Segment(
                    utils.convert_pd_timestamp_to_ms(segment_start),
                    utils.convert_pd_timestamp_to_ms(segment_end),
                    message=f'{val} out of {str(bound.value)} bound'
                )
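Finally, a hypothetical end-to-end sketch of driving AnomalyDetector.detect directly; the cache field names (alpha, confidence, timeStep, segments) are assumptions, since the AnomalyCache schema is not shown in these examples:

import pandas as pd

detector = AnomalyDetector('test-analytic-unit-id')          # assumed unit id
dataframe = pd.DataFrame({
    'timestamp': pd.date_range('2018-04-16', periods=100, freq='min'),
    'value': [float(i % 10) for i in range(100)],
})
# Assumed cache payload: alpha and confidence drive the smoothed band, no seasonal segments.
cache = {'alpha': 0.5, 'confidence': 2.0, 'timeStep': 60000, 'segments': []}
result = detector.detect(dataframe, cache)
print(result.to_json())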