def test_receive_data(self):
    bucket = DataBucket()
    data_val = list(range(6))
    timestamp_list = create_list_of_timestamps(len(data_val))

    for val in data_val:
        bucket.receive_data(get_pd_dataframe([val], [1523889000000 + val]))

    for idx, row in bucket.data.iterrows():
        self.assertEqual(data_val[idx], row['value'])
        self.assertEqual(timestamp_list[idx], row['timestamp'])
def test_drop_data(self):
    bucket = DataBucket()
    data_val = list(range(10))
    timestamp_list = create_list_of_timestamps(len(data_val))
    bucket.receive_data(get_pd_dataframe(data_val, timestamp_list))
    bucket.drop_data(5)

    expected_data = data_val[5:]
    expected_timestamp = timestamp_list[5:]
    self.assertEqual(expected_data, bucket.data['value'].tolist())
    self.assertEqual(expected_timestamp, bucket.data['timestamp'].tolist())
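# The helpers used above (create_list_of_timestamps, get_pd_dataframe) live
# elsewhere in the test suite. A minimal sketch of the behavior these tests
# assume; a hypothetical reconstruction, not the repo's actual implementation:

from typing import List

import pandas as pd

def create_list_of_timestamps(length: int) -> List[int]:
    # Millisecond timestamps matching the 1523889000000 base used above.
    return [1523889000000 + i for i in range(length)]

def get_pd_dataframe(values: List[int], timestamps: List[int]) -> pd.DataFrame:
    # One 'value' and one 'timestamp' column per point, the shape DataBucket stores.
    return pd.DataFrame({'value': values, 'timestamp': timestamps})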
class PatternDetector(Detector):

    MIN_BUCKET_SIZE = 150
    BUCKET_WINDOW_SIZE_FACTOR = 5
    DEFAULT_WINDOW_SIZE = 1

    def __init__(self, pattern_type: str, analytic_unit_id: AnalyticUnitId):
        super().__init__(analytic_unit_id)
        self.pattern_type = pattern_type
        self.model = resolve_model_by_pattern(self.pattern_type)
        self.bucket = DataBucket()

    def train(self, dataframe: pd.DataFrame, segments: List[Segment], cache: Optional[ModelCache]) -> ModelCache:
        # TODO: pass only the part of the dataframe that has segments
        if not self.contains_labeled_segments(segments):
            msg = f'{self.analytic_unit_id} has no positive labeled segments. Pattern detector needs at least 1 positive labeled segment'
            logger.error(msg)
            raise ValueError(msg)

        self.model.state: models.ModelState = self.model.get_state(cache)
        new_cache: models.ModelState = self.model.fit(dataframe, segments, self.analytic_unit_id)

        # time step is optional
        if len(dataframe) > 1:
            new_cache.time_step = utils.find_interval(dataframe)

        new_cache = new_cache.to_json()
        if len(new_cache) == 0:
            logger.warning('new_cache is empty with data: {}, segments: {}, cache: {}, analytic unit: {}'.format(
                dataframe, segments, cache, self.analytic_unit_id))

        return {'cache': new_cache}

    def detect(self, dataframe: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
        logger.debug('Unit {} got {} data points for detection'.format(self.analytic_unit_id, len(dataframe)))
        # TODO: split and sleep (https://github.com/hastic/hastic-server/pull/124#discussion_r214085643)

        if cache is None:
            msg = f'{self.analytic_unit_id} detection got invalid cache, skip detection'
            logger.error(msg)
            raise ValueError(msg)

        self.model.state = self.model.get_state(cache)
        window_size = self.model.state.window_size

        if window_size is None:
            message = '{} got cache without window_size for detection'.format(self.analytic_unit_id)
            logger.error(message)
            raise ValueError(message)

        if len(dataframe) < window_size * 2:
            message = f'{self.analytic_unit_id} skip detection: dataset length {len(dataframe)} points less than minimal length {window_size * 2} points'
            logger.error(message)
            raise ValueError(message)

        detected = self.model.detect(dataframe, self.analytic_unit_id)

        segments = [Segment(segment[0], segment[1]) for segment in detected['segments']]
        new_cache = detected['cache'].to_json()
        last_dataframe_time = dataframe.iloc[-1]['timestamp']
        last_detection_time = convert_pd_timestamp_to_ms(last_dataframe_time)
        return DetectionResult(new_cache, segments, last_detection_time)

    def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
        logger.debug('Start consume_data for analytic unit {}'.format(self.analytic_unit_id))

        if cache is None:
            logger.debug(f'consume_data got invalid cache {cache} for task {self.analytic_unit_id}, skip')
            return None

        data_without_nan = data.dropna()
        if len(data_without_nan) == 0:
            return None

        # TODO: use ModelState
        window_size = cache['windowSize']

        bucket_max_size = max(window_size * self.BUCKET_WINDOW_SIZE_FACTOR, self.MIN_BUCKET_SIZE)
        self.bucket.set_max_size(bucket_max_size)
        self.bucket.append_data(data_without_nan)

        bucket_size = self.bucket.get_current_size()
        if bucket_size < window_size * 2:
            msg = f'{self.analytic_unit_id} bucket data {bucket_size} less than two window size {window_size * 2}, skip run detection from consume_data'
            logger.debug(msg)
            return None

        res = self.detect(self.bucket.data, cache)
        logger.debug('End consume_data for analytic unit: {} with res: {}'.format(self.analytic_unit_id, str(res.to_json())))

        if res:
            return res
        else:
            return None

    def get_window_size(self, cache: Optional[ModelCache]) -> int:
        if cache is None:
            return self.DEFAULT_WINDOW_SIZE
        # TODO: windowSize -> window_size
        return cache.get('windowSize', self.DEFAULT_WINDOW_SIZE)

    def contains_labeled_segments(self, segments: List[Segment]) -> bool:
        return any(segment.labeled for segment in segments)
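# A quick sketch of the bucket sizing rule above: consume_data caps the bucket
# at max(window_size * BUCKET_WINDOW_SIZE_FACTOR, MIN_BUCKET_SIZE), so small
# windows fall back to the 150-point minimum, and detection is skipped until
# the bucket holds at least window_size * 2 points. The values below are
# chosen for illustration only:
if __name__ == '__main__':
    def bucket_cap(window_size: int) -> int:
        return max(window_size * PatternDetector.BUCKET_WINDOW_SIZE_FACTOR,
                   PatternDetector.MIN_BUCKET_SIZE)

    assert bucket_cap(20) == 150   # 20 * 5 = 100, clamped up to MIN_BUCKET_SIZE
    assert bucket_cap(40) == 200   # 40 * 5 = 200 already exceeds the minimum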
class AnomalyDetector(ProcessingDetector):

    def __init__(self, analytic_unit_id: AnalyticUnitId):
        super().__init__(analytic_unit_id)
        self.bucket = DataBucket()

    def train(self, dataframe: pd.DataFrame, payload: Union[list, dict], cache: Optional[ModelCache]) -> ModelCache:
        cache = AnomalyCache.from_json(payload)
        cache.time_step = utils.find_interval(dataframe)
        segments = cache.segments

        if len(segments) > 0:
            seasonality = cache.seasonality
            prepared_segments = []

            for segment in segments:
                segment_len = (int(segment.to_timestamp) - int(segment.from_timestamp))
                assert segment_len <= seasonality, \
                    f'seasonality {seasonality} must be greater than or equal to segment length {segment_len}'

                from_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.from_timestamp, unit='ms'))
                to_index = utils.timestamp_to_index(dataframe, pd.to_datetime(segment.to_timestamp, unit='ms'))
                segment_data = dataframe[from_index : to_index]
                prepared_segments.append(
                    AnomalyDetectorSegment(
                        segment.from_timestamp,
                        segment.to_timestamp,
                        segment_data.value.tolist()
                    )
                )

            cache.set_segments(prepared_segments)

        return {'cache': cache.to_json()}

    # TODO: ModelCache -> DetectorState
    def detect(self, dataframe: pd.DataFrame, cache: Optional[ModelCache]) -> DetectionResult:
        if cache is None:
            raise ValueError(f'Analytic unit {self.analytic_unit_id} got empty cache')

        data = dataframe['value']
        cache = AnomalyCache.from_json(cache)
        segments = cache.segments
        enabled_bounds = cache.get_enabled_bounds()

        smoothed_data = utils.exponential_smoothing(data, cache.alpha)

        lower_bound = smoothed_data - cache.confidence
        upper_bound = smoothed_data + cache.confidence

        if len(segments) > 0:
            data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])

            for segment in segments:
                seasonality_index = cache.seasonality // cache.time_step
                seasonality_offset = self.get_seasonality_offset(
                    segment.from_timestamp,
                    cache.seasonality,
                    data_start_time,
                    cache.time_step
                )
                segment_data = pd.Series(segment.data)

                lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)

        detected_segments = list(self.detections_generator(dataframe, upper_bound, lower_bound, enabled_bounds))

        last_dataframe_time = dataframe.iloc[-1]['timestamp']
        last_detection_time = utils.convert_pd_timestamp_to_ms(last_dataframe_time)

        return DetectionResult(cache.to_json(), detected_segments, last_detection_time)

    def consume_data(self, data: pd.DataFrame, cache: Optional[ModelCache]) -> Optional[DetectionResult]:
        if cache is None:
            msg = f'consume_data got invalid cache {cache} for task {self.analytic_unit_id}'
            logging.debug(msg)
            raise ValueError(msg)

        data_without_nan = data.dropna()
        if len(data_without_nan) == 0:
            return None

        self.bucket.receive_data(data_without_nan)

        if len(self.bucket.data) >= self.get_window_size(cache):
            return self.detect(self.bucket.data, cache)

        return None

    def is_detection_intersected(self) -> bool:
        return False

    def get_window_size(self, cache: Optional[ModelCache]) -> int:
        '''
        get the number of values that will affect the next value
        '''
        if cache is None:
            raise ValueError('anomaly detector got None cache')

        cache = AnomalyCache.from_json(cache)

        for level in range(1, MAX_DEPENDENCY_LEVEL):
            if (1 - cache.alpha) ** level < MIN_DEPENDENCY_FACTOR:
                break

        seasonality = 0
        if len(cache.segments) > 0:
            seasonality = cache.seasonality // cache.time_step
        return max(level, seasonality)

    def concat_detection_results(self,
                                 detections: List[DetectionResult]) -> DetectionResult:
        result = DetectionResult()
        time_step = detections[0].cache['timeStep']

        for detection in detections:
            result.segments.extend(detection.segments)
            result.last_detection_time = detection.last_detection_time
            result.cache = detection.cache

        result.segments = utils.merge_intersecting_segments(result.segments, time_step)
        return result

    # TODO: remove duplication with detect()
    def process_data(self, dataframe: pd.DataFrame, cache: ModelCache) -> ProcessingResult:
        cache = AnomalyCache.from_json(cache)
        segments = cache.segments
        enabled_bounds = cache.get_enabled_bounds()

        # TODO: exponential_smoothing should return dataframe with related timestamps
        smoothed_data = utils.exponential_smoothing(dataframe['value'], cache.alpha)

        lower_bound = smoothed_data - cache.confidence
        upper_bound = smoothed_data + cache.confidence

        if len(segments) > 0:
            data_start_time = utils.convert_pd_timestamp_to_ms(dataframe['timestamp'][0])

            for segment in segments:
                seasonality_index = cache.seasonality // cache.time_step
                # TODO: move it to utils and add tests
                seasonality_offset = self.get_seasonality_offset(
                    segment.from_timestamp,
                    cache.seasonality,
                    data_start_time,
                    cache.time_step
                )
                segment_data = pd.Series(segment.data)

                lower_bound = self.add_season_to_data(lower_bound, segment_data, seasonality_offset, seasonality_index, Bound.LOWER)
                upper_bound = self.add_season_to_data(upper_bound, segment_data, seasonality_offset, seasonality_index, Bound.UPPER)

        # TODO: support multiple segments

        timestamps = utils.convert_series_to_timestamp_list(dataframe.timestamp)
        lower_bound_timeseries = list(zip(timestamps, lower_bound.values.tolist()))
        upper_bound_timeseries = list(zip(timestamps, upper_bound.values.tolist()))

        if enabled_bounds == Bound.ALL:
            return ProcessingResult(lower_bound_timeseries, upper_bound_timeseries)
        elif enabled_bounds == Bound.UPPER:
            return ProcessingResult(upper_bound = upper_bound_timeseries)
        elif enabled_bounds == Bound.LOWER:
            return ProcessingResult(lower_bound = lower_bound_timeseries)

    def add_season_to_data(self, data: pd.Series, segment: pd.Series, offset: int, seasonality: int, bound_type: Bound) -> pd.Series:
        # data: smoothed data to which seasonality will be added
        # for Bound.UPPER the seasonal segment is added, for Bound.LOWER it is subtracted
        len_smoothed_data = len(data)
        for idx, _ in enumerate(data):
            if idx - offset < 0:
                # TODO: add seasonality for non empty parts
                continue
            if (idx - offset) % seasonality == 0:
                if bound_type == Bound.UPPER:
                    upper_segment_bound = self.get_bounds_for_segment(segment)[0]
                    data = data.add(pd.Series(upper_segment_bound.values, index = segment.index + idx), fill_value = 0)
                elif bound_type == Bound.LOWER:
                    lower_segment_bound = self.get_bounds_for_segment(segment)[1]
                    data = data.add(pd.Series(lower_segment_bound.values * -1, index = segment.index + idx), fill_value = 0)
                else:
                    raise ValueError(f'unknown bound type: {bound_type.value}')

        return data[:len_smoothed_data]

    def get_bounds_for_segment(self, segment: pd.Series) -> Tuple[pd.Series, pd.Series]:
        '''
        segment is divided by the median to determine its top and bottom parts
        parts are smoothed and raised so the segment is between them
        '''
        if len(segment) < 2:
            return segment, segment

        segment = segment - segment.min()
        segment_median = segment.median()
        top_part = []
        bottom_part = []

        for val in segment.values:
            if val > segment_median:
                top_part.append(val)
                bottom_part.append(segment_median)
            else:
                bottom_part.append(val)
                top_part.append(segment_median)

        top_part = pd.Series(top_part,
                             index = segment.index)
        bottom_part = pd.Series(bottom_part, index = segment.index)

        smoothed_top_part = utils.exponential_smoothing(top_part, BASIC_ALPHA)
        smoothed_bottom_part = utils.exponential_smoothing(bottom_part, BASIC_ALPHA)

        top_difference = []
        bottom_difference = []
        for idx, val in enumerate(top_part):
            top_difference.append(abs(val - smoothed_top_part[idx]))
            bottom_difference.append(abs(bottom_part[idx] - smoothed_bottom_part[idx]))

        max_diff_top = max(top_difference)
        max_diff_bot = max(bottom_difference)

        upper_bound = []
        lower_bound = []
        for val in smoothed_top_part.values:
            upper_bound.append(val + max_diff_top)
        for val in smoothed_bottom_part.values:
            lower_bound.append(val + max_diff_bot)

        upper_bound = pd.Series(upper_bound, index = segment.index)
        lower_bound = pd.Series(lower_bound, index = segment.index)

        return upper_bound, lower_bound

    def get_seasonality_offset(self, from_timestamp: int, seasonality: int, data_start_time: int, time_step: int) -> int:
        season_count = math.ceil(abs(from_timestamp - data_start_time) / seasonality)
        start_seasonal_segment = from_timestamp + seasonality * season_count
        seasonality_time_offset = abs(start_seasonal_segment - data_start_time) % seasonality
        seasonality_offset = math.ceil(seasonality_time_offset / time_step)
        return seasonality_offset

    def detections_generator(
        self,
        dataframe: pd.DataFrame,
        upper_bound: pd.DataFrame,
        lower_bound: pd.DataFrame,
        enabled_bounds: Bound
    ) -> Generator[Segment, None, Segment]:
        in_segment = False
        segment_start = 0
        bound: Optional[Bound] = None

        for idx, val in enumerate(dataframe['value'].values):
            if val > upper_bound.values[idx]:
                if enabled_bounds == Bound.UPPER or enabled_bounds == Bound.ALL:
                    if not in_segment:
                        in_segment = True
                        segment_start = dataframe['timestamp'][idx]
                        bound = Bound.UPPER
                    continue

            if val < lower_bound.values[idx]:
                if enabled_bounds == Bound.LOWER or enabled_bounds == Bound.ALL:
                    if not in_segment:
                        in_segment = True
                        segment_start = dataframe['timestamp'][idx]
                        bound = Bound.LOWER
                    continue

            if in_segment:
                segment_end = dataframe['timestamp'][idx - 1]
                yield Segment(
                    utils.convert_pd_timestamp_to_ms(segment_start),
                    utils.convert_pd_timestamp_to_ms(segment_end),
                    message=f'{val} out of {str(bound.value)} bound'
                )
                in_segment = False
        else:
            if in_segment:
                segment_end = dataframe['timestamp'][idx]
                return Segment(
                    utils.convert_pd_timestamp_to_ms(segment_start),
                    utils.convert_pd_timestamp_to_ms(segment_end),
                    message=f'{val} out of {str(bound.value)} bound'
                )
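# A worked check of get_seasonality_offset: a segment labeled at t=10000 ms
# with a 3000 ms seasonality repeats at ..., 4000, 7000, 10000, 13000, ...
# For data starting at t=2000 ms with one point every 500 ms, the first
# repetition inside the data (t=4000) lands 2000 ms, i.e. 4 points, after the
# data start. The unit id and all numbers are illustrative placeholders:
if __name__ == '__main__':
    detector = AnomalyDetector('sketch-unit-id')
    offset = detector.get_seasonality_offset(
        from_timestamp=10_000,
        seasonality=3_000,
        data_start_time=2_000,
        time_step=500
    )
    assert offset == 4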