import pytest

# NOTE: assumed import path, based on the module references in the
# TimeSeriesDataset docstring; adjust to match your package layout.
from gordo.machine.dataset.filter_periods import FilterPeriods


def test_filter_periods_median(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="median", n_iqr=1
    ).filter_data(data)

    assert data.shape == (9063, 1)
    assert data["Tag 1"].mean() == 0.5113691034704841
    assert sum(predictions["median"]["pred"]) == -493
    assert len(drop_periods["median"]) == 44
    assert data_filtered.shape == (8570, 1)
def test_filter_periods_iforest_smoothing(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="iforest", iforest_smooth=True
    ).filter_data(data)

    assert data.shape == (9674, 1)
    assert data["Tag 1"].mean() == 0.5019862352609169
    assert sum(predictions["iforest"]["pred"]) == 8552
    assert len(drop_periods["iforest"]) == 41
    assert data_filtered.shape == (9113, 1)


def test_filter_periods_iforest(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="iforest", iforest_smooth=False
    ).filter_data(data)

    assert data.shape == (12838, 1)
    assert data["Tag 1"].mean() == 0.5144733352386245
    assert sum(predictions["iforest"]["pred"]) == 12066
    assert len(drop_periods["iforest"]) == 61
    assert data_filtered.shape == (12452, 1)


def test_filter_periods_all_smoothing(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="all", n_iqr=1, iforest_smooth=True
    ).filter_data(data)

    assert data.shape == (8595, 1)
    assert data["Tag 1"].mean() == 0.512856120233814
    assert sum(predictions["iforest"]["pred"]) == 7471
    assert len(drop_periods["median"]) == 39
    assert len(drop_periods["iforest"]) == 29
    assert data_filtered.shape == (7522, 1)


def test_filter_periods_all(dataset):
    data, _ = dataset.get_data()
    data_filtered, drop_periods, predictions = FilterPeriods(
        granularity="10T", filter_method="all", n_iqr=1, iforest_smooth=False
    ).filter_data(data)

    assert data.shape == (8024, 1)
    assert data["Tag 1"].mean() == 0.500105748646813
    assert sum(predictions["median"]["pred"]) == -449
    assert sum(predictions["iforest"]["pred"]) == 7542
    assert len(drop_periods["median"]) == 39
    assert len(drop_periods["iforest"]) == 29
    assert data_filtered.shape == (7356, 1)
def test_filter_periods_typeerror(dataset):
    data, _ = dataset.get_data()
    assert data.shape == (9760, 1)

    with pytest.raises(TypeError):
        FilterPeriods(granularity="10T", filter_method="abc", n_iqr=1)
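# ---------------------------------------------------------------------------
# The tests above rely on a pytest fixture named ``dataset`` (normally defined
# in conftest.py) whose seeded data reproduces the exact shapes and means
# asserted above. Below is a minimal sketch of what such a fixture could look
# like; the import paths, RandomDataProvider, tag name, and date range are
# illustrative assumptions, not the project's actual conftest values, so the
# fixture is given a different name to avoid shadowing the real one.
# ---------------------------------------------------------------------------
from gordo.machine.dataset.datasets import TimeSeriesDataset  # assumed path
from gordo.machine.dataset.data_provider.providers import (  # assumed path
    RandomDataProvider,
)
from gordo.machine.dataset.sensor_tag import SensorTag


@pytest.fixture
def sketch_dataset():
    """Illustrative stand-in for the real ``dataset`` fixture."""
    return TimeSeriesDataset(
        data_provider=RandomDataProvider(),
        train_start_date="2017-12-25 06:00:00+00:00",
        train_end_date="2017-12-29 06:00:00+00:00",
        tag_list=[SensorTag("Tag 1", None)],
    )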
def __init__(
    self,
    train_start_date: Union[datetime, str],
    train_end_date: Union[datetime, str],
    tag_list: Sequence[Union[str, Dict, SensorTag]],
    target_tag_list: Optional[Sequence[Union[str, Dict, SensorTag]]] = None,
    data_provider: Union[GordoBaseDataProvider, dict] = DataLakeProvider(),
    resolution: Optional[str] = "10T",
    row_filter: Union[str, list] = "",
    known_filter_periods: Optional[list] = [],
    aggregation_methods: Union[str, List[str], Callable] = "mean",
    row_filter_buffer_size: int = 0,
    asset: Optional[str] = None,
    default_asset: Optional[str] = None,
    n_samples_threshold: int = 0,
    low_threshold: Optional[int] = -1000,
    high_threshold: Optional[int] = 50000,
    interpolation_method: str = "linear_interpolation",
    interpolation_limit: str = "8H",
    filter_periods: Optional[dict] = {},
    tag_normalizer: Union[str, Callable[..., List[SensorTag]]] = "default",
):
    """
    Creates a TimeSeriesDataset backed by a provided dataprovider.

    A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
    aligned, and (optionally) filtered.

    Parameters
    ----------
    train_start_date: Union[datetime, str]
        Earliest possible point in the dataset (inclusive)
    train_end_date: Union[datetime, str]
        Latest possible point in the dataset (exclusive)
    tag_list: Sequence[Union[str, Dict, sensor_tag.SensorTag]]
        List of tags to include in the dataset. The elements can be strings,
        dictionaries or SensorTag namedtuples.
    target_tag_list: Sequence[List[Union[str, Dict, sensor_tag.SensorTag]]]
        List of tags to set as the dataset y. These will be treated the same
        as tag_list when fetching and pre-processing (resampling) but will be
        split into the y return from ``.get_data()``
    data_provider: Union[GordoBaseDataProvider, dict]
        A dataprovider which can provide dataframes for tags from
        train_start_date to train_end_date, or a config definition created by
        a data provider's ``.to_dict()`` method.
    resolution: Optional[str]
        The bucket size for grouping all incoming time data (e.g. "10T").
        Available strings come from
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
        **Note**: If this parameter is ``None`` or ``False``, then _no_
        aggregation/resampling is applied to the data.
    row_filter: str or list
        Filter on the rows. Only rows satisfying the filter will be in the
        dataset. See :func:`gordo.machine.dataset.filter_rows.pandas_filter_rows`
        for further documentation of the filter format.
    known_filter_periods: list
        List of periods to drop, in the format
        [~('2020-04-08 04:00:00+00:00' < index < '2020-04-08 10:00:00+00:00')].
        Note the time-zone suffix (+00:00), which is required.
    aggregation_methods
        Aggregation method(s) to use for the resampled buckets. If a single
        resample method is provided then the resulting dataframe will have
        names identical to the names of the series it got in. If several
        aggregation methods are provided then the resulting dataframe will
        have a multi-level column index, with the series name as the first
        level and the aggregation method as the second level. See
        :py:func:`pandas.core.resample.Resampler#aggregate` for more
        information on possible aggregation methods.
    row_filter_buffer_size: int
        Whatever elements are selected for removal based on the
        ``row_filter`` will also have this number of elements removed fore
        and aft. Default is 0.
    asset: Optional[str]
        Asset with which the tags are associated.
    default_asset: Optional[str]
        Asset which will be used if `asset` is not provided and the tag is
        not resolvable to a specific asset.
    n_samples_threshold: int
        The threshold at which the generated DataFrame is considered to have
        too few rows of data.
    low_threshold: Optional[int]
        Lower bound of the range of valid data values; rows with values
        outside [low_threshold, high_threshold] are filtered out.
    high_threshold: Optional[int]
        Upper bound of the range of valid data values; see ``low_threshold``.
    interpolation_method: str
        How missing values should be interpolated. Either forward fill
        (`ffill`) or linear interpolation (default, `linear_interpolation`).
    interpolation_limit: str
        Sets how long from the last valid data point values will be
        interpolated/forward filled. Default is eight hours (`8H`). If None,
        all missing values are interpolated/forward filled.
    filter_periods: dict
        Performs a series of algorithms that drop noisy data if specified.
        See the `FilterPeriods` class for details.
    tag_normalizer: Union[str, Callable[..., List[SensorTag]]]
        `default` is the only supported value for now; it uses
        ``gordo.machine.dataset.sensor_tag.normalize_sensor_tags``.
    """
    self.train_start_date = self._validate_dt(train_start_date)
    self.train_end_date = self._validate_dt(train_end_date)

    if self.train_start_date >= self.train_end_date:
        raise ValueError(
            f"train_end_date ({self.train_end_date}) must be after "
            f"train_start_date ({self.train_start_date})"
        )

    if isinstance(tag_normalizer, str):
        if tag_normalizer not in self.TAG_NORMALIZERS:
            raise ValueError(
                "Unsupported tag_normalizer type '%s'" % tag_normalizer
            )
        tag_normalizer = self.TAG_NORMALIZERS[tag_normalizer]
    self.tag_normalizer = tag_normalizer

    self.asset = asset
    self.default_asset = default_asset
    self.tag_list = self.tag_normalizer(list(tag_list), asset, default_asset)
    self.target_tag_list = (
        self.tag_normalizer(list(target_tag_list), asset, default_asset)
        if target_tag_list
        else self.tag_list.copy()
    )
    self.resolution = resolution
    self.data_provider = (
        data_provider
        if not isinstance(data_provider, dict)
        else GordoBaseDataProvider.from_dict(data_provider)
    )
    self.row_filter = row_filter
    self.aggregation_methods = aggregation_methods
    self.row_filter_buffer_size = row_filter_buffer_size
    self.n_samples_threshold = n_samples_threshold
    self.low_threshold = low_threshold
    self.high_threshold = high_threshold
    self.interpolation_method = interpolation_method
    self.interpolation_limit = interpolation_limit
    self.filter_periods = (
        FilterPeriods(granularity=self.resolution, **filter_periods)
        if filter_periods
        else None
    )
    self.known_filter_periods = known_filter_periods

    if not self.train_start_date.tzinfo or not self.train_end_date.tzinfo:
        raise ValueError(
            f"Timestamps ({self.train_start_date}, {self.train_end_date}) "
            f"need to include timezone information"
        )

    super().__init__()
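# Example usage (a sketch, not part of the class): the tag names, date range,
# and row_filter expression below are illustrative assumptions (see
# pandas_filter_rows for the supported filter format), while the
# filter_periods keys mirror the FilterPeriods arguments exercised in the
# tests above. data_provider falls back to the default DataLakeProvider
# unless one is supplied.
#
#     dataset = TimeSeriesDataset(
#         train_start_date="2020-01-01 00:00:00+00:00",
#         train_end_date="2020-02-01 00:00:00+00:00",
#         tag_list=["Tag 1", "Tag 2"],
#         resolution="10T",
#         row_filter="`Tag 1` > 0",
#         filter_periods={"filter_method": "median", "n_iqr": 1},
#     )
#     X, y = dataset.get_data()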