Example #1
def test_normalize_sensor_tags_not_ok():
    with pytest.raises(SensorTagNormalizationError):
        tag_list_as_list_of_strings_nonsense = [
            NON_RESOLVABLE_TAG_NAME1,
            NON_RESOLVABLE_TAG_NAME2,
        ]
        normalize_sensor_tags(tag_list_as_list_of_strings_nonsense)
Example #2
def test_load_series_dry_run(dates, ncs_reader):
    valid_tag_list_no_asset = normalize_sensor_tags(["TRC-123", "TRC-321"])
    for frame in ncs_reader.load_series(dates[0],
                                        dates[1],
                                        valid_tag_list_no_asset,
                                        dry_run=True):
        assert len(frame) == 0
Example #3
def test_normalize_sensor_tags_ok(
    good_input_tags, asset, default_asset, expected_output_tags
):
    tag_list_as_list_of_sensor_tag = normalize_sensor_tags(
        good_input_tags, asset, default_asset=default_asset
    )
    assert tag_list_as_list_of_sensor_tag == expected_output_tags
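The parametrized fixtures are not shown on this page; the sketch below is a hypothetical set of values, assuming that an explicitly passed asset is attached to plain string tags and that the result is a list of SensorTag namedtuples (the accepted input forms follow the TimeSeriesDataset docstring in Example #16). Tag names, the asset and the import path are placeholders, not taken from the test suite.

# Hypothetical fixture values -- illustrative only.
from gordo.machine.dataset.sensor_tag import SensorTag  # import path assumed

good_input_tags = ["TAG-1", "TAG-2"]
asset = "some-asset"
default_asset = None
# Assumes the explicit asset is applied to plain string tags during normalization.
expected_output_tags = [
    SensorTag("TAG-1", "some-asset"),
    SensorTag("TAG-2", "some-asset"),
]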
Example #4
def test_with_conflicted_file_types(dates):
    ncs_reader = NcsReader(AzureDLFileSystemMock(), remove_status_codes=[0])

    valid_tag_list = normalize_sensor_tags(["TRC-324"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1
    trc_324_series = tags_series[0]
    # The Parquet file should contain 15 rows
    assert len(trc_324_series) == 15
Example #5
def test_parquet_files_lookup(dates):
    ncs_reader = NcsReader(AzureDLFileSystemMock(), remove_status_codes=[0])

    valid_tag_list = normalize_sensor_tags(["TRC-323"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1
    trc_323_series = tags_series[0]
    assert trc_323_series.name == "TRC-323"
    assert trc_323_series.dtype.name == "float64"
    assert len(trc_323_series) == 20
Example #6
def test_load_series_with_filter_bad_data(dates, remove_status_codes):

    ncs_reader = NcsReader(AzureDLFileSystemMock(),
                           remove_status_codes=remove_status_codes)

    valid_tag_list = normalize_sensor_tags(["TRC-322"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    # Check that the bad data from the files under tests/gordo/data_provider/data/datalake/TRC-322
    # is filtered out. 20 rows exist, 5 of them have the value 0.

    n_expected = 15 if remove_status_codes != [] else 20
    assert all(len(series) == n_expected for series in series_gen)
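The remove_status_codes fixture itself is not part of this snippet; a hypothetical parametrization consistent with the assertion above (15 rows when status code 0 is filtered, 20 when nothing is removed) could look like this:

# Hypothetical parametrization -- not taken from the test suite.
import pytest

@pytest.mark.parametrize("remove_status_codes", [[0], []])
def test_load_series_with_filter_bad_data(dates, remove_status_codes):
    ...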
Example #7
def test_with_conflicted_file_types_with_preferable_csv(dates):
    ncs_reader = NcsReader(AzureDLFileSystemMock(),
                           remove_status_codes=[0],
                           lookup_for=["csv"])

    valid_tag_list = normalize_sensor_tags(["TRC-324"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1
    trc_324_series = tags_series[0]
    # The CSV file should contain 1 row
    assert len(trc_324_series) == 1
Example #8
    def tags(self) -> typing.List[SensorTag]:
        """
        The input tags for this model

        Returns
        -------
        typing.List[SensorTag]
        """
        return normalize_sensor_tags(
            g.metadata["dataset"]["tag_list"],
            asset=g.metadata["dataset"].get("asset"),
            default_asset=g.metadata["dataset"].get("default_asset"),
        )
Example #9
def _get_default_dataset_config():
    train_start_date = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    train_end_date = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    return {
        "type": "TimeSeriesDataset",
        "train_start_date": train_start_date,
        "train_end_date": train_end_date,
        "tag_list": normalize_sensor_tags(["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]),
        "data_provider": DataLakeProvider(),
    }
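A config dict like this maps onto the TimeSeriesDataset constructor shown in Example #16. Below is a minimal sketch of consuming it; popping the "type" key before unpacking is an assumption about how the config is dispatched, not something shown on this page:

# Illustrative sketch: "type" names the dataset class and is assumed to be
# stripped before the remaining keys reach the constructor.
config = _get_default_dataset_config()
config.pop("type")
dataset = TimeSeriesDataset(**config)
X, y = dataset.get_data()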
Example #10
def test_normalize_iroc_tags():
    normalized_tags = normalize_sensor_tags(IROC_MANY_ASSETS_TAG_LIST)
    assert normalized_tags == IROC_MANY_ASSETS_SENSOR_TAG_LIST
Example #11
def test_can_handle_tag_unknow_prefix_raise(ncs_reader):
    with pytest.raises(ValueError):
        ncs_reader.can_handle_tag(normalize_sensor_tags(["XYZ-123"])[0])
Example #12
def test_load_series_invalid_year(start_date, end_date, frame_len, ncs_reader):
    valid_tag_list = normalize_sensor_tags(["TRC-123"])
    frame = next(ncs_reader.load_series(start_date, end_date, valid_tag_list))
    assert len(frame) == frame_len
Example #13
def test_load_series_known_prefix(dates, ncs_reader):
    valid_tag_list_no_asset = normalize_sensor_tags(["TRC-123", "TRC-321"])
    for frame in ncs_reader.load_series(dates[0], dates[1],
                                        valid_tag_list_no_asset):
        assert len(frame) == 20
Example #14
@pytest.fixture
def ncs_reader():
    return NcsReader(AzureDLFileSystemMock())


@pytest.fixture
def dates():
    return (
        dateutil.parser.isoparse("2000-01-01T08:56:00+00:00"),
        dateutil.parser.isoparse("2001-09-01T10:01:00+00:00"),
    )


@pytest.mark.parametrize(
    "tag_to_check",
    [normalize_sensor_tags(["TRC-123"])[0],
     SensorTag("XYZ-123", "1776-TROC")],
)
def test_can_handle_tag_ok(tag_to_check, ncs_reader):
    assert ncs_reader.can_handle_tag(tag_to_check)


@pytest.mark.parametrize(
    "tag_to_check",
    [SensorTag("TRC-123", None),
     SensorTag("XYZ-123", "123-XXX")])
def test_can_handle_tag_notok(tag_to_check, ncs_reader):
    assert not ncs_reader.can_handle_tag(tag_to_check)


def test_can_handle_tag_unknow_prefix_raise(ncs_reader):
    with pytest.raises(ValueError):
        ncs_reader.can_handle_tag(normalize_sensor_tags(["XYZ-123"])[0])
Example #15
def get_machine_log_items(
        machine: Machine) -> Tuple[List[Metric], List[Param]]:
    """
    Create flat lists of MLflow logging entities from multilevel dictionary

    For more information, see the mlflow docs:
    https://www.mlflow.org/docs/latest/python_api/mlflow.tracking.html#mlflow.tracking.MlflowClient.log_batch

    Parameters
    ----------
    machine: Machine

    Returns
    -------
    metrics: List[Metric]
        List of MLFlow Metric objects to log.
    params: List[Param]
        List of MLFlow Param objects to log.
    """

    metrics: List[Metric] = list()
    build_metadata = machine.metadata.build_metadata

    # Project/machine parameters
    keys = ["project_name", "name"]
    params = [Param(attr, getattr(machine, attr)) for attr in keys]

    # Dataset parameters
    dataset_keys = [
        "train_start_date",
        "train_end_date",
        "resolution",
        "row_filter",
        "row_filter_buffer_size",
    ]
    params.extend(
        Param(k, str(getattr(machine.dataset, k))) for k in dataset_keys)

    # Model parameters
    model_keys = [
        "model_creation_date", "model_builder_version", "model_offset"
    ]
    params.extend(
        Param(k, str(getattr(build_metadata.model, k))) for k in model_keys)

    # Parse cross-validation split metadata
    splits = build_metadata.model.cross_validation.splits
    params.extend(Param(k, str(v)) for k, v in splits.items())

    # Parse cross-validation metrics

    tag_list = normalize_sensor_tags(machine.dataset.tag_list,
                                     asset=machine.dataset.asset)
    scores = build_metadata.model.cross_validation.scores

    keys = sorted(list(scores.keys()))
    subkeys = ["mean", "max", "min", "std"]

    n_folds = len(scores[keys[0]]) - len(subkeys)
    for k in keys:
        # Skip per tag data, produces too many params for MLflow
        if any([t.name in k for t in tag_list]):
            continue

        # Summary stats per metric
        for sk in subkeys:
            metrics.append(
                Metric(f"{k}-{sk}", scores[k][f"fold-{sk}"], epoch_now(), 0))
        # Append value for each fold with increasing steps
        metrics.extend(
            Metric(k, scores[k][f"fold-{i+1}"], epoch_now(), i)
            for i in range(n_folds))

    # Parse fit metrics
    try:
        meta_params = build_metadata.model.model_meta["history"]["params"]
    except KeyError:
        logger.debug(
            "Key 'build-metadata.model.history.params' not found found in metadata."
        )
    else:
        metrics.extend(
            Metric(k, float(getattr(build_metadata.model, k)), epoch_now(), 0)
            for k in ["model_training_duration_sec"])
        for m in meta_params["metrics"]:
            data = build_metadata.model.model_meta["history"][m]
            metrics.extend(
                Metric(m, float(x), timestamp=epoch_now(), step=i)
                for i, x in enumerate(data))
        params.extend(
            Param(k, str(meta_params[k]))
            for k in (p for p in meta_params if p != "metrics"))

    return metrics, params
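The docstring above points to MlflowClient.log_batch; below is a minimal sketch of feeding the returned lists into it, assuming a Machine instance and a configured MLflow tracking backend:

# Minimal sketch -- `machine` is assumed to be a gordo Machine instance.
from mlflow.tracking import MlflowClient

client = MlflowClient()
run = client.create_run(experiment_id="0")  # experiment id is a placeholder
metrics, params = get_machine_log_items(machine)
client.log_batch(run_id=run.info.run_id, metrics=metrics, params=params)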
Example #16
    def __init__(
        self,
        train_start_date: Union[datetime, str],
        train_end_date: Union[datetime, str],
        tag_list: Sequence[Union[str, Dict, SensorTag]],
        target_tag_list: Optional[Sequence[Union[str, Dict,
                                                 SensorTag]]] = None,
        data_provider: Union[GordoBaseDataProvider, dict] = DataLakeProvider(),
        resolution: Optional[str] = "10T",
        row_filter: str = "",
        aggregation_methods: Union[str, List[str], Callable] = "mean",
        row_filter_buffer_size: int = 0,
        asset: Optional[str] = None,
        default_asset: Optional[str] = None,
        n_samples_threshold: int = 0,
        low_threshold=-1000,
        high_threshold=50000,
        interpolation_method: str = "linear_interpolation",
        interpolation_limit: str = "8H",
        filter_periods={},
    ):
        """
        Creates a TimeSeriesDataset backed by a provided dataprovider.

        A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
        aligned, and (optionally) filtered.

        Parameters
        ----------
        train_start_date: Union[datetime, str]
            Earliest possible point in the dataset (inclusive)
        train_end_date: Union[datetime, str]
            Latest possible point in the dataset (exclusive)
        tag_list: Sequence[Union[str, Dict, sensor_tag.SensorTag]]
            List of tags to include in the dataset. The elements can be strings,
            dictionaries or SensorTag namedtuples.
        target_tag_list: Optional[Sequence[Union[str, Dict, sensor_tag.SensorTag]]]
            List of tags to set as the dataset y. These will be treated the same as
            tag_list when fetching and pre-processing (resampling) but will be split
            into the y return from ``.get_data()``
        data_provider: Union[GordoBaseDataProvider, dict]
            A dataprovider which can provide dataframes for tags from train_start_date to train_end_date
            of which can also be a config definition from a data provider's ``.to_dict()`` method.
        resolution: Optional[str]
            The bucket size for grouping all incoming time data (e.g. "10T").
            Available strings come from https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
            **Note**: If this parameter is ``None`` or ``False``, then _no_ aggregation/resampling is applied to the data.
        row_filter: str
            Filter on the rows. Only rows satisfying the filter will be in the dataset.
            See :func:`gordo.machine.dataset.filter_rows.pandas_filter_rows` for
            further documentation of the filter format.
        aggregation_methods
            Aggregation method(s) to use for the resampled buckets. If a single
            resample method is provided then the resulting dataframe will have names
            identical to the names of the series it got in. If several
            aggregation-methods are provided then the resulting dataframe will
            have a multi-level column index, with the series-name as the first level,
            and the aggregation method as the second level.
            See :py:func::`pandas.core.resample.Resampler#aggregate` for more
            information on possible aggregation methods.
        row_filter_buffer_size: int
            Rows selected for removal by the ``row_filter`` will also have this
            number of rows removed immediately before and after them.
            Default is 0.
        asset: Optional[str]
            Asset for which the tags are associated with.
        default_asset: Optional[str]
            Asset which will be used if `asset` is not provided and the tag is not
            resolvable to a specific asset.
        n_samples_threshold: int = 0
            The threshold at which the generated DataFrame is considered to have too few rows of data.
        interpolation_method: str
            How missing values should be interpolated: either forward fill (`ffill`)
            or linear interpolation (`linear_interpolation`, the default).
        interpolation_limit: str
            Sets how long after the last valid data point values will be
            interpolated/forward filled. Default is eight hours (`8H`).
            If None, all missing values are interpolated/forward filled.
        filter_periods: dict
            If specified, runs a series of algorithms that drop noisy data.
            See the `FilterPeriods` class for details.
        """
        self.train_start_date = self._validate_dt(train_start_date)
        self.train_end_date = self._validate_dt(train_end_date)

        if self.train_start_date >= self.train_end_date:
            raise ValueError(
                f"train_end_date ({self.train_end_date}) must be after train_start_date ({self.train_start_date})"
            )

        self.tag_list = normalize_sensor_tags(list(tag_list), asset,
                                              default_asset)
        self.target_tag_list = (normalize_sensor_tags(list(target_tag_list),
                                                      asset, default_asset)
                                if target_tag_list else self.tag_list.copy())
        self.resolution = resolution
        self.data_provider = (data_provider
                              if not isinstance(data_provider, dict) else
                              GordoBaseDataProvider.from_dict(data_provider))
        self.row_filter = row_filter
        self.aggregation_methods = aggregation_methods
        self.row_filter_buffer_size = row_filter_buffer_size
        self.asset = asset
        self.n_samples_threshold = n_samples_threshold
        self.low_threshold = low_threshold
        self.high_threshold = high_threshold
        self.interpolation_method = interpolation_method
        self.interpolation_limit = interpolation_limit
        self.filter_periods = (FilterPeriods(granularity=self.resolution,
                                             **filter_periods)
                               if filter_periods else None)

        if not self.train_start_date.tzinfo or not self.train_end_date.tzinfo:
            raise ValueError(
                f"Timestamps ({self.train_start_date}, {self.train_end_date}) need to include timezone "
                f"information")