def test_ncs_reader_kwargs_contains_remove_status_codes(remove_status_codes):
    # Create a DataLakeProvider with remove_status_codes passed as a kwarg
    data_provider = DataLakeProvider(
        interactive=False, remove_status_codes=remove_status_codes
    )

    # Set the data_provider's client to the AzureDLFileSystemMock since interactive is False.
    data_provider.client = AzureDLFileSystemMock()
    # Get the ncs_reader from data_provider.
    ncs_reader = data_provider._get_sub_dataproviders()[0]

    # Check that the remove_status_codes kwarg has been passed on to the sub-provider
    expected = [] if remove_status_codes == [] else [0]
    assert ncs_reader.remove_status_codes == expected
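The parametrization that feeds remove_status_codes into this test was not captured above. A minimal sketch of how it could be wired up with pytest, assuming the two cases the assertion distinguishes (an empty list and the default [0]); the parameter values here are an assumption, not the project's actual ones:

import pytest

# Hypothetical parametrization; the real decorator is not shown in the snippet above.
@pytest.mark.parametrize("remove_status_codes", [[], [0]])
def test_ncs_reader_kwargs_contains_remove_status_codes(remove_status_codes):
    ...  # body as in the example above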
Example #2
def _get_default_dataset_config():
    from_ts = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    to_ts = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    return {
        "type": "TimeSeriesDataset",
        "from_ts": from_ts,
        "to_ts": to_ts,
        "tag_list": normalize_sensor_tags(["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]),
        "data_provider": DataLakeProvider(),
    }
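The tests below pass this config through dataset._get_dataset; a minimal usage sketch mirroring them (the "resolution" override is optional, as in Example #6):

dataset_config = _get_default_dataset_config()
dataset_config["resolution"] = "10T"  # optional override; the default is 10 minutes
dl_backed = dataset._get_dataset(dataset_config)
data, _ = dl_backed.get_data()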
Example #3
def build(output_dir, model_config, data_config, metadata, model_register_dir):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    output_dir: str
        Directory to save model & metadata to.
    model_config: dict
        kwargs to be used in initializing the model. Should also
        contain kwarg 'type' which references the model to use, e.g. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use, e.g. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    """

    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines

    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date"))

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider()
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)

    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_location = provide_saved_model(model_config, data_config, metadata,
                                         output_dir, model_register_dir)
    with open("/tmp/model-location.txt", "w") as f:
        f.write(model_location)
    return 0
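For reference, a sketch of the shapes this build function expects for its config arguments, inferred from the keys it pops and reads above; the concrete values are placeholders taken from the default dataset config in Example #2:

# Illustrative config shapes only, inferred from the function body above.
model_config = {"type": "KerasAutoEncoder"}  # plus any model kwargs
data_config = {
    "type": "TimeSeriesDataset",
    "tags": ["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"],  # popped into "tag_list"
    "train_start_date": "2017-01-01T08:56:00+00:00",  # parsed into "from_ts"
    "train_end_date": "2017-01-01T10:01:00+00:00",    # parsed into "to_ts"
    "asset": None,  # optional, forwarded to normalize_sensor_tags
}
metadata = {}  # extra user metadata, saved under the key 'user-defined'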
Example #4
def test_get_data_serviceauth_fail(caplog):
    from_ts = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    to_ts = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")

    dataset_config = _get_default_dataset_config()
    dataset_config["from_ts"] = from_ts
    dataset_config["to_ts"] = to_ts
    dataset_config["data_provider"] = DataLakeProvider(
        dl_service_auth_str="TENTANT_UNKNOWN:BOGUS:PASSWORD"
    )

    dl_backed = dataset._get_dataset(dataset_config)

    with pytest.raises(adal.adal_error.AdalError), caplog.at_level(logging.CRITICAL):
        dl_backed.get_data()
Example #5
    def test_faked_DataLakeBackedDataset(self, _mocked_method):

        config = dict(
            from_ts=dateutil.parser.isoparse("2014-07-01T00:10:00+00:00"),
            to_ts=dateutil.parser.isoparse("2015-01-01T00:00:00+00:00"),
            tag_list=[
                "asgb.19ZT3950%2FY%2FPRIM",
                "asgb.19PST3925%2FDispMeasOut%2FPRIM",
            ],
        )

        provider = DataLakeProvider(storename="dataplatformdlsprod", interactive=True)
        dataset = TimeSeriesDataset(data_provider=provider, **config)

        # Should be able to call get_data without being asked to authenticate in tests
        X, y = dataset.get_data()
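This method is a fragment of a test class; the decorator that supplies _mocked_method was not captured. One possible wrapper, assuming the DataLakeProvider import used throughout these examples and assuming the intent is to stub the provider's client so get_data() never triggers an interactive Azure login; the patch target is a placeholder, not necessarily the project's actual hook:

import unittest
from unittest import mock

class DataLakeBackedDatasetTestCase(unittest.TestCase):
    # Placeholder patch: give DataLakeProvider a fake "client" attribute so no
    # interactive authentication is attempted. The real test may patch a different,
    # project-specific method instead.
    @mock.patch.object(DataLakeProvider, "client", create=True)
    def test_faked_DataLakeBackedDataset(self, _mocked_method):
        ...  # body as in the example above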
Example #6
def test_get_data_serviceauth_in_config():
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(
        dl_service_auth_str=os.getenv("TEST_SERVICE_AUTH")
    )
    dataset_config["resolution"] = "10T"
    dl_backed = dataset._get_dataset(dataset_config)
    data, _ = dl_backed.get_data()

    assert dataset_config["tag_list"] == list(data.columns.values)

    expected_rows = 7
    assert (
        len(data) == expected_rows
    ), f"Default resolution 10 minutes should give {expected_rows} rows"

    assert (
        not data.isnull().values.any()
    ), "Resulting dataframe should not have any NaNs"
Example #7
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml document which will be parsed to a dict and used in
        initializing the model. Should also contain key 'type' which references the
        model to use, e.g. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use, e.g. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """

    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines

    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)

    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))

    model_location = provide_saved_model(
        name, model_config, data_config, metadata, output_dir, model_register_dir
    )
    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        saved_metadata = load_metadata(model_location)
        all_scores = get_all_score_strings(saved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )

            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)

        for score in all_scores:
            print(score)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
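The model_parameter/expand_model step fills jinja variables in the raw model config string before it is parsed as yaml. A standalone illustration of the idea using jinja2 directly (this is not the actual expand_model implementation, and "epochs" is just a hypothetical model kwarg):

import yaml
from jinja2 import Template

raw_model_config = "type: KerasAutoEncoder\nepochs: {{ epochs }}"
model_parameter = dict([("epochs", 10)])  # CLI-style list of (key, value) tuples -> dict

expanded = Template(raw_model_config).render(**model_parameter)
print(yaml.full_load(expanded))  # {'type': 'KerasAutoEncoder', 'epochs': 10}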
Example #8
def test_get_data_interactive():
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(interactive=True)
    dl_backed = dataset._get_dataset(dataset_config)
    data = dl_backed.get_data()
    assert len(data) >= 0