def _get_default_dataset_config():
    # Default dataset config shared by the data lake tests; individual tests
    # override fields such as 'data_provider' and 'resolution' as needed.
    from_ts = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    to_ts = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    return {
        "type": "TimeSeriesDataset",
        "from_ts": from_ts,
        "to_ts": to_ts,
        "tag_list": normalize_sensor_tags(
            ["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]
        ),
        "data_provider": DataLakeProvider(),
    }
def build(output_dir, model_config, data_config, metadata, model_register_dir):
    """
    Build a model and deposit it into 'output_dir' given the appropriate
    config settings.

    \b
    Parameters
    ----------
    output_dir: str
        Directory to save model & metadata to.
    model_config: dict
        kwargs to be used in initializing the model. Should also
        contain kwarg 'type' which references the model to use. i.e. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. i.e. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild.
    """
    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider()

    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_location = provide_saved_model(
        model_config, data_config, metadata, output_dir, model_register_dir
    )
    with open("/tmp/model-location.txt", "w") as f:
        f.write(model_location)
    return 0
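
# Illustrative only (not part of the original module): a minimal sketch of the two
# config dicts this build() expects before it mutates them, assuming a KerasAutoEncoder
# model and a TimeSeriesDataset. The tag names are hypothetical placeholders.
_example_model_config = {"type": "KerasAutoEncoder"}
_example_data_config = {
    "type": "TimeSeriesDataset",
    "tags": ["HYPOTHETICAL-TAG-1", "HYPOTHETICAL-TAG-2"],  # renamed to 'tag_list' by build()
    "train_start_date": "2017-01-01T08:56:00+00:00",  # parsed into 'from_ts'
    "train_end_date": "2017-01-01T10:01:00+00:00",  # parsed into 'to_ts'
}
# build("./output", _example_model_config, _example_data_config, metadata={}, model_register_dir=None)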
def test_ncs_reader_kwargs_contains_remove_status_codes(remove_status_codes):
    # Create a DataLakeProvider with remove_status_codes passed as a kwarg
    data_provider = DataLakeProvider(
        interactive=False, remove_status_codes=remove_status_codes
    )
    # Set the data_provider's client to the AzureDLFileSystemMock since interactive is False.
    data_provider.client = AzureDLFileSystemMock()
    # Get the ncs_reader from data_provider.
    ncs_reader = data_provider._get_sub_dataproviders()[0]

    # Check that the remove_status_codes kwarg has been passed on to the sub-provider
    expected = [] if remove_status_codes == [] else [0]
    assert ncs_reader.remove_status_codes == expected
def test_get_data_serviceauth_fail(caplog):
    from_ts = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    to_ts = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    dataset_config = _get_default_dataset_config()
    dataset_config["from_ts"] = from_ts
    dataset_config["to_ts"] = to_ts
    dataset_config["data_provider"] = DataLakeProvider(
        dl_service_auth_str="TENTANT_UNKNOWN:BOGUS:PASSWORD"
    )
    dl_backed = dataset._get_dataset(dataset_config)
    with pytest.raises(adal.adal_error.AdalError), caplog.at_level(logging.CRITICAL):
        dl_backed.get_data()
def test_faked_DataLakeBackedDataset(self, _mocked_method):
    config = dict(
        from_ts=dateutil.parser.isoparse("2014-07-01T00:10:00+00:00"),
        to_ts=dateutil.parser.isoparse("2015-01-01T00:00:00+00:00"),
        # Tag names are URL-encoded: '%2F' is an encoded '/'
        tag_list=[
            "asgb.19ZT3950%2FY%2FPRIM",
            "asgb.19PST3925%2FDispMeasOut%2FPRIM",
        ],
    )
    provider = DataLakeProvider(storename="dataplatformdlsprod", interactive=True)
    dataset = TimeSeriesDataset(data_provider=provider, **config)

    # Should be able to call get_data without being asked to authenticate in tests
    X, y = dataset.get_data()
def test_get_data_serviceauth_in_config():
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(
        dl_service_auth_str=os.getenv("TEST_SERVICE_AUTH")
    )
    dataset_config["resolution"] = "10T"
    dl_backed = dataset._get_dataset(dataset_config)
    data, _ = dl_backed.get_data()

    assert dataset_config["tag_list"] == list(data.columns.values)

    expected_rows = 7
    assert (
        len(data) == expected_rows
    ), f"Default resolution 10 minutes should give {expected_rows} rows"

    assert (
        not data.isnull().values.any()
    ), "Resulting dataframe should not have any NaNs"
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate
    config settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. i.e. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also
        contain kwarg 'type' which references the dataset to use. i.e. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild.
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """
    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)

    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))

    model_location = provide_saved_model(
        name, model_config, data_config, metadata, output_dir, model_register_dir
    )

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then re-running `provide_saved_model`
    # (leaving the old model lying around).
    if print_cv_scores:
        saved_metadata = load_metadata(model_location)
        all_scores = get_all_score_strings(saved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)

        for score in all_scores:
            print(score)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
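
# Illustrative only (not part of the original module): a rough sketch of the
# model_parameter/expand_model step above, assuming expand_model renders the raw yaml
# string as a jinja2 template. The config string and the 'n_epochs' key are
# hypothetical examples.
import jinja2

_raw_model_config = "type: KerasAutoEncoder\nepochs: {{ n_epochs }}"
_model_parameter = [("n_epochs", 10)]  # mirrors the List[Tuple] form documented above
_expanded = jinja2.Template(_raw_model_config).render(**dict(_model_parameter))
assert yaml.full_load(_expanded) == {"type": "KerasAutoEncoder", "epochs": 10}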
def test_get_data_interactive():
    dataset_config = _get_default_dataset_config()
    dataset_config["data_provider"] = DataLakeProvider(interactive=True)
    dl_backed = dataset._get_dataset(dataset_config)
    data = dl_backed.get_data()
    assert len(data) >= 0