def test_normalize_sensor_tags_not_ok():
    tag_list_as_list_of_strings_nonsense = [
        NON_RESOLVABLE_TAG_NAME1,
        NON_RESOLVABLE_TAG_NAME2,
    ]
    with pytest.raises(SensorTagNormalizationError):
        normalize_sensor_tags(tag_list_as_list_of_strings_nonsense)
def test_load_series_dry_run(dates, ncs_reader):
    valid_tag_list_no_asset = normalize_sensor_tags(["TRC-123", "TRC-321"])
    for frame in ncs_reader.load_series(
        dates[0], dates[1], valid_tag_list_no_asset, dry_run=True
    ):
        assert len(frame) == 0
def dataset_config(mock_file_system, mock_assets_config):
    train_start_date = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    train_end_date = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    return {
        "type": "TimeSeriesDataset",
        "train_start_date": train_start_date,
        "train_end_date": train_end_date,
        "tag_list": normalize_sensor_tags(
            ["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]
        ),
        "data_provider": DataLakeProvider(
            storage=mock_file_system, assets_config=mock_assets_config
        ),
    }
def tags(self) -> typing.List[SensorTag]:
    """
    The input tags for this model

    Returns
    -------
    typing.List[SensorTag]
    """
    return normalize_sensor_tags(
        g.metadata["dataset"]["tag_list"],
        asset=g.metadata["dataset"].get("asset"),
        default_asset=g.metadata["dataset"].get("default_asset"),
    )
def test_with_conflicted_file_types_with_preferable_csv(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=[0],
        lookup_for=["csv"],
        partition_by="year",
    )
    valid_tag_list = normalize_sensor_tags(["TRC-324"])

    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1

    trc_324_series = tags_series[0]
    # The CSV file should contain exactly one row
    assert len(trc_324_series) == 1
def test_parquet_files_lookup(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=[0],
        lookup_for=["yearly_parquet", "csv"],
        partition_by=PartitionBy.YEAR,
    )
    valid_tag_list = normalize_sensor_tags(["TRC-323"])

    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1

    trc_323_series = tags_series[0]
    assert trc_323_series.name == "TRC-323"
    assert trc_323_series.dtype.name == "float64"
    assert len(trc_323_series) == 20
def test_load_series_with_filter_bad_data(dates, remove_status_codes, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        remove_status_codes=remove_status_codes,
        lookup_for=["yearly_parquet", "csv"],
        partition_by=PartitionBy.YEAR,
    )
    valid_tag_list = normalize_sensor_tags(["TRC-322"])

    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    # Checks that the bad data from the files under
    # tests/gordo/data_provider/data/datalake/TRC-322 is filtered out.
    # 20 rows exist; 5 of them have the value 0.
    n_expected = 15 if remove_status_codes != [] else 20
    assert all(len(series) == n_expected for series in series_gen)
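# A minimal sketch of the filtering semantics asserted in the test above, not
# the actual NcsReader implementation; the "Status" column name and DataFrame
# shape are assumptions for illustration only.
import pandas as pd


def _filter_by_status_codes_sketch(
    df: pd.DataFrame, remove_status_codes: list
) -> pd.DataFrame:
    # Drop rows whose status code appears in the removal list;
    # with remove_status_codes=[] every row is kept.
    return df[~df["Status"].isin(remove_status_codes)]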
def test_monthly_parquet(dates, assets_config):
    ncs_reader = NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
    )
    valid_tag_list = normalize_sensor_tags(["TRC-325"])

    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)
    tags_series = [v for v in series_gen]
    assert len(tags_series) == 1

    index = tags_series[0].index
    assert len(index) == 20
    dr1 = pd.date_range(start="2001-05-10T00:00:00+00:00", periods=10, freq="1T")
    dr2 = pd.date_range(start="2001-06-10T00:00:00+00:00", periods=10, freq="1T")
    dr = dr1.append(dr2)
    assert index.equals(dr)
def target_tags(self) -> typing.List[SensorTag]:
    """
    The target tags for this model

    Returns
    -------
    typing.List[SensorTag]
    """
    # TODO refactor this part to have the same tag preparation logic
    # as in TimeSeriesDataset
    orig_target_tag_list = g.metadata["dataset"].get("target_tag_list", [])
    if orig_target_tag_list:
        return normalize_sensor_tags(
            orig_target_tag_list,
            asset=g.metadata["dataset"].get("asset"),
            default_asset=g.metadata["dataset"].get("default_asset"),
        )
    else:
        return self.tags
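# Illustration of the g.metadata["dataset"] shape consumed by the tags and
# target_tags properties above; the keys mirror the lookups in those two
# properties, while the values here are placeholders, not real metadata.
EXAMPLE_DATASET_METADATA = {
    "tag_list": ["TAG-1", "TAG-2"],  # read by `tags`
    "target_tag_list": ["TAG-3"],  # optional; `target_tags` falls back to `tags`
    "asset": "some-asset",  # optional, forwarded to normalize_sensor_tags
    "default_asset": "fallback-asset",  # optional fallback asset
}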
def test_can_handle_tag_unknown_prefix_raise(ncs_reader):
    with pytest.raises(ValueError):
        ncs_reader.can_handle_tag(normalize_sensor_tags(["XYZ-123"])[0])
def test_load_series_invalid_year(start_date, end_date, frame_len, ncs_reader):
    valid_tag_list = normalize_sensor_tags(["TRC-123"])
    frame = next(ncs_reader.load_series(start_date, end_date, valid_tag_list))
    assert len(frame) == frame_len
def test_load_series_known_prefix(dates, ncs_reader):
    valid_tag_list_no_asset = normalize_sensor_tags(["TRC-123", "TRC-321"])
    for frame in ncs_reader.load_series(dates[0], dates[1], valid_tag_list_no_asset):
        assert len(frame) == 20
@pytest.fixture
def ncs_reader(assets_config):
    return NcsReader(
        ADLGen1FileSystem(AzureDLFileSystemMock(), "adl1"),
        assets_config=assets_config,
        lookup_for=["yearly_parquet", "csv"],
        partition_by=PartitionBy.YEAR,
    )


@pytest.fixture
def dates():
    return (
        dateutil.parser.isoparse("2000-01-01T08:56:00+00:00"),
        dateutil.parser.isoparse("2001-09-01T10:01:00+00:00"),
    )


@pytest.mark.parametrize(
    "tag_to_check",
    [normalize_sensor_tags(["TRC-123"])[0], SensorTag("XYZ-123", "1776-TROC")],
)
def test_can_handle_tag_ok(tag_to_check, ncs_reader):
    assert ncs_reader.can_handle_tag(tag_to_check)


@pytest.mark.parametrize(
    "tag_to_check", [SensorTag("TRC-123", None), SensorTag("XYZ-123", "123-XXX")]
)
def test_can_handle_tag_notok(tag_to_check, ncs_reader):
    assert not ncs_reader.can_handle_tag(tag_to_check)
def test_normalize_sensor_tags_ok(
    good_input_tags, asset, default_asset, expected_output_tags
):
    tag_list_as_list_of_sensor_tag = normalize_sensor_tags(
        good_input_tags, asset, default_asset=default_asset
    )
    assert tag_list_as_list_of_sensor_tag == expected_output_tags
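# A hedged sketch of how the asset/default_asset keywords are expected to
# behave, judging from the call sites in this module; the tag and asset names
# below are hypothetical, not values from the fixtures above. An explicit
# `asset` pins every tag to that asset, while `default_asset` applies only to
# tags whose asset cannot be resolved from the tag name itself.
def _normalize_with_default_asset_sketch():
    tags = normalize_sensor_tags(
        ["SOME-TAG-1", "SOME-TAG-2"],  # hypothetical tag names
        default_asset="some-asset",  # hypothetical fallback asset
    )
    assert all(isinstance(tag, SensorTag) for tag in tags)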
def get_machine_log_items(machine: Machine) -> Tuple[List[Metric], List[Param]]:
    """
    Create flat lists of MLflow logging entities from a multilevel dictionary

    For more information, see the mlflow docs:
    https://www.mlflow.org/docs/latest/python_api/mlflow.tracking.html#mlflow.tracking.MlflowClient.log_batch

    Parameters
    ----------
    machine: Machine

    Returns
    -------
    metrics: List[Metric]
        List of MLflow Metric objects to log.
    params: List[Param]
        List of MLflow Param objects to log.
    """
    metrics: List[Metric] = list()
    build_metadata = machine.metadata.build_metadata

    # Project/machine parameters
    keys = ["project_name", "name"]
    params = [Param(attr, getattr(machine, attr)) for attr in keys]

    # Dataset parameters
    dataset_keys = [
        "train_start_date",
        "train_end_date",
        "resolution",
        "row_filter",
        "row_filter_buffer_size",
    ]
    params.extend(Param(k, str(getattr(machine.dataset, k))) for k in dataset_keys)

    # Model parameters
    model_keys = ["model_creation_date", "model_builder_version", "model_offset"]
    params.extend(Param(k, str(getattr(build_metadata.model, k))) for k in model_keys)

    # Parse cross-validation split metadata
    splits = build_metadata.model.cross_validation.splits
    params.extend(Param(k, str(v)) for k, v in splits.items())

    # Parse cross-validation metrics
    tag_list = normalize_sensor_tags(
        machine.dataset.tag_list, asset=machine.dataset.asset
    )
    scores = build_metadata.model.cross_validation.scores
    keys = sorted(list(scores.keys()))
    subkeys = ["mean", "max", "min", "std"]

    n_folds = len(scores[keys[0]]) - len(subkeys)
    for k in keys:
        # Skip per-tag data, produces too many params for MLflow
        if any(t.name in k for t in tag_list):
            continue

        # Summary stats per metric
        for sk in subkeys:
            metrics.append(Metric(f"{k}-{sk}", scores[k][f"fold-{sk}"], epoch_now(), 0))

        # Append a value for each fold with increasing steps
        metrics.extend(
            Metric(k, scores[k][f"fold-{i+1}"], epoch_now(), i) for i in range(n_folds)
        )

    # Parse fit metrics
    try:
        meta_params = build_metadata.model.model_meta["history"]["params"]
    except KeyError:
        logger.debug(
            "Key 'build-metadata.model.history.params' not found in metadata."
        )
    else:
        metrics.extend(
            Metric(k, float(getattr(build_metadata.model, k)), epoch_now(), 0)
            for k in ["model_training_duration_sec"]
        )
        for m in meta_params["metrics"]:
            data = build_metadata.model.model_meta["history"][m]
            metrics.extend(
                Metric(m, float(x), timestamp=epoch_now(), step=i)
                for i, x in enumerate(data)
            )
        params.extend(
            Param(k, str(meta_params[k]))
            for k in (p for p in meta_params if p != "metrics")
        )

    return metrics, params
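# A short usage sketch for get_machine_log_items, following the
# MlflowClient.log_batch API referenced in the docstring above. The `machine`
# and `run_id` arguments are assumed to be supplied by the surrounding
# workflow; they are not defined in this module.
from mlflow.tracking import MlflowClient


def _log_machine_sketch(machine: Machine, run_id: str) -> None:
    metrics, params = get_machine_log_items(machine)
    # log_batch accepts the flat Metric/Param lists produced above
    MlflowClient().log_batch(run_id=run_id, metrics=metrics, params=params)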
def test_normalize_iroc_tags():
    normalized_tags = normalize_sensor_tags(IROC_MANY_ASSETS_TAG_LIST)
    assert normalized_tags == IROC_MANY_ASSETS_SENSOR_TAG_LIST