def _endpoints_from_watchman(self, endpoint: str) -> typing.List[EndpointMetadata]:
    """
    Get a list of endpoints by querying Watchman
    """
    resp = requests.get(endpoint)
    if not resp.ok:
        raise IOError(f"Failed to get endpoints: {resp.content}")
    return [
        EndpointMetadata(
            target_name=data["endpoint-metadata"]["metadata"]["name"],
            healthy=data["healthy"],
            endpoint=f'{self.base_url}{data["endpoint"].rstrip("/")}',
            tag_list=normalize_sensor_tags(
                data["endpoint-metadata"]["metadata"]["dataset"]["tag_list"]
            ),
            target_tag_list=normalize_sensor_tags(
                data["endpoint-metadata"]["metadata"]["dataset"]["target_tag_list"]
            ),
            resolution=data["endpoint-metadata"]["metadata"]["dataset"]["resolution"],
            model_offset=data["endpoint-metadata"]["metadata"]["model"].get(
                "model-offset", 0
            ),
        )
        if data["healthy"]
        else EndpointMetadata(
            target_name=None,
            healthy=data["healthy"],
            endpoint=f'{self.base_url}{data["endpoint"].rstrip("/")}',
            tag_list=None,
            target_tag_list=None,
            resolution=None,
            model_offset=None,
        )
        for data in resp.json()["endpoints"]
    ]
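# Hedged sketch of the Watchman JSON shape that _endpoints_from_watchman expects,
# inferred from the key accesses above. All values are made-up placeholders for
# illustration only.
example_watchman_response = {
    "endpoints": [
        {
            "healthy": True,
            "endpoint": "/gordo/v0/some-project/some-machine/",
            "endpoint-metadata": {
                "metadata": {
                    "name": "some-machine",
                    "dataset": {
                        "tag_list": ["TRC-123"],
                        "target_tag_list": ["TRC-123"],
                        "resolution": "10T",
                    },
                    "model": {"model-offset": 0},
                }
            },
        }
    ]
}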
def __init__(
    self,
    data_provider: GordoBaseDataProvider,
    from_ts: datetime,
    to_ts: datetime,
    tag_list: List[Union[str, Dict, SensorTag]],
    target_tag_list: Optional[List[Union[str, Dict, SensorTag]]] = None,
    resolution: str = "10T",
    row_filter: str = "",
    **_kwargs,
):
    """
    Creates a TimeSeriesDataset backed by a provided dataprovider.

    A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
    aligned, and (optionally) filtered.

    Parameters
    ----------
    data_provider: GordoBaseDataProvider
        A dataprovider which can provide dataframes for tags from from_ts to to_ts
    from_ts: datetime
        Earliest possible point in the dataset (inclusive)
    to_ts: datetime
        Latest possible point in the dataset (exclusive)
    tag_list: List[Union[str, Dict, sensor_tag.SensorTag]]
        List of tags to include in the dataset. The elements can be strings,
        dictionaries or SensorTag namedtuples.
    target_tag_list: Optional[List[Union[str, Dict, sensor_tag.SensorTag]]]
        List of tags to set as the dataset y. These will be treated the same as
        tag_list when fetching and pre-processing (resampling) but will be split
        into the y return from ``.get_data()``
    resolution: str
        The bucket size for grouping all incoming time data (e.g. "10T").
    row_filter: str
        Filter on the rows. Only rows satisfying the filter will be in the dataset.
        See :func:`gordo_components.dataset.filter_rows.pandas_filter_rows` for
        further documentation of the filter format.
    _kwargs
    """
    self.from_ts = from_ts
    self.to_ts = to_ts
    self.tag_list = normalize_sensor_tags(tag_list)
    self.target_tag_list = (
        normalize_sensor_tags(target_tag_list) if target_tag_list else []
    )
    self.resolution = resolution
    self.data_provider = data_provider
    self.row_filter = row_filter

    if not self.from_ts.tzinfo or not self.to_ts.tzinfo:
        raise ValueError(
            f"Timestamps ({self.from_ts}, {self.to_ts}) need to include timezone "
            f"information"
        )
def target_tags(self) -> typing.List[SensorTag]:
    if "target_tag_list" in current_app.metadata["dataset"]:
        return normalize_sensor_tags(
            current_app.metadata["dataset"]["target_tag_list"]
        )
    else:
        return []
def test_load_series_dry_run(dates, ncs_reader):
    valid_tag_list_no_asset = normalize_sensor_tags(["TRC-123", "TRC-321"])
    for frame in ncs_reader.load_series(
        dates[0], dates[1], valid_tag_list_no_asset, dry_run=True
    ):
        assert len(frame) == 0
def test_load_series_known_prefix(dates, ncs_reader):
    valid_tag_list_no_asset = normalize_sensor_tags(["TRC-123", "TRC-321"])

    for frame in ncs_reader.load_series(dates[0], dates[1], valid_tag_list_no_asset):
        assert len(frame) == 20

    for frame in ncs_reader.load_series(dates[0], dates[1], valid_tag_list_no_asset):
        assert len(frame) == 20
def _get_default_dataset_config():
    from_ts = dateutil.parser.isoparse("2017-01-01T08:56:00+00:00")
    to_ts = dateutil.parser.isoparse("2017-01-01T10:01:00+00:00")
    return {
        "type": "TimeSeriesDataset",
        "from_ts": from_ts,
        "to_ts": to_ts,
        "tag_list": normalize_sensor_tags(
            ["TRC-FIQ -39-0706", "GRA-EM-23-0003ARV.PV"]
        ),
        "data_provider": DataLakeProvider(),
    }
def build(output_dir, model_config, data_config, metadata, model_register_dir):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    output_dir: str
        Directory to save model & metadata to.
    model_config: dict
        kwargs to be used in initializing the model. Should also contain kwarg
        'type' which references the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also contain kwarg
        'type' which references the dataset to use. ie. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild
    """
    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider()

    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_location = provide_saved_model(
        model_config, data_config, metadata, output_dir, model_register_dir
    )
    with open("/tmp/model-location.txt", "w") as f:
        f.write(model_location)
    return 0
def test_load_series_with_filter_bad_data(dates, remove_status_codes):
    ncs_reader = NcsReader(
        AzureDLFileSystemMock(), remove_status_codes=remove_status_codes
    )

    valid_tag_list = normalize_sensor_tags(["TRC-322"])
    series_gen = ncs_reader.load_series(dates[0], dates[1], valid_tag_list)

    # Checks that the bad data from the files under
    # tests/gordo_components/data_provider/data/datalake/TRC-322
    # is filtered out. 20 rows exist, 5 of them have the value 0.
    n_expected = 15 if remove_status_codes != [] else 20
    assert all(len(series) == n_expected for series in series_gen)
def test_can_handle_tag_unknow_prefix_raise(ncs_reader):
    with pytest.raises(ValueError):
        ncs_reader.can_handle_tag(normalize_sensor_tags(["XYZ-123"])[0])
def test_normalize_sensor_tags_not_ok():
    with pytest.raises(ValueError):
        tag_list_as_list_of_strings_nonsense = [TAG_NAME1, TAG_NAME2]
        normalize_sensor_tags(tag_list_as_list_of_strings_nonsense)
@pytest.fixture
def ncs_reader():
    return NcsReader(AzureDLFileSystemMock())


@pytest.fixture
def dates():
    return (
        dateutil.parser.isoparse("2000-01-01T08:56:00+00:00"),
        dateutil.parser.isoparse("2001-09-01T10:01:00+00:00"),
    )


@pytest.mark.parametrize(
    "tag_to_check",
    [normalize_sensor_tags(["TRC-123"])[0], SensorTag("XYZ-123", "1776-TROC")],
)
def test_can_handle_tag_ok(tag_to_check, ncs_reader):
    assert ncs_reader.can_handle_tag(tag_to_check)


@pytest.mark.parametrize(
    "tag_to_check", [SensorTag("TRC-123", None), SensorTag("XYZ-123", "123-XXX")]
)
def test_can_handle_tag_notok(tag_to_check, ncs_reader):
    assert not ncs_reader.can_handle_tag(tag_to_check)


def test_can_handle_tag_unknow_prefix_raise(ncs_reader):
    with pytest.raises(ValueError):
        ncs_reader.can_handle_tag(normalize_sensor_tags(["XYZ-123"])[0])
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also contain kwarg
        'type' which references the dataset to use. ie. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """
    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)

    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))

    model_location = provide_saved_model(
        name, model_config, data_config, metadata, output_dir, model_register_dir
    )

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        saved_metadata = load_metadata(model_location)
        all_scores = get_all_score_strings(saved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)

        for score in all_scores:
            print(score)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
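# A hedged illustration of the `model_parameter` injection described in the
# docstring above: `expand_model` fills jinja variables in the raw yaml model
# config with the given key-values before `yaml.full_load` parses it. The config
# content and parameter name below are made up for illustration only.
raw_model_config = "type: KerasAutoEncoder\nepochs: {{ epochs }}"
model_parameter = dict([("epochs", 10)])  # e.g. from `--model-parameter epochs 10`
expanded = expand_model(raw_model_config, model_parameter)
# `expanded` should now contain "epochs: 10" and be ready for yaml.full_load(expanded).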
def __init__(
    self,
    data_provider: GordoBaseDataProvider,
    from_ts: datetime,
    to_ts: datetime,
    tag_list: List[Union[str, Dict, SensorTag]],
    target_tag_list: Optional[List[Union[str, Dict, SensorTag]]] = None,
    resolution: str = "10T",
    row_filter: str = "",
    aggregation_methods: Union[str, List[str], Callable] = "mean",
    **_kwargs,
):
    """
    Creates a TimeSeriesDataset backed by a provided dataprovider.

    A TimeSeriesDataset is a dataset backed by timeseries, but resampled,
    aligned, and (optionally) filtered.

    Parameters
    ----------
    data_provider: GordoBaseDataProvider
        A dataprovider which can provide dataframes for tags from from_ts to to_ts
    from_ts: datetime
        Earliest possible point in the dataset (inclusive)
    to_ts: datetime
        Latest possible point in the dataset (exclusive)
    tag_list: List[Union[str, Dict, sensor_tag.SensorTag]]
        List of tags to include in the dataset. The elements can be strings,
        dictionaries or SensorTag namedtuples.
    target_tag_list: Optional[List[Union[str, Dict, sensor_tag.SensorTag]]]
        List of tags to set as the dataset y. These will be treated the same as
        tag_list when fetching and pre-processing (resampling) but will be split
        into the y return from ``.get_data()``
    resolution: str
        The bucket size for grouping all incoming time data (e.g. "10T").
    row_filter: str
        Filter on the rows. Only rows satisfying the filter will be in the dataset.
        See :func:`gordo_components.dataset.filter_rows.pandas_filter_rows` for
        further documentation of the filter format.
    aggregation_methods
        Aggregation method(s) to use for the resampled buckets. If a single
        resample method is provided then the resulting dataframe will have names
        identical to the names of the series it got in. If several
        aggregation-methods are provided then the resulting dataframe will have a
        multi-level column index, with the series-name as the first level, and the
        aggregation method as the second level. See
        :py:func:`pandas.core.resample.Resampler.aggregate` for more information
        on possible aggregation methods.
    _kwargs
    """
    self.from_ts = from_ts
    self.to_ts = to_ts
    self.tag_list = normalize_sensor_tags(tag_list)
    self.target_tag_list = (
        normalize_sensor_tags(target_tag_list) if target_tag_list else []
    )
    self.resolution = resolution
    self.data_provider = data_provider
    self.row_filter = row_filter
    self.aggregation_methods = aggregation_methods

    if not self.from_ts.tzinfo or not self.to_ts.tzinfo:
        raise ValueError(
            f"Timestamps ({self.from_ts}, {self.to_ts}) need to include timezone "
            f"information"
        )
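# A minimal usage sketch of the constructor above. The provider, tag names and
# aggregation methods are placeholders; any GordoBaseDataProvider implementation
# and any tags it can serve would do. Timestamps must carry timezone information.
dataset = TimeSeriesDataset(
    data_provider=DataLakeProvider(),  # assumed provider, for illustration
    from_ts=dateutil.parser.isoparse("2017-01-01T00:00:00+00:00"),
    to_ts=dateutil.parser.isoparse("2017-01-02T00:00:00+00:00"),
    tag_list=["TRC-123", "TRC-321"],
    resolution="10T",
    aggregation_methods=["mean", "max"],  # multiple methods -> multi-level columns
)
X, y = dataset.get_data()  # y holds the target tags when target_tag_list is given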
def test_normalize_iroc_tags():
    normalized_tags = normalize_sensor_tags(IROC_MANY_ASSETS_TAG_LIST)
    assert normalized_tags == IROC_MANY_ASSETS_SENSOR_TAG_LIST
def test_normalize_sensor_tags_ok(good_input_tags, asset, expected_output_tags):
    tag_list_as_list_of_sensor_tag = normalize_sensor_tags(good_input_tags, asset)
    assert tag_list_as_list_of_sensor_tag == expected_output_tags
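# A short, hedged summary of what the normalize_sensor_tags tests exercise:
# string tags whose prefix identifies the asset (e.g. "TRC-...") can be
# normalized without passing an asset, an explicit asset can be supplied for
# tags the function cannot resolve on its own, and unresolvable input raises
# ValueError. The tag and asset values below are placeholders.
tags_from_prefix = normalize_sensor_tags(["TRC-123"])                 # asset inferred
tags_with_asset = normalize_sensor_tags(["SOME-TAG"], "some-asset")   # asset supplied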
@pytest.fixture
def ncs_reader():
    return NcsReader(AzureDLFileSystemMock())


@pytest.fixture
def dates():
    return (
        dateutil.parser.isoparse("2000-01-01T08:56:00+00:00"),
        dateutil.parser.isoparse("2001-09-01T10:01:00+00:00"),
    )


@pytest.mark.parametrize(
    "tag_to_check",
    [normalize_sensor_tags(["TRC-123"])[0], SensorTag("XYZ-123", "1776-TROC")],
)
def test_can_handle_tag_ok(tag_to_check, ncs_reader):
    assert ncs_reader.can_handle_tag(tag_to_check)


@pytest.mark.parametrize(
    "tag_to_check", [SensorTag("TRC-123", None), SensorTag("XYZ-123", "123-XXX")]
)
def test_can_handle_tag_notok(tag_to_check, ncs_reader):
    assert not ncs_reader.can_handle_tag(tag_to_check)


def test_can_handle_tag_unknow_prefix_raise(ncs_reader):
    with pytest.raises(ValueError):
        ncs_reader.can_handle_tag(normalize_sensor_tags(["XYZ-123"])[0])
def get(self):
    context = dict()  # type: typing.Dict[str, typing.Any]
    context["status-code"] = 200
    start_time = timeit.default_timer()

    params = request.get_json() or request.args

    if not all(k in params for k in ("start", "end")):
        return (
            {
                "error": "must provide iso8601 formatted dates with "
                "timezone-information for parameters 'start' and 'end'"
            },
            400,
        )

    try:
        start = self._parse_iso_datetime(params["start"])
        end = self._parse_iso_datetime(params["end"])
    except ValueError:
        logger.error(
            f"Failed to parse start and/or end date to ISO: start: "
            f"{params['start']} - end: {params['end']}"
        )
        return (
            {
                "error": "Could not parse start/end date(s) into ISO datetime. "
                "must provide iso8601 formatted dates for both."
            },
            400,
        )

    # Restrict the requested time span to less than one day
    if (end - start).days:
        return {"error": "Need to request a time span less than 24 hours."}, 400

    freq = pd.tseries.frequencies.to_offset(
        current_app.metadata["dataset"]["resolution"]
    )

    dataset = TimeSeriesDataset(
        data_provider=g.data_provider,
        from_ts=start - freq.delta,
        to_ts=end,
        resolution=current_app.metadata["dataset"]["resolution"],
        tag_list=sensor_tag.normalize_sensor_tags(
            current_app.metadata["dataset"]["tag_list"]
        ),
    )
    X, _y = dataset.get_data()

    # Want resampled buckets equal or greater than start, but less than end
    # b/c if end == 00:00:00 and req = 10 mins, a resampled bucket starting
    # at 00:00:00 would imply it has data until 00:10:00; which is past
    # the requested end datetime
    X = X[(X.index > start - freq.delta) & (X.index + freq.delta < end)]

    try:
        xhat = self.get_predictions(X).tolist()

    # Model may only be a transformer, probably an AttributeError, but catch all
    # to avoid logging other exceptions twice if it happens.
    except Exception as exc:
        logger.critical(f"Failed to predict or transform; error: {exc}")
        return (
            {"error": "Something unexpected happened; check your input data"},
            400,
        )

    # In GET requests we need to pair the resulting predictions with their
    # specific timestamp and additionally match the predictions to the
    # corresponding tags.
    data = []

    # This tags list is just for display/informative purposes, skipping the asset
    tags = [tag["name"] for tag in current_app.metadata["dataset"]["tag_list"]]

    for prediction, time_stamp in zip(xhat, X.index[-len(xhat):]):

        # Auto encoders return double their input.
        # First half is input to model, second half is output of model
        tag_inputs = np.array(prediction[:len(tags)])
        tag_outputs = np.array(prediction[len(tags):])
        tag_errors = np.abs(tag_inputs - tag_outputs)

        data.append(
            {
                "start": f"{time_stamp}",
                "end": f"{time_stamp + freq}",
                "tags": {tag: error for tag, error in zip(tags, tag_errors)},
                "total_anomaly": np.linalg.norm(tag_inputs - tag_outputs),
            }
        )

    context["output"] = data
    context["time-seconds"] = f"{timeit.default_timer() - start_time:.4f}"
    return context, context["status-code"]
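# Hedged sketch of the response body this view builds, inferred from the
# `context` and `data.append(...)` calls above. Tag names, timestamps and error
# values are made-up placeholders.
example_response = {
    "status-code": 200,
    "output": [
        {
            "start": "2019-01-01 00:00:00+00:00",
            "end": "2019-01-01 00:10:00+00:00",
            "tags": {"TRC-123": 0.02},  # per-tag absolute reconstruction error
            "total_anomaly": 0.15,      # norm of the input-output difference
        }
    ],
    "time-seconds": "0.1234",
}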
def tags(self) -> typing.List[SensorTag]:
    return normalize_sensor_tags(g.metadata["dataset"]["tag_list"])
def build(
    name,
    output_dir,
    model_config,
    data_config,
    data_provider,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    evaluation_config,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also contain kwarg
        'type' which references the dataset to use. ie. InfluxBackedDataset
    data_provider: str
        A quoted data provider configuration in JSON/YAML format.
        Should also contain key 'type' which references the data provider to use.

        Example::

            '{"type": "DataLakeProvider", "storename" : "example_store"}'

    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    evaluation_config: dict
        Dict of parameters which are exposed to build_model.

        - cv_mode: str
            String which enables three different modes, represented as a key value
            in evaluation_config:

            * cross_val_only: Only perform cross validation
            * build_only: Skip cross validation and only build the model
            * full_build: Cross validation and full build of the model, default value

        Example::

            {"cv_mode": "cross_val_only"}
    """
    data_config["tag_list"] = data_config.pop("tags")

    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = data_provider

    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    # Normalize target tag list if present
    if "target_tag_list" in data_config:
        target_tag_list = normalize_sensor_tags(data_config["target_tag_list"], asset)
        data_config["target_tag_list"] = target_tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))
    logger.debug(f"Fully expanded model config: {model_config}")

    if evaluation_config["cv_mode"] == "cross_val_only":
        cache_model_location = None
        if model_register_dir is not None:
            cache_key = calculate_model_key(
                name, model_config, data_config, evaluation_config, metadata=metadata
            )
            cache_model_location = check_cache(model_register_dir, cache_key)
        if cache_model_location:
            metadata = load_metadata(cache_model_location)
        else:
            _, metadata = build_model(
                name, model_config, data_config, metadata, evaluation_config
            )
    else:
        model_location = provide_saved_model(
            name,
            model_config,
            data_config,
            metadata,
            output_dir,
            model_register_dir,
            evaluation_config=evaluation_config,
        )
        metadata = load_metadata(model_location)

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        retrieved_metadata = metadata
        all_scores = get_all_score_strings(retrieved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
                evaluation_config=evaluation_config,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)

        for score in all_scores:
            print(score)

    return 0