def __init__(
    self,
    destination_influx_uri: Optional[str] = None,
    destination_influx_api_key: Optional[str] = None,
    destination_influx_recreate: bool = False,
):
    """
    Create an instance which, when called, is a coroutine capable of being
    sent dataframes generated from the '/anomaly/prediction' endpoint

    Parameters
    ----------
    destination_influx_uri: str
        Connection string for destination influx -
        format: <username>:<password>@<host>:<port>/<optional-path>/<db_name>
    destination_influx_api_key: str
        API key if needed for destination db
    destination_influx_recreate: bool
        Drop the database before filling it with data?
    """
    # Create df client if provided
    self.dataframe_client = (
        influx_client_from_uri(
            destination_influx_uri,
            api_key=destination_influx_api_key,
            recreate=destination_influx_recreate,
            dataframe_client=True,
        )
        if destination_influx_uri
        else None
    )
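For reference, a destination connection string following the documented format might look like the following sketch (all values hypothetical):

# Hypothetical values for <username>:<password>@<host>:<port>/<optional-path>/<db_name>
destination_influx_uri = "gordo:s3cr3t@influxdb:8086/gordo-db"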
def __init__(
    self,
    measurement: str,
    value_name: str = "Value",
    api_key: str = None,
    api_key_header: str = None,
    client: DataFrameClient = None,
    uri: str = None,
    **kwargs,
):
    """
    Parameters
    ----------
    measurement: str
        Name of the measurement to select from in Influx
    value_name: str
        Name of value to select, defaults to 'Value'
    api_key: str
        Api key to use in header
    api_key_header: str
        Key of header to insert the api key for requests
    uri: str
        Create a client from a URI
        format: <username>:<password>@<host>:<port>/<optional-path>/<db_name>
    kwargs: dict
        These are passed directly to the init args of influxdb.DataFrameClient
    """
    super().__init__(**kwargs)
    self.measurement = measurement
    self.value_name = value_name
    self.influx_client = client

    if kwargs.pop("threads", None):
        logger.warning(
            "InfluxDataProvider got parameter 'threads' which is not supported; "
            "it will be ignored."
        )

    if self.influx_client is None:
        if uri:
            # Import here to avoid a circular import error: importing
            # TimeSeriesDataset imports this provider, which would have
            # imported Client via traversal of the __init__, which would
            # then try to import TimeSeriesDataset again.
            from gordo_components.client.utils import influx_client_from_uri

            self.influx_client = influx_client_from_uri(  # type: ignore
                uri,
                api_key=api_key,
                api_key_header=api_key_header,
                dataframe_client=True,
            )
        else:
            self.influx_client = DataFrameClient(**kwargs)

    if api_key is not None:
        if not api_key_header:
            raise ValueError(
                "If supplying an api key, you must supply the header key to insert it under."
            )
        self.influx_client._headers[api_key_header] = api_key
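As a usage sketch, the provider can be constructed either from an existing DataFrameClient (as the tests below do) or directly from a URI. All connection values here are hypothetical, and the import path is assumed from the `providers` module referenced elsewhere in these tests:

# Minimal sketch, assuming the provider lives in gordo_components.data_provider.providers
from gordo_components.data_provider.providers import InfluxDataProvider

provider = InfluxDataProvider(
    measurement="sensors",
    value_name="Value",
    uri="user:pass@localhost:8086/testdb",       # hypothetical credentials/host/db
    api_key="some-api-key",                      # hypothetical; requires api_key_header
    api_key_header="Ocp-Apim-Subscription-Key",  # hypothetical header name
)
tags = provider.get_list_of_tags()  # as exercised in the tests below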
def test_client_cli_predict(
    influxdb, watchman_service, forwarder_args, output_dir, data_provider
):
    """
    Test ability for client to get predictions via CLI
    """
    runner = CliRunner()
    args = [
        "client",
        "--metadata",
        "key,value",
        "--project",
        tu.GORDO_PROJECT,
        "predict",
        "2016-01-01T00:00:00Z",
        "2016-01-01T01:00:00Z",
    ]

    influx_client = client_utils.influx_client_from_uri(
        uri=tu.INFLUXDB_URI, dataframe_client=True
    )
    query = """
    SELECT *
    FROM "resampled"
    """

    # Do we have forwarder args?
    if forwarder_args is not None:
        args.extend(forwarder_args)

    vals = influx_client.query(query)
    # There is no data there before we start doing things
    assert len(vals) == 0

    # Should it write out the predictions to dataframes in an output directory?
    if output_dir is not None:
        args.extend(["--output-dir", output_dir.name])

    # Do we have a data provider? POST, else GET requests
    if data_provider is not None:
        args.extend(["--data-provider", json.dumps(data_provider.to_dict())])

    # Run without any error
    out = runner.invoke(cli.gordo, args=args)
    assert out.exit_code == 0, f"{out.output}"

    # If we activated the forwarder and had any actual data, there should
    # be resampled values in the influx
    if forwarder_args and data_provider:
        vals = influx_client.query(query)
        assert len(vals) == 1
        assert len(vals["resampled"]) == 28
        influx_client.drop_measurement("resampled")

    # Did it save dataframes to the output dir if specified?
    if output_dir is not None:
        assert os.path.exists(
            os.path.join(output_dir.name, f"{tu.GORDO_SINGLE_TARGET}.csv.gz")
        )
async def test_influx_forwarder(influxdb):
    """
    Test that the forwarder creates correct points from a multi-indexed series
    """
    endpoint = EndpointMetadata(
        "some-target-name",
        healthy=True,
        endpoint="/some-endpoint",
        tag_list=tu.SENSORTAG_LIST,
        target_tag_list=tu.SENSORTAG_LIST,
        resolution="10T",
        model_offset=0,
    )

    # Feature outputs which match the length of the tags;
    # these should then be re-mapped to the sensor tag names
    keys = [("name1", i) for i, _ in enumerate(tu.SENSORTAG_LIST)]

    # Feature outputs which don't match the length of the tags;
    # these will be kept at 0..N as field names
    keys.extend([("name2", i) for i in range(len(tu.SENSORTAG_LIST) * 2)])

    # Assign all keys unique numbers
    columns = pd.MultiIndex.from_tuples(keys)
    index = pd.date_range("2019-01-01", "2019-01-02", periods=4)
    df = pd.DataFrame(columns=columns, index=index)

    # Generate some unique values for each key, and insert them into that column
    for i, key in enumerate(keys):
        df[key] = range(i, i + 4)

    # Create the forwarder and forward the 'predictions' to influx
    forwarder = ForwardPredictionsIntoInflux(destination_influx_uri=tu.INFLUXDB_URI)
    await forwarder.forward_predictions(predictions=df, endpoint=endpoint)

    # Client to manually verify the points written
    client = influx_client_from_uri(tu.INFLUXDB_URI, dataframe_client=True)

    name1_results = client.query("SELECT * FROM name1")["name1"]

    # Should have the tag names as column names since the shape matched
    assert all(c in name1_results.columns for c in ["machine"] + tu.SENSORS_STR_LIST)
    for i, tag in enumerate(tu.SENSORS_STR_LIST):
        assert np.allclose(df[("name1", i)].values, name1_results[tag].values)

    # Now check the other top-level name, "name2", is a measurement with the
    # correct points written
    name2_results = client.query("SELECT * FROM name2")["name2"]

    # Should not have the same names as tags, since the shape was 2x as long;
    # should just have numeric columns
    assert all(
        str(c) in name2_results.columns
        for c in ["machine"] + list(range(len(tu.SENSORTAG_LIST) * 2))
    )
    for key in filter(lambda k: k[0] == "name2", keys):
        assert np.allclose(df[key].values, name2_results[str(key[1])].values)
def __init__(
    self,
    destination_influx_uri: Optional[str] = None,
    destination_influx_api_key: Optional[str] = None,
    destination_influx_recreate: bool = False,
):
    """
    Create an instance which, when called, is a coroutine capable of being sent
    autoencoder prediction dataframes, which it will forward to influx.

    By autoencoder prediction dataframes, we mean the columns are prefixed with
    'output_' and 'input_' followed by the tag/sensor name, and the dataframe
    has a DatetimeIndex.

    Parameters
    ----------
    destination_influx_uri: str
        Connection string for destination influx -
        format: <username>:<password>@<host>:<port>/<optional-path>/<db_name>
    destination_influx_api_key: str
        API key if needed for destination db
    destination_influx_recreate: bool
        Drop the database before filling it with data?
    """
    # Create clients if provided
    self.destination_client = (
        influx_client_from_uri(
            destination_influx_uri,
            api_key=destination_influx_api_key,
            recreate=destination_influx_recreate,
        )
        if destination_influx_uri
        else None
    )
    self.dataframe_client = (
        influx_client_from_uri(
            destination_influx_uri,
            api_key=destination_influx_api_key,
            recreate=destination_influx_recreate,
            dataframe_client=True,
        )
        if destination_influx_uri
        else None
    )
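A minimal usage sketch of the forwarder, mirroring test_influx_forwarder above; the connection values are hypothetical, and `predictions_df` / `endpoint` stand in for a real autoencoder prediction dataframe and its EndpointMetadata:

import asyncio

forwarder = ForwardPredictionsIntoInflux(
    destination_influx_uri="user:pass@localhost:8086/destination-db",  # hypothetical
    destination_influx_recreate=False,
)
# forward_predictions is a coroutine, so it must be awaited or driven by a loop;
# predictions_df and endpoint are assumed to exist as described above.
asyncio.run(forwarder.forward_predictions(predictions=predictions_df, endpoint=endpoint))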
def test_get_list_of_tags(influxdb):
    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=tu.INFLUXDB_URI, dataframe_client=True),
    )
    expected_tags = set(tu.SENSORS_STR_LIST)

    tags = set(ds.get_list_of_tags())
    assert expected_tags == tags

    # The cache does not screw stuff up
    tags = set(ds.get_list_of_tags())
    assert expected_tags == tags
def test__list_of_tags_from_influx_validate_tag_names(influxdb):
    """
    Test that the expected tags match the ones actually in influx.
    """
    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=tu.INFLUXDB_URI, dataframe_client=True),
    )
    list_of_tags = ds._list_of_tags_from_influx()
    expected_tags = tu.SENSORS_STR_LIST
    tags = set(list_of_tags)
    assert set(expected_tags) == tags, (
        f"Expected tags = {expected_tags}, got {tags}"
    )
def test_read_single_sensor_empty_data_invalid_tag_name_valueerror(influxdb):
    """
    Asserts that a ValueError is raised when the given tag name is invalid
    """
    from_ts = dateutil.parser.isoparse("2016-01-01T09:11:00+00:00")
    to_ts = dateutil.parser.isoparse("2016-01-01T10:30:00+00:00")
    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=tu.INFLUXDB_URI, dataframe_client=True),
    )
    with pytest.raises(ValueError):
        ds.read_single_sensor(
            from_ts=from_ts,
            to_ts=to_ts,
            tag="tag-does-not-exist",
            measurement="sensors",
        )
def test_read_single_sensor_empty_data_time_range_indexerror(influxdb, caplog):
    """
    Asserts that an IndexError is raised because the dates requested
    are outside the existing time period
    """
    from_ts = dateutil.parser.isoparse("2017-01-01T09:11:00+00:00")
    to_ts = dateutil.parser.isoparse("2017-01-01T10:30:00+00:00")
    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=tu.INFLUXDB_URI, dataframe_client=True),
    )
    with caplog.at_level(logging.CRITICAL):
        with pytest.raises(IndexError):
            ds.read_single_sensor(
                from_ts=from_ts,
                to_ts=to_ts,
                tag=tu.SENSORS_STR_LIST[0],
                measurement="sensors",
            )
def test_influx_dataset_attrs(influxdb):
    """
    Test expected attributes
    """
    from_ts = dateutil.parser.isoparse("2016-01-01T09:11:00+00:00")
    to_ts = dateutil.parser.isoparse("2016-01-01T10:30:00+00:00")
    tag_list = tu.SENSORTAG_LIST
    config = {
        "type": "TimeSeriesDataset",
        "from_ts": from_ts,
        "to_ts": to_ts,
        "tag_list": tag_list,
    }
    config["data_provider"] = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=tu.INFLUXDB_URI, dataframe_client=True),
    )
    dataset = _get_dataset(config)
    assert hasattr(dataset, "get_metadata")

    metadata = dataset.get_metadata()
    assert isinstance(metadata, dict)
def test_client_predictions_diff_batch_sizes_and_toggle_data_provider(
    influxdb, watchman_service, use_data_provider: bool, batch_size: int
):
    """
    Run the prediction client with different batch sizes, with and without
    a data provider.
    """
    # Time range used in this test
    start, end = (
        isoparse("2016-01-01T00:00:00+00:00"),
        isoparse("2016-01-01T12:00:00+00:00"),
    )

    # Client only used within this test
    test_client = client_utils.influx_client_from_uri(tu.INFLUXDB_URI)

    # Measurements created by the prediction client with the destination influx
    query = f"""
    SELECT *
    FROM "model-output"
    WHERE("machine" =~ /^{tu.GORDO_SINGLE_TARGET}$/)
    """

    # Before predicting, the influx destination db should be empty for the
    # 'model-output' measurement
    vals = test_client.query(query)
    assert len(vals) == 0

    data_provider = (
        providers.InfluxDataProvider(
            measurement=tu.INFLUXDB_MEASUREMENT,
            value_name="Value",
            client=client_utils.influx_client_from_uri(
                uri=tu.INFLUXDB_URI, dataframe_client=True
            ),
        )
        if use_data_provider
        else None
    )

    prediction_client = Client(
        project=tu.GORDO_PROJECT,
        data_provider=data_provider,
        prediction_forwarder=ForwardPredictionsIntoInflux(
            destination_influx_uri=tu.INFLUXDB_URI
        ),
        batch_size=batch_size,
    )

    # Should have discovered machine-1
    assert len(prediction_client.endpoints) == 1

    # All endpoints should be healthy
    assert all(ep.healthy for ep in prediction_client.endpoints)

    # Get predictions
    predictions = prediction_client.predict(start=start, end=end)
    assert isinstance(predictions, list)
    assert len(predictions) == 1

    name, predictions, error_messages = predictions[0]  # First prediction result
    assert isinstance(name, str)
    assert isinstance(predictions, pd.DataFrame)
    assert isinstance(error_messages, list)
    assert isinstance(predictions.index, pd.core.indexes.datetimes.DatetimeIndex)

    # This should have resulted in writing predictions to influx
    vals = test_client.query(query)
    assert (
        len(vals) > 0
    ), f"Expected new values in 'model-output' measurement, but found {vals}"