def __init__(
    self,
    destination_influx_uri: Optional[str] = None,
    destination_influx_api_key: Optional[str] = None,
    destination_influx_recreate: bool = False,
    n_retries=5,
):
    """
    Configure the optional connection to the destination influx.

    Per the original documentation, instances are meant to be sent dataframes
    generated from '/anomaly/prediction'. This initializer only stores the
    retry setting and, if a URI is given, builds the dataframe client.

    Parameters
    ----------
    destination_influx_uri: str
        Connection string for destination influx -
        format: <username>:<password>@<host>:<port>/<optional-path>/<db_name>
        When empty or None no client is created and ``dataframe_client``
        is set to None.
    destination_influx_api_key: str
        API key if needed for destination db
    destination_influx_recreate: bool
        Drop the database before filling it with data?
    n_retries: int
        Stored as ``self.n_retries``; presumably the number of write
        attempts against influx - confirm against the methods using it.
    """
    self.n_retries = n_retries

    if destination_influx_uri:
        # A destination was supplied: build a dataframe-capable client for it.
        self.dataframe_client = influx_client_from_uri(
            destination_influx_uri,
            api_key=destination_influx_api_key,
            recreate=destination_influx_recreate,
            dataframe_client=True,
        )
    else:
        # No destination configured, so there is nothing to connect to.
        self.dataframe_client = None
def __init__(
    self,
    measurement: str,
    value_name: str = "Value",
    api_key: str = None,
    api_key_header: str = None,
    client: DataFrameClient = None,
    uri: str = None,
    **kwargs,
):
    """
    Store the query settings and resolve which influx client to use.

    The client is taken from ``client`` if given, otherwise built from
    ``uri`` if given, otherwise constructed as ``DataFrameClient(**kwargs)``.

    Parameters
    ----------
    measurement: str
        Name of the measurement to select from in Influx
    value_name: str
        Name of value to select, default to 'Value'
    api_key: str
        Api key to use in header
    api_key_header: str
        Key of header to insert the api key for requests
    client: DataFrameClient
        Ready-made client; when supplied, ``uri`` and ``kwargs`` are not
        used to build one.
    uri: str
        Create a client from a URI
        format: <username>:<password>@<host>:<port>/<optional-path>/<db_name>
    kwargs: dict
        These are passed directly to the init args of influxdb.DataFrameClient.
        A 'threads' entry is always removed (with a warning if truthy) and
        a 'type' entry is removed before the client is constructed.

    Raises
    ------
    ValueError
        If the client is constructed from ``kwargs`` and ``api_key`` is
        given without ``api_key_header``.
    """
    self.measurement = measurement
    self.value_name = value_name
    self.influx_client = client

    # 'threads' is not supported: always strip it, warn only when it was set.
    threads = kwargs.pop("threads", None)
    if threads:
        logger.warning(
            "InfluxDataProvider got parameter 'threads' which is not supported, it "
            "will be ignored.")

    if self.influx_client is None and uri:
        # Import here to avoid any circular import error caused by
        # importing TimeSeriesDataset, which imports this provider
        # which would have imported Client via traversal of the __init__
        # which would then try to import TimeSeriesDataset again.
        from gordo.client.utils import influx_client_from_uri

        self.influx_client = influx_client_from_uri(  # type: ignore
            uri,
            api_key=api_key,
            api_key_header=api_key_header,
            dataframe_client=True,
        )
    elif self.influx_client is None:
        # 'type' is not a DataFrameClient argument; drop it if present.
        kwargs.pop("type", None)
        self.influx_client = DataFrameClient(**kwargs)

        # A directly constructed client gets the api key header set by hand.
        if api_key is not None:
            if not api_key_header:
                raise ValueError(
                    "If supplying an api key, you must supply the header key to insert it under."
                )
            self.influx_client._headers[api_key_header] = api_key
def test_get_list_of_tags(influxdb, influxdb_uri, sensors_str):
    """
    The provider lists exactly the expected sensor tags, and a repeated
    call gives the same answer.
    """
    provider = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )
    wanted = set(sensors_str)

    # Ask twice: the repeat call (expected to hit the provider's cache)
    # must not change the result.
    for _ in range(2):
        listed = set(provider.get_list_of_tags())
        assert wanted == listed
def test__list_of_tags_from_influx_validate_tag_names(influxdb, influxdb_uri, sensors_str):
    """
    Test expected tags in influx match the ones actually in influx.

    Compares the tags returned by ``_list_of_tags_from_influx`` with the
    ``sensors_str`` fixture as sets, so ordering and duplicates are ignored.
    """
    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )
    list_of_tags = ds._list_of_tags_from_influx()
    expected_tags = sensors_str
    tags = set(list_of_tags)

    # The two parts of the failure message used to be adjacent f-strings with
    # no separator, which rendered as "...]outputted {...". Use one string.
    assert set(expected_tags) == tags, (
        f"Expected tags = {expected_tags}, outputted {tags}"
    )
def test_read_single_sensor_empty_data_invalid_tag_name_valueerror(
        influxdb, influxdb_uri):
    """
    Reading a single sensor with a tag name that does not exist must raise
    a ValueError.
    """
    period = {
        "train_start_date": dateutil.parser.isoparse("2016-01-01T09:11:00+00:00"),
        "train_end_date": dateutil.parser.isoparse("2016-01-01T10:30:00+00:00"),
    }
    provider = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )

    with pytest.raises(ValueError):
        provider.read_single_sensor(
            tag="tag-does-not-exist",
            measurement="sensors",
            **period,
        )
def test_influx_send_data(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Sensor data sent through the forwarder can be read back from the
    'resampled' measurement with matching values per sensor.
    """
    df = get_test_data(sensors_str)

    # Push the sensor data into influx via the forwarder.
    ForwardPredictionsIntoInflux(
        destination_influx_uri=influxdb_uri).send_sensor_data(df)

    # Read the points back with an independent client.
    reader = influx_client_from_uri(influxdb_uri, dataframe_client=True)
    stored = reader.query("SELECT * FROM resampled")["resampled"]

    # Both the name and the value column must be present.
    for column in ("sensor_name", "sensor_value"):
        assert column in stored.columns

    # Per sensor, the stored values must match what was sent.
    for key in sensors_str:
        is_this_sensor = stored["sensor_name"] == key
        assert np.allclose(
            df[key].values, stored[is_this_sensor]["sensor_value"].values
        )
def test_read_single_sensor_empty_data_time_range_indexerror(
        influxdb, influxdb_uri, sensors_str, caplog):
    """
    Reading an existing sensor for a period outside the stored data must
    raise an IndexError.

    The call runs with the captured log level set to CRITICAL.
    """
    period = {
        "train_start_date": dateutil.parser.isoparse("2017-01-01T09:11:00+00:00"),
        "train_end_date": dateutil.parser.isoparse("2017-01-01T10:30:00+00:00"),
    }
    provider = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )

    with caplog.at_level(logging.CRITICAL), pytest.raises(IndexError):
        provider.read_single_sensor(
            tag=sensors_str[0],
            measurement="sensors",
            **period,
        )
def test_influx_dataset_attrs(influxdb, influxdb_uri, sensors):
    """
    A dataset configured with an InfluxDataProvider exposes ``get_metadata``
    and that method returns a dict.
    """
    period_start = dateutil.parser.isoparse("2016-01-01T09:11:00+00:00")
    period_end = dateutil.parser.isoparse("2016-01-01T10:30:00+00:00")

    provider = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )
    config = {
        "type": "TimeSeriesDataset",
        "train_start_date": period_start,
        "train_end_date": period_end,
        "tag_list": sensors,
        "data_provider": provider,
    }

    dataset = _get_dataset(config)

    assert hasattr(dataset, "get_metadata")
    assert isinstance(dataset.get_metadata(), dict)
def test_influx_forwarder(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Test that the forwarder creates correct points from a multi-indexed
    series.

    Each top-level column name ("name1", "name2") is queried back as its own
    measurement and must carry 'machine', 'sensor_name' and 'sensor_value'
    columns with the values that were forwarded.
    """
    machine_config = {
        "name": "some-target-name",
        "dataset": {
            "tags": sensors_str,
            "target_tag_list": sensors_str,
            "train_start_date": "2016-01-01T00:00:00Z",
            "train_end_date": "2016-01-05T00:00:00Z",
            "resolution": "10T",
        },
        "model": "sklearn.linear_model.LinearRegression",
    }
    with patch.object(sensor_tag, "_asset_from_tag_name", return_value="default"):
        machine = Machine.from_config(
            config=machine_config, project_name="test-project")

    n_sensors = len(sensors)
    # As many "name1" features as tags: these get re-mapped to tag names.
    input_keys = [("name1", position) for position in range(n_sensors)]
    # Twice as many "name2" features as tags: these keep 0..N as names.
    output_keys = [("name2", position) for position in range(n_sensors * 2)]

    df = get_test_data(pd.MultiIndex.from_tuples(input_keys + output_keys))

    # Forward the 'predictions' to influx.
    forwarder = ForwardPredictionsIntoInflux(
        destination_influx_uri=influxdb_uri)
    forwarder.forward_predictions(predictions=df, machine=machine)

    # Independent client used to verify the written points.
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)
    required_columns = ("machine", "sensor_name", "sensor_value")

    name1_results = client.query("SELECT * FROM name1")["name1"]
    for column in required_columns:
        assert column in name1_results.columns

    # Inputs: rows are labelled with the sensor tag names.
    for position, tag in enumerate(sensors_str):
        rows = name1_results[name1_results["sensor_name"] == tag]
        assert np.allclose(df[("name1", position)].values,
                           rows["sensor_value"].values)

    # All top levels are stacked into the same column layout.
    name2_results = client.query("SELECT * FROM name2")["name2"]
    for column in required_columns:
        assert column in name2_results.columns

    # Outputs: influx sensor names are the string-cast integer positions.
    for key in output_keys:
        rows = name2_results[name2_results["sensor_name"] == str(key[1])]
        assert np.allclose(df[key].values, rows["sensor_value"].values)
def test_client_cli_predict(
    influxdb,
    influxdb_uri,
    gordo_project,
    gordo_single_target,
    ml_server,
    tmpdir,
    use_forwarder,
    trained_model_directory,
    output_dir,
    use_parquet,
    session_config,
):
    """
    Test ability for client to get predictions via CLI

    Builds the ``client ... predict`` command line from the parametrized
    options, invokes it, and checks the exit code plus the optional side
    effects (points in influx, files in the output directory).
    """
    runner = CliRunner()

    cli_args = ["client", "--metadata", "key,value", "--project", gordo_project]
    if session_config:
        cli_args += ["--session-config", json.dumps(session_config)]

    parquet_flag = "--parquet" if use_parquet else "--no-parquet"
    cli_args += [
        "predict",
        parquet_flag,
        "2016-01-01T00:00:00Z",
        "2016-01-01T01:00:00Z",
    ]

    influx_client = client_utils.influx_client_from_uri(uri=influxdb_uri,
                                                        dataframe_client=True)
    query = """ SELECT * FROM "resampled" """

    if use_forwarder:
        cli_args += ["--influx-uri", influxdb_uri, "--forward-resampled-sensors"]
        # Nothing may be stored before the command has run.
        before = influx_client.query(query)
        assert len(before) == 0

    if output_dir:
        # Have the predictions written as dataframes into a directory.
        cli_args += ["--output-dir", str(tmpdir)]

    # Always supply a data provider on the command line.
    provider_json = json.dumps(providers.RandomDataProvider().to_dict())
    cli_args += ["--data-provider", provider_json]

    # The command must run without any error.
    with patch(
            "gordo_dataset.sensor_tag._asset_from_tag_name",
            side_effect=lambda *_args, **_kwargs: "default",
    ):
        out = runner.invoke(cli.gordo, args=cli_args)
    assert out.exit_code == 0, f"{out.output}"

    if use_forwarder:
        # Forwarding was on, so resampled values must now be in influx.
        after = influx_client.query(query)
        assert len(after) == 1
        assert len(after["resampled"]) == 48
        influx_client.drop_measurement("resampled")

    if output_dir:
        # The dataframe for the target must have been saved.
        expected_file = os.path.join(tmpdir, f"{gordo_single_target}.csv.gz")
        assert os.path.exists(expected_file)
def test_client_predictions_diff_batch_sizes(
    gordo_project,
    gordo_single_target,
    influxdb,
    influxdb_uri,
    influxdb_measurement,
    ml_server,
    batch_size: int,
    use_parquet: bool,
):
    """
    Run the prediction client with different batch sizes and parquet
    settings, checking the shape of the returned predictions and that
    predictions were forwarded into influx.
    """
    # Time range used in this test
    start = isoparse("2016-01-01T00:00:00+00:00")
    end = isoparse("2016-01-01T12:00:00+00:00")

    # Client used only inside this test to inspect the destination db.
    test_client = client_utils.influx_client_from_uri(influxdb_uri)

    # Selects what the prediction client writes for this target.
    query = f""" SELECT * FROM "model-output" WHERE("machine" =~ /^{gordo_single_target}$/) """

    # Before predicting, nothing may be stored for this target.
    stored_before = test_client.query(query)
    assert len(stored_before) == 0

    source_client = client_utils.influx_client_from_uri(uri=influxdb_uri,
                                                        dataframe_client=True)
    data_provider = providers.InfluxDataProvider(
        measurement=influxdb_measurement,
        value_name="Value",
        client=source_client,
    )
    forwarder = ForwardPredictionsIntoInflux(  # type: ignore
        destination_influx_uri=influxdb_uri)

    prediction_client = Client(
        project=gordo_project,
        data_provider=data_provider,
        prediction_forwarder=forwarder,
        batch_size=batch_size,
        use_parquet=use_parquet,
        parallelism=10,
    )
    assert len(prediction_client.get_machine_names()) == 2

    # Get predictions
    results = prediction_client.predict(start=start, end=end)
    assert isinstance(results, list)
    assert len(results) == 2

    # Inspect the first (name, dataframe, errors) entry.
    name, frame, error_messages = results[0]
    assert isinstance(name, str)
    assert isinstance(frame, pd.DataFrame)
    assert isinstance(error_messages, list)
    assert isinstance(frame.index, pd.core.indexes.datetimes.DatetimeIndex)

    # Predicting should have written predictions into influx.
    vals = test_client.query(query)
    assert (
        len(vals) > 0
    ), f"Expected new values in 'predictions' measurement, but found {vals}"
def test_influx_forwarder(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Test that the forwarder creates correct points from a multi-indexed
    series.

    "name1" has as many features as tags and is expected back with the tag
    names as columns; "name2" has twice as many and is expected back with
    numeric column names.
    """
    machine_config = {
        "name": "some-target-name",
        "dataset": {
            "tags": sensors_str,
            "target_tag_list": sensors_str,
            "train_start_date": "2016-01-01T00:00:00Z",
            "train_end_date": "2016-01-05T00:00:00Z",
            "resolution": "10T",
        },
        "model": "sklearn.linear_model.LinearRegression",
    }
    with patch.object(sensor_tag, "_asset_from_tag_name", return_value="default"):
        machine = Machine.from_config(
            config=machine_config, project_name="test-project")

    n_sensors = len(sensors)
    # Feature outs matching the number of tags, followed by feature outs
    # that do not (twice as many), which keep 0..N as field names.
    name1_keys = [("name1", position) for position in range(n_sensors)]
    name2_keys = [("name2", position) for position in range(n_sensors * 2)]
    keys = name1_keys + name2_keys

    df = pd.DataFrame(
        columns=pd.MultiIndex.from_tuples(keys),
        index=pd.date_range("2019-01-01", "2019-01-02", periods=4),
    )
    # Give every column its own distinct run of four values.
    for offset, key in enumerate(keys):
        df[key] = range(offset, offset + 4)

    # Forward the 'predictions' to influx.
    forwarder = ForwardPredictionsIntoInflux(
        destination_influx_uri=influxdb_uri)
    forwarder.forward_predictions(predictions=df, machine=machine)

    # Independent client used to verify the written points.
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)

    # "name1": shape matched the tags, so columns carry the tag names.
    name1_results = client.query("SELECT * FROM name1")["name1"]
    for column in ["machine"] + sensors_str:
        assert column in name1_results.columns
    for position, tag in enumerate(sensors_str):
        assert np.allclose(df[("name1", position)].values,
                           name1_results[tag].values)

    # "name2": shape was 2x the tags, so columns are just the numbers.
    name2_results = client.query("SELECT * FROM name2")["name2"]
    for column in ["machine"] + list(range(n_sensors * 2)):
        assert str(column) in name2_results.columns
    for key in keys:
        if key[0] != "name2":
            continue
        assert np.allclose(df[key].values, name2_results[str(key[1])].values)