Code example #1
    def __init__(
        self,
        destination_influx_uri: Optional[str] = None,
        destination_influx_api_key: Optional[str] = None,
        destination_influx_recreate: bool = False,
        n_retries: int = 5,
    ):
        """
        Create an instance which, when called, is a coroutine capable of
        being sent dataframes generated from the '/anomaly/prediction'
        endpoint of a machine.

        Parameters
        ----------
        destination_influx_uri: Optional[str]
            Connection string for the destination influx -
            format: <username>:<password>@<host>:<port>/<optional-path>/<db_name>
        destination_influx_api_key: Optional[str]
            API key, if needed, for the destination db
        destination_influx_recreate: bool
            Drop the database before filling it with data?
        n_retries: int
            Number of times to retry failed writes to influx before giving up
        """
        # Create df client if provided
        self.n_retries = n_retries
        self.dataframe_client = (
            influx_client_from_uri(
                destination_influx_uri,
                api_key=destination_influx_api_key,
                recreate=destination_influx_recreate,
                dataframe_client=True,
            )
            if destination_influx_uri
            else None
        )
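
For orientation, here is a minimal usage sketch of the constructor above. The URI and credentials are hypothetical stand-ins following the documented <username>:<password>@<host>:<port>/<db_name> format.

# Hypothetical sketch: build a forwarder from a connection string
# ("user:pass@localhost:8086/testdb" is an invented example URI).
forwarder = ForwardPredictionsIntoInflux(
    destination_influx_uri="user:pass@localhost:8086/testdb",
    destination_influx_recreate=True,  # drop and recreate the db before writing
)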
Code example #2
    def __init__(
        self,
        measurement: str,
        value_name: str = "Value",
        api_key: Optional[str] = None,
        api_key_header: Optional[str] = None,
        client: Optional[DataFrameClient] = None,
        uri: Optional[str] = None,
        **kwargs,
    ):
        """
        Parameters
        ----------
        measurement: str
            Name of the measurement to select from in Influx
        value_name: str
            Name of the value to select, defaults to 'Value'
        api_key: str
            API key to use in the request header
        api_key_header: str
            Header key under which to insert the API key for requests
        client: influxdb.DataFrameClient
            Pre-configured client to use directly; when given, uri, api_key
            and extra kwargs are ignored
        uri: str
            Create a client from a URI
            format: <username>:<password>@<host>:<port>/<optional-path>/<db_name>
        kwargs: dict
            These are passed directly to the init args of influxdb.DataFrameClient
        """
        self.measurement = measurement
        self.value_name = value_name
        self.influx_client = client
        if kwargs.pop("threads", None):
            logger.warning(
                "InfluxDataProvider got parameter 'threads' which is not supported, it "
                "will be ignored.")

        if self.influx_client is None:
            if uri:

                # Import here to avoid any circular import error caused by
                # importing TimeSeriesDataset, which imports this provider
                # which would have imported Client via traversal of the __init__
                # which would then try to import TimeSeriesDataset again.
                from gordo.client.utils import influx_client_from_uri

                self.influx_client = influx_client_from_uri(  # type: ignore
                    uri,
                    api_key=api_key,
                    api_key_header=api_key_header,
                    dataframe_client=True,
                )
            else:
                if "type" in kwargs:
                    kwargs.pop("type")
                self.influx_client = DataFrameClient(**kwargs)
                if api_key is not None:
                    if not api_key_header:
                        raise ValueError(
                            "If supplying an api key, you must supply the header key to insert it under."
                        )
                    self.influx_client._headers[api_key_header] = api_key
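
The branching above admits three ways to supply a client; a sketch with hypothetical connection values:

# Three construction paths handled by __init__ above (host, port,
# database and URI values are invented examples).

# 1. Pass a pre-built DataFrameClient directly
provider = InfluxDataProvider(measurement="sensors", client=DataFrameClient(host="localhost"))

# 2. Let the provider build a client from a URI string
provider = InfluxDataProvider(measurement="sensors", uri="user:pass@localhost:8086/testdb")

# 3. Forward kwargs straight to influxdb.DataFrameClient
provider = InfluxDataProvider(measurement="sensors", host="localhost", port=8086, database="testdb")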
Code example #3
def test_get_list_of_tags(influxdb, influxdb_uri, sensors_str):
    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )
    expected_tags = set(sensors_str)

    tags = set(ds.get_list_of_tags())
    assert expected_tags == tags

    # A second call should hit the cache and still return the same tags
    tags = set(ds.get_list_of_tags())
    assert expected_tags == tags
Code example #4
def test__list_of_tags_from_influx_validate_tag_names(influxdb, influxdb_uri,
                                                      sensors_str):
    """
    Test expected tags in influx match the ones actually in influx.
    """
    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )
    list_of_tags = ds._list_of_tags_from_influx()
    expected_tags = sensors_str
    tags = set(list_of_tags)
    assert set(expected_tags) == tags, (
        f"Expected tags = {expected_tags}, got {tags}"
    )
Code example #5
def test_read_single_sensor_empty_data_invalid_tag_name_valueerror(
        influxdb, influxdb_uri):
    """
    Asserts that a ValueError is raised when the supplied tag name is invalid
    """
    train_start_date = dateutil.parser.isoparse("2016-01-01T09:11:00+00:00")
    train_end_date = dateutil.parser.isoparse("2016-01-01T10:30:00+00:00")

    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )
    with pytest.raises(ValueError):
        ds.read_single_sensor(
            train_start_date=train_start_date,
            train_end_date=train_end_date,
            tag="tag-does-not-exist",
            measurement="sensors",
        )
Code example #6
File: test_forwarders.py  Project: JunFugithub/gordo
def test_influx_send_data(influxdb, influxdb_uri, sensors, sensors_str):
    """
    """
    df = get_test_data(sensors_str)

    # Create the forwarder and forward the sensor data to influx.
    forwarder = ForwardPredictionsIntoInflux(destination_influx_uri=influxdb_uri)
    forwarder.send_sensor_data(df)

    # Client to manually verify the points written
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)
    resampled_results = client.query("SELECT * FROM resampled")["resampled"]

    # Should have column names: 'sensor_name', 'sensor_value'
    assert all(c in resampled_results.columns for c in ["sensor_name", "sensor_value"])

    # Check that values returned from InfluxDB match what was put in as inputs
    for key in sensors_str:
        results_mask = resampled_results["sensor_name"] == key
        assert np.allclose(
            df[key].values, resampled_results[results_mask]["sensor_value"].values
        )
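
These tests call a get_test_data helper that is not shown on this page; the following is a hypothetical stand-in consistent with how it is used above (a time-indexed frame with one column per key):

import numpy as np
import pandas as pd

# Hypothetical stand-in for get_test_data: arbitrary values, one column
# per key, indexed by time at the 10-minute resolution used in the tests.
def get_test_data(columns):
    index = pd.date_range("2016-01-01", periods=10, freq="10T")
    return pd.DataFrame(
        np.random.random((len(index), len(columns))), index=index, columns=columns
    )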
Code example #7
def test_read_single_sensor_empty_data_time_range_indexerror(
        influxdb, influxdb_uri, sensors_str, caplog):
    """
    Asserts that an IndexError is raised because the dates requested are outside the existing time period
    """
    train_start_date = dateutil.parser.isoparse("2017-01-01T09:11:00+00:00")
    train_end_date = dateutil.parser.isoparse("2017-01-01T10:30:00+00:00")

    ds = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )

    with caplog.at_level(logging.CRITICAL):
        with pytest.raises(IndexError):
            ds.read_single_sensor(
                train_start_date=train_start_date,
                train_end_date=train_end_date,
                tag=sensors_str[0],
                measurement="sensors",
            )
Code example #8
def test_influx_dataset_attrs(influxdb, influxdb_uri, sensors):
    """
    Test expected attributes
    """
    train_start_date = dateutil.parser.isoparse("2016-01-01T09:11:00+00:00")
    train_end_date = dateutil.parser.isoparse("2016-01-01T10:30:00+00:00")
    tag_list = sensors
    config = {
        "type": "TimeSeriesDataset",
        "train_start_date": train_start_date,
        "train_end_date": train_end_date,
        "tag_list": tag_list,
    }
    config["data_provider"] = InfluxDataProvider(
        measurement="sensors",
        value_name="Value",
        client=influx_client_from_uri(uri=influxdb_uri, dataframe_client=True),
    )
    dataset = _get_dataset(config)
    assert hasattr(dataset, "get_metadata")

    metadata = dataset.get_metadata()
    assert isinstance(metadata, dict)
Code example #9
File: test_forwarders.py  Project: koropets/gordo
def test_influx_forwarder(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Test that the forwarder creates correct points from a
    multi-indexed series
    """
    with patch.object(sensor_tag,
                      "_asset_from_tag_name",
                      return_value="default"):
        machine = Machine.from_config(
            config={
                "name": "some-target-name",
                "dataset": {
                    "tags": sensors_str,
                    "target_tag_list": sensors_str,
                    "train_start_date": "2016-01-01T00:00:00Z",
                    "train_end_date": "2016-01-05T00:00:00Z",
                    "resolution": "10T",
                },
                "model": "sklearn.linear_model.LinearRegression",
            },
            project_name="test-project",
        )

    # Feature outputs whose count matches the number of tags;
    # these should be re-mapped to the sensor tag names
    input_keys = [("name1", i) for i, _ in enumerate(sensors)]

    # Feature outputs which don't match the number of tags;
    # these will keep 0..N as field names
    output_keys = [("name2", i) for i in range(len(sensors) * 2)]

    # Build test data with a column of unique values per key
    df = get_test_data(pd.MultiIndex.from_tuples(input_keys + output_keys))

    # Create the forwarder and forward the 'predictions' to influx.
    forwarder = ForwardPredictionsIntoInflux(
        destination_influx_uri=influxdb_uri)
    forwarder.forward_predictions(predictions=df, machine=machine)

    # Client to manually verify the points written
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)

    name1_results = client.query("SELECT * FROM name1")["name1"]

    # Should have column names: 'machine', 'sensor_name', 'sensor_value'
    assert all(c in name1_results.columns
               for c in ["machine", "sensor_name", "sensor_value"])

    # Check that values returned from InfluxDB match what was put in as inputs
    for i, tag in enumerate(sensors_str):
        results_mask = name1_results["sensor_name"] == tag
        assert np.allclose(df[("name1", i)].values,
                           name1_results[results_mask]["sensor_value"].values)

    # Now check the other top level name "name2" is a measurement with the correct points written
    name2_results = client.query("SELECT * FROM name2")["name2"]

    # Should have the same columns as name1, since all top levels get
    # stacked into the same resulting schema
    assert all([
        c in name2_results.columns
        for c in ["machine", "sensor_name", "sensor_value"]
    ])

    # Check that values returned from InfluxDB match what was put in as outputs
    # Note that here the influx sensor names for the output tags are string-cast integers
    for key in output_keys:
        results_mask = name2_results["sensor_name"] == str(key[1])
        assert np.allclose(df[key].values,
                           name2_results[results_mask]["sensor_value"].values)
Code example #10
def test_client_cli_predict(
    influxdb,
    influxdb_uri,
    gordo_project,
    gordo_single_target,
    ml_server,
    tmpdir,
    use_forwarder,
    trained_model_directory,
    output_dir,
    use_parquet,
    session_config,
):
    """
    Test ability for client to get predictions via CLI
    """
    runner = CliRunner()

    args = ["client", "--metadata", "key,value", "--project", gordo_project]
    if session_config:
        args.extend(["--session-config", json.dumps(session_config)])

    args.extend([
        "predict",
        "--parquet" if use_parquet else "--no-parquet",
        "2016-01-01T00:00:00Z",
        "2016-01-01T01:00:00Z",
    ])

    influx_client = client_utils.influx_client_from_uri(uri=influxdb_uri,
                                                        dataframe_client=True)
    query = """
        SELECT *
        FROM "resampled"
        """

    # Do we have forwarder args?
    if use_forwarder:
        args.extend(
            ["--influx-uri", influxdb_uri, "--forward-resampled-sensors"])
        vals = influx_client.query(query)
        # There should be no data before the run starts
        assert len(vals) == 0

    # Should it write out the predictions to dataframes in an output directory?
    if output_dir:
        args.extend(["--output-dir", str(tmpdir)])

    # With a data provider the client POSTs the data; otherwise it issues GETs
    args.extend([
        "--data-provider",
        json.dumps(providers.RandomDataProvider().to_dict())
    ])

    # Run without any error
    with patch(
            "gordo_dataset.sensor_tag._asset_from_tag_name",
            side_effect=lambda *args, **kwargs: "default",
    ):
        out = runner.invoke(cli.gordo, args=args)
    assert out.exit_code == 0, f"{out.output}"

    # If the forwarder was activated and there was actual data, there should
    # now be resampled values in influx
    if use_forwarder:
        vals = influx_client.query(query)
        assert len(vals) == 1
        assert len(vals["resampled"]) == 48
        influx_client.drop_measurement("resampled")

    # Did it save dataframes to output dir if specified?
    if output_dir:
        assert os.path.exists(
            os.path.join(tmpdir, f"{gordo_single_target}.csv.gz"))
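
For readability, the fully assembled args list with every optional branch above enabled looks roughly like this (placeholder values kept as-is):

# Placeholder sketch of the full args list the test can end up passing to
# runner.invoke (all optional branches shown enabled):
args = [
    "client", "--metadata", "key,value", "--project", "<project>",
    "predict", "--parquet",
    "2016-01-01T00:00:00Z", "2016-01-01T01:00:00Z",
    "--influx-uri", "<influx-uri>", "--forward-resampled-sensors",
    "--output-dir", "<dir>",
    "--data-provider", "<provider-json>",
]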
Code example #11
def test_client_predictions_diff_batch_sizes(
    gordo_project,
    gordo_single_target,
    influxdb,
    influxdb_uri,
    influxdb_measurement,
    ml_server,
    batch_size: int,
    use_parquet: bool,
):
    """
    Run the prediction client with different batch-sizes and whether to use
    a data provider or not.
    """
    # Time range used in this test
    start, end = (
        isoparse("2016-01-01T00:00:00+00:00"),
        isoparse("2016-01-01T12:00:00+00:00"),
    )

    # Client used only within this test
    test_client = client_utils.influx_client_from_uri(influxdb_uri)

    # Measurement created by the prediction client in the destination influx
    query = f"""
    SELECT *
    FROM "model-output"
    WHERE("machine" =~ /^{gordo_single_target}$/)
    """

    # Before predicting, the destination influx db should be empty for the 'model-output' measurement
    vals = test_client.query(query)
    assert len(vals) == 0

    data_provider = providers.InfluxDataProvider(
        measurement=influxdb_measurement,
        value_name="Value",
        client=client_utils.influx_client_from_uri(uri=influxdb_uri,
                                                   dataframe_client=True),
    )

    prediction_client = Client(
        project=gordo_project,
        data_provider=data_provider,
        prediction_forwarder=ForwardPredictionsIntoInflux(  # type: ignore
            destination_influx_uri=influxdb_uri),
        batch_size=batch_size,
        use_parquet=use_parquet,
        parallelism=10,
    )

    assert len(prediction_client.get_machine_names()) == 2

    # Get predictions
    predictions = prediction_client.predict(start=start, end=end)
    assert isinstance(predictions, list)
    assert len(predictions) == 2

    # First prediction result: a (name, dataframe, error_messages) tuple
    name, predictions, error_messages = predictions[0]
    assert isinstance(name, str)
    assert isinstance(predictions, pd.DataFrame)
    assert isinstance(error_messages, list)

    assert isinstance(predictions.index,
                      pd.core.indexes.datetimes.DatetimeIndex)

    # This should have resulted in writing predictions to influx; the
    # previously empty measurement should now contain values
    vals = test_client.query(query)
    assert (
        len(vals) > 0
    ), f"Expected new values in 'model-output' measurement, but found {vals}"
Code example #12
def test_influx_forwarder(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Test that the forwarder creates correct points from a
    multi-indexed series
    """
    with patch.object(sensor_tag,
                      "_asset_from_tag_name",
                      return_value="default"):
        machine = Machine.from_config(
            config={
                "name": "some-target-name",
                "dataset": {
                    "tags": sensors_str,
                    "target_tag_list": sensors_str,
                    "train_start_date": "2016-01-01T00:00:00Z",
                    "train_end_date": "2016-01-05T00:00:00Z",
                    "resolution": "10T",
                },
                "model": "sklearn.linear_model.LinearRegression",
            },
            project_name="test-project",
        )

    # Feature outs which match length of tags
    # These should then be re-mapped to the sensor tag names
    keys = [("name1", i) for i, _ in enumerate(sensors)]

    # Feature outs which don't match the length of the tags
    # These will be kept at 0..N as field names
    keys.extend([("name2", i) for i in range(len(sensors) * 2)])

    # Build an empty time-indexed frame with a MultiIndex column per key
    columns = pd.MultiIndex.from_tuples(keys)
    index = pd.date_range("2019-01-01", "2019-01-02", periods=4)
    df = pd.DataFrame(columns=columns, index=index)

    # Generate some unique values for each key, and insert it into that column
    for i, key in enumerate(keys):
        df[key] = range(i, i + 4)

    # Create the forwarder and forward the 'predictions' to influx.
    forwarder = ForwardPredictionsIntoInflux(
        destination_influx_uri=influxdb_uri)
    forwarder.forward_predictions(predictions=df, machine=machine)

    # Client to manually verify the points written
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)

    name1_results = client.query("SELECT * FROM name1")["name1"]

    # Should have the tag names as column names since the shape matched
    assert all(c in name1_results.columns for c in ["machine"] + sensors_str)
    for i, tag in enumerate(sensors_str):
        assert np.allclose(df[("name1", i)].values, name1_results[tag].values)

    # Now check the other top level name "name2" is a measurement with the correct points written
    name2_results = client.query("SELECT * FROM name2")["name2"]

    # Should not have the tag names, since the shape was 2x as long;
    # columns should just be numeric
    assert all([
        str(c) in name2_results.columns
        for c in ["machine"] + list(range(len(sensors) * 2))
    ])
    for key in filter(lambda k: k[0] == "name2", keys):
        assert np.allclose(df[key].values, name2_results[str(key[1])].values)