Example #1
def test_invalid_model(default_globals: dict):
    """
    Test invalid model with 'step' instead of 'steps'
    """
    element_str = """
        name: ct-23-0001-machine
        data_provider:
          threads: 10
        dataset:
          tags: [GRA-TE  -23-0733.PV, GRA-TT  -23-0719.PV, GRA-YE  -23-0751X.PV]
          target_tag_list: [GRA-TE -123-456]
          train_start_date: 2018-01-01T09:00:30Z
          train_end_date: 2018-01-02T09:00:30Z
        model:
          sklearn.pipeline.Pipeline:
            step:
              - sklearn.preprocessing.data.MinMaxScaler
              - gordo.machine.model.models.KerasAutoEncoder:
                  kind: feedforward_hourglass
        evaluation:
            scoring_scaler: Null
        metadata:
          id: special-id
    """
    element = get_dict_from_yaml(StringIO(element_str))
    with pytest.raises(ValueError):
        Machine.from_config(element,
                            project_name="test-project-name",
                            config_globals=default_globals)
Example #2
def _machine(name: str) -> Machine:
    """
    Helper to build a basic Machine, only defining its name
    """
    from gordo_dataset.sensor_tag import SensorTag

    return Machine.from_config(
        config={
            "name": name,
            "dataset": {
                "tag_list": [SensorTag("tag-1", "foo"), SensorTag("tag-2", "foo")],
                "train_start_date": "2016-01-01T00:00:00Z",
                "train_end_date": "2016-01-05T00:00:00Z",
            },
            "model": {"sklearn.linear_model.LinearRegression": {}},
        },
        project_name="test-project",
    )
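
A quick usage sketch for the helper above (assuming it is called from a test in the same module):

machine = _machine("test-machine")
assert machine.name == "test-machine"
# project_name comes from the hard-coded from_config argument above
assert machine.project_name == "test-project"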
Example #3
    def __init__(self, config: dict, project_name: str):
        default_globals = self.DEFAULT_CONFIG_GLOBALS
        default_globals["runtime"]["influx"][  # type: ignore
            "resources"] = _calculate_influx_resources(  # type: ignore
                len(config["machines"]))

        passed_globals = config.get("globals", dict())
        patched_globals = patch_dict(default_globals, passed_globals)
        if patched_globals.get("runtime"):
            patched_globals["runtime"] = fix_runtime(
                patched_globals.get("runtime"))
        self.project_name = project_name
        self.machines = [
            Machine.from_config(conf,
                                project_name=project_name,
                                config_globals=patched_globals)
            for conf in config["machines"]
        ]  # type: List[Machine]

        self.globals = patched_globals
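
The merge step above relies on gordo's patch_dict. A minimal sketch of the idea, assuming a recursive merge in which keys from the patch override the defaults (illustrative only, not gordo's actual implementation):

def patch_dict_sketch(base: dict, patch: dict) -> dict:
    # Recursively merge `patch` into a copy of `base`: nested dicts are
    # merged key by key, any other value from `patch` wins outright.
    merged = dict(base)
    for key, value in patch.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = patch_dict_sketch(merged[key], value)
        else:
            merged[key] = value
    return merged

defaults = {"runtime": {"influx": {"enable": True}, "builder": {}}}
passed = {"runtime": {"influx": {"enable": False}}}
print(patch_dict_sketch(defaults, passed))
# {'runtime': {'influx': {'enable': False}, 'builder': {}}}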
Example #4
    def __init__(
        self,
        config: dict,
        project_name: str,
        gordo_version: Optional[str] = None,
        model_builder_env: Optional[dict] = None,
    ):
        if gordo_version is None:
            gordo_version = __version__
        default_globals = self.get_default_globals(gordo_version)
        default_globals["runtime"]["influx"][  # type: ignore
            "resources"
        ] = _calculate_influx_resources(  # type: ignore
            len(config["machines"])
        )

        passed_globals = config.get("globals", dict())

        # kept for backwards compatibility
        if model_builder_env is not None and not (
            passed_globals
            and "runtime" in passed_globals
            and "builder" in passed_globals["runtime"]
            and "env" in passed_globals["runtime"]["builder"]
        ):
            if "builder" not in default_globals["runtime"]:
                default_globals["runtime"]["builder"] = {}
            default_globals["runtime"]["builder"]["env"] = model_builder_env

        patched_globals = patch_dict(default_globals, passed_globals)
        patched_globals = self.prepare_patched_globals(patched_globals)

        self.project_name = project_name
        self.machines: List[Machine] = [
            Machine.from_config(
                conf, project_name=project_name, config_globals=patched_globals
            )
            for conf in config["machines"]
        ]

        self.globals: dict = patched_globals
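
The backwards-compatibility guard above applies model_builder_env only when the caller has not already set runtime.builder.env in the passed globals. A small illustration of that condition, using hypothetical globals:

passed_globals = {"runtime": {"builder": {"env": {"SOME_VAR": "1"}}}}
already_set = bool(
    passed_globals
    and "runtime" in passed_globals
    and "builder" in passed_globals["runtime"]
    and "env" in passed_globals["runtime"]["builder"]
)
print(already_set)  # True -> the model_builder_env argument is ignored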
Example #5
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
    exceptions_reporter_file: str,
    exceptions_report_level: str,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    exceptions_reporter_file: str
        JSON output file for exception information
    exceptions_report_level: str
        Details level for exception reporting
    """

    try:
        if model_parameter and isinstance(machine_config["model"], str):
            parameters = dict(model_parameter)  # convert list of tuples to dict
            machine_config["model"] = expand_model(machine_config["model"],
                                                   parameters)

        machine: Machine = Machine.from_config(
            machine_config, project_name=machine_config["project_name"])

        logger.info(f"Building, output will be at: {output_dir}")
        logger.info(f"Register dir: {model_register_dir}")

        # Convert the config into a pipeline, and back into definition to ensure
        # all default parameters are part of the config.
        logger.debug(f"Ensuring the passed model config is fully expanded.")
        machine.model = serializer.into_definition(
            serializer.from_definition(machine.model))
        logger.info(f"Fully expanded model config: {machine.model}")

        builder = ModelBuilder(machine=machine)

        _, machine_out = builder.build(output_dir,
                                       model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        machine_out.report()
        logger.debug("Finished reporting.")

        if "err" in machine.name:
            raise FileNotFoundError("undefined_file.parquet")

        if print_cv_scores:
            for score in get_all_score_strings(machine_out):
                print(score)

    except Exception:
        traceback.print_exc()
        exc_type, exc_value, exc_traceback = sys.exc_info()

        exit_code = _exceptions_reporter.exception_exit_code(exc_type)
        if exceptions_reporter_file:
            _exceptions_reporter.safe_report(
                cast(
                    ReportLevel,
                    ReportLevel.get_by_name(exceptions_report_level,
                                            ReportLevel.EXIT_CODE),
                ),
                exc_type,
                exc_value,
                exc_traceback,
                exceptions_reporter_file,
                max_message_len=2024 - 500,
            )
        sys.exit(exit_code)
    else:
        return 0
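
The model_parameter injection above is jinja-style templating: each key fills the matching placeholder in a string model config. A minimal sketch of the idea using jinja2 directly (not gordo's expand_model itself; the model config shown is hypothetical):

from jinja2 import Template

model_template = "{'sklearn.decomposition.PCA': {'n_components': {{ n_components }}}}"
rendered = Template(model_template).render(n_components=4)
print(rendered)  # {'sklearn.decomposition.PCA': {'n_components': 4}}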
Example #6
def test_influx_forwarder(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Test that the forwarder creates correct points from a
    multi-indexed series
    """
    with patch.object(sensor_tag,
                      "_asset_from_tag_name",
                      return_value="default"):
        machine = Machine.from_config(
            config={
                "name": "some-target-name",
                "dataset": {
                    "tags": sensors_str,
                    "target_tag_list": sensors_str,
                    "train_start_date": "2016-01-01T00:00:00Z",
                    "train_end_date": "2016-01-05T00:00:00Z",
                    "resolution": "10T",
                },
                "model": "sklearn.linear_model.LinearRegression",
            },
            project_name="test-project",
        )

    # Feature outputs which match the length of the tags
    # These should then be re-mapped to the sensor tag names
    input_keys = [("name1", i) for i, _ in enumerate(sensors)]

    # Feature outputs which don't match the length of the tags
    # These will be kept at 0..N as field names
    output_keys = [("name2", i) for i in range(len(sensors) * 2)]

    # Build a test frame where each key's column holds unique values
    df = get_test_data(pd.MultiIndex.from_tuples(input_keys + output_keys))

    # Create the forwarder and forward the 'predictions' to influx.
    forwarder = ForwardPredictionsIntoInflux(
        destination_influx_uri=influxdb_uri)
    forwarder.forward_predictions(predictions=df, machine=machine)

    # Client to manually verify the points written
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)

    name1_results = client.query("SELECT * FROM name1")["name1"]

    # Should have column names: 'machine', 'sensor_name', 'sensor_value'
    assert all(c in name1_results.columns
               for c in ["machine", "sensor_name", "sensor_value"])

    # Check that values returned from InfluxDB match what was put in for inputs
    for i, tag in enumerate(sensors_str):
        results_mask = name1_results["sensor_name"] == tag
        assert np.allclose(df[("name1", i)].values,
                           name1_results[results_mask]["sensor_value"].values)

    # Now check the other top level name "name2" is a measurement with the correct points written
    name2_results = client.query("SELECT * FROM name2")["name2"]

    # Should have the same names as tags, since all top levels get stacked into the same resulting columns
    assert all([
        c in name2_results.columns
        for c in ["machine", "sensor_name", "sensor_value"]
    ])

    # Check that values returned from InfluxDB match what was put in for outputs
    # Note that here the influx sensor names for the output tags are string-cast integers
    for key in output_keys:
        results_mask = name2_results["sensor_name"] == str(key[1])
        assert np.allclose(df[key].values,
                           name2_results[results_mask]["sensor_value"].values)
Example #7
def test_machine_from_config(default_globals: dict):
    """
    Test ability to create a Machine from a config element.
    """

    element_str = """
        name: ct-23-0001-machine
        data_provider:
          threads: 10
        dataset:
          tags: [GRA-TE  -23-0733.PV, GRA-TT  -23-0719.PV, GRA-YE  -23-0751X.PV]
          target_tag_list: [GRA-TE -123-456]
          train_start_date: 2018-01-01T09:00:30Z
          train_end_date: 2018-01-02T09:00:30Z
        model:
          sklearn.pipeline.Pipeline:
            steps:
              - sklearn.preprocessing.data.MinMaxScaler
              - gordo.machine.model.models.KerasAutoEncoder:
                  kind: feedforward_hourglass
        evaluation:
            scoring_scaler: Null
        metadata:
          id: special-id
    """
    element = get_dict_from_yaml(StringIO(element_str))
    machine = Machine.from_config(element,
                                  project_name="test-project-name",
                                  config_globals=default_globals)
    logger.info(f"{machine}")
    assert isinstance(machine, Machine)
    assert len(machine.dataset.tag_list) == 3

    # The metadata of machine should be json serializable
    json.dumps(machine.to_dict()["metadata"])

    # The metadata of machine should be ast.literal_eval-able when cast as a str
    assert (ast.literal_eval(str(
        machine.to_dict()["metadata"])) == machine.to_dict()["metadata"])
    # Expected dictionary representation of the machine:
    expected = {
        "dataset": {
            "aggregation_methods": "mean",
            "asset": "global-asset",
            "data_provider": {
                "dl_service_auth_str": None,
                "interactive": False,
                "storename": "dataplatformdlsprod",
                "type": "DataLakeProvider",
            },
            "default_asset": None,
            "n_samples_threshold": 0,
            "resolution": "10T",
            "row_filter": "",
            "row_filter_buffer_size": 0,
            "tag_list": [
                "GRA-TE  -23-0733.PV",
                "GRA-TT  -23-0719.PV",
                "GRA-YE  -23-0751X.PV",
            ],
            "target_tag_list": ["GRA-TE -123-456"],
            "train_end_date": "2018-01-02T09:00:30+00:00",
            "train_start_date": "2018-01-01T09:00:30+00:00",
            "type": "TimeSeriesDataset",
        },
        "evaluation": {
            "cv_mode": "full_build",
            "metrics": [
                "explained_variance_score",
                "r2_score",
                "mean_squared_error",
                "mean_absolute_error",
            ],
            "scoring_scaler": None,
        },
        "metadata": {
            "build_metadata": {
                "model": {
                    "cross_validation": {
                        "cv_duration_sec": None,
                        "scores": {},
                        "splits": {},
                    },
                    "model_builder_version": __version__,
                    "model_creation_date": None,
                    "model_meta": {},
                    "model_offset": 0,
                    "model_training_duration_sec": None,
                },
                "dataset": {"query_duration_sec": None, "dataset_meta": {}},
            },
            "user_defined": {
                "global-metadata": {},
                "machine-metadata": {"id": "special-id"},
            },
        },
        "model": {
            "sklearn.pipeline.Pipeline": {
                "steps": [
                    "sklearn.preprocessing.data.MinMaxScaler",
                    {
                        "gordo.machine.model.models.KerasAutoEncoder": {
                            "kind": "feedforward_hourglass"
                        }
                    },
                ]
            }
        },
        "name": "ct-23-0001-machine",
        "project_name": "test-project-name",
        "runtime": {
            "reporters": [],
            "server": {
                "resources": {
                    "limits": {"cpu": 4, "memory": 3},
                    "requests": {"cpu": 2, "memory": 1},
                }
            },
        },
    }
    assert machine.to_dict() == expected
Example #8
def test_influx_forwarder(influxdb, influxdb_uri, sensors, sensors_str):
    """
    Test that the forwarder creates correct points from a
    multi-indexed series
    """
    with patch.object(sensor_tag,
                      "_asset_from_tag_name",
                      return_value="default"):
        machine = Machine.from_config(
            config={
                "name": "some-target-name",
                "dataset": {
                    "tags": sensors_str,
                    "target_tag_list": sensors_str,
                    "train_start_date": "2016-01-01T00:00:00Z",
                    "train_end_date": "2016-01-05T00:00:00Z",
                    "resolution": "10T",
                },
                "model": "sklearn.linear_model.LinearRegression",
            },
            project_name="test-project",
        )

    # Feature outputs which match the length of the tags
    # These should then be re-mapped to the sensor tag names
    keys = [("name1", i) for i, _ in enumerate(sensors)]

    # Feature outputs which don't match the length of the tags
    # These will be kept at 0..N as field names
    keys.extend([("name2", i) for i in range(len(sensors) * 2)])

    # Build an empty frame with the keys as a MultiIndex over a short date range
    columns = pd.MultiIndex.from_tuples(keys)
    index = pd.date_range("2019-01-01", "2019-01-02", periods=4)
    df = pd.DataFrame(columns=columns, index=index)

    # Generate some unique values for each key, and insert it into that column
    for i, key in enumerate(keys):
        df[key] = range(i, i + 4)

    # Create the forwarder and forward the 'predictions' to influx.
    forwarder = ForwardPredictionsIntoInflux(
        destination_influx_uri=influxdb_uri)
    forwarder.forward_predictions(predictions=df, machine=machine)

    # Client to manually verify the points written
    client = influx_client_from_uri(influxdb_uri, dataframe_client=True)

    name1_results = client.query("SELECT * FROM name1")["name1"]

    # Should have the tag names as column names since the shape matched
    assert all(c in name1_results.columns for c in ["machine"] + sensors_str)
    for i, tag in enumerate(sensors_str):
        assert np.allclose(df[("name1", i)].values, name1_results[tag].values)

    # Now check the other top level name "name2" is a measurement with the correct points written
    name2_results = client.query("SELECT * FROM name2")["name2"]

    # Should not have the tag names as columns, since the shape was 2x as long; columns should just be numeric
    assert all([
        str(c) in name2_results.columns
        for c in ["machine"] + list(range(len(sensors) * 2))
    ])
    for key in filter(lambda k: k[0] == "name2", keys):
        assert np.allclose(df[key].values, name2_results[str(key[1])].values)
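
Examples #6 and #8 exercise the same remapping rule, applied to the sensor_name field in #6 and to the column names in #8: a sub-frame with exactly as many columns as there are tags gets the tag names, anything else keeps its positional names. A hypothetical helper illustrating that rule (not the forwarder's actual code):

def remap_field_names(columns: list, tag_names: list) -> list:
    # Rename to the tag names only when the shapes line up; otherwise keep
    # the positional 0..N names as string-cast field names.
    if len(columns) == len(tag_names):
        return list(tag_names)
    return [str(c) for c in columns]

print(remap_field_names([0, 1], ["tag-1", "tag-2"]))        # ['tag-1', 'tag-2']
print(remap_field_names([0, 1, 2, 3], ["tag-1", "tag-2"]))  # ['0', '1', '2', '3']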
Example #9
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    """
    if model_parameter and isinstance(machine_config["model"], str):
        parameters = dict(model_parameter)  # convert list of tuples to dict
        machine_config["model"] = expand_model(machine_config["model"],
                                               parameters)

    machine: Machine = Machine.from_config(
        machine_config, project_name=machine_config["project_name"])

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Register dir: {model_register_dir}")

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    machine.model = serializer.into_definition(
        serializer.from_definition(machine.model))
    logger.info(f"Fully expanded model config: {machine.model}")

    builder = ModelBuilder(machine=machine)

    try:
        _, machine_out = builder.build(output_dir,
                                       model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        machine_out.report()
        logger.debug("Finished reporting.")

        if print_cv_scores:
            for score in get_all_score_strings(machine_out):
                print(score)

    except Exception as e:
        exit_code = EXCEPTION_TO_EXITCODE.get(e.__class__, 1)
        traceback.print_exc()
        sys.exit(exit_code)
    else:
        return 0
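
A hedged sketch of the EXCEPTION_TO_EXITCODE lookup used above; the mapping shown is illustrative, not gordo's actual table:

# Illustrative values only; the real table ships with the CLI.
EXCEPTION_TO_EXITCODE = {
    FileNotFoundError: 30,
    PermissionError: 20,
}

try:
    raise FileNotFoundError("undefined_file.parquet")
except Exception as e:
    exit_code = EXCEPTION_TO_EXITCODE.get(e.__class__, 1)
    print(exit_code)  # 30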