Example 1
def test_invalid_model(default_globals: dict):
    """
    Test invalid model with 'step' instead of 'steps'
    """
    element_str = """
        name: ct-23-0001-machine
        data_provider:
          threads: 10
        dataset:
          tags: [GRA-TE  -23-0733.PV, GRA-TT  -23-0719.PV, GRA-YE  -23-0751X.PV]
          target_tag_list: [GRA-TE -123-456]
          train_start_date: 2018-01-01T09:00:30Z
          train_end_date: 2018-01-02T09:00:30Z
        model:
          sklearn.pipeline.Pipeline:
            step:
              - sklearn.preprocessing.data.MinMaxScaler
              - gordo.machine.model.models.KerasAutoEncoder:
                  kind: feedforward_hourglass
        evaluation:
            scoring_scaler: Null
        metadata:
          id: special-id
    """
    element = get_dict_from_yaml(StringIO(element_str))
    with pytest.raises(ValueError):
        Machine.from_config(element,
                            project_name="test-project-name",
                            config_globals=default_globals)
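A minimal variant of the assertion above: pytest.raises(match=...) can pin down why the config was rejected, not just that it was. This assumes (unverified here) that the ValueError message mentions the offending key:

with pytest.raises(ValueError, match="step"):  # assumption: the message names the bad key
    Machine.from_config(
        element, project_name="test-project-name", config_globals=default_globals
    )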
Example 2
def test_basic_generation(path_to_config_files):
    """
    Model must be included in the config file

    start/end dates ...always included? or default to specific dates if not included?
    """

    project_name = "some-fancy-project-name"
    model_config = '{"sklearn.pipeline.Pipeline": {"steps": ["sklearn.preprocessing.data.MinMaxScaler", {"gordo.machine.model.models.KerasAutoEncoder": {"kind": "feedforward_hourglass"}}]}}'

    config_filename = "config-test-with-models.yml"
    expanded_template = _generate_test_workflow_str(
        path_to_config_files, config_filename, project_name=project_name
    )

    assert (
        project_name in expanded_template
    ), f"Expected to find project name: {project_name} in output: {expanded_template}"

    assert (
        model_config in expanded_template
    ), f"Expected to find model config: {model_config} in output: {expanded_template}"

    yaml_content = wg.get_dict_from_yaml(
        os.path.join(path_to_config_files, config_filename)
    )

    with patch.object(sensor_tag, "_asset_from_tag_name", return_value="default"):
        machines = NormalizedConfig(yaml_content, project_name=project_name).machines

    assert len(machines) == 2
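A side note on the mocking pattern above: patch.object swaps out sensor_tag._asset_from_tag_name for the duration of the with-block, so tag-to-asset resolution never touches real metadata during the test. A self-contained sketch (the tag string is illustrative):

from unittest.mock import patch

with patch.object(sensor_tag, "_asset_from_tag_name", return_value="default") as mocked:
    # The patched function returns the canned value for any tag name
    assert sensor_tag._asset_from_tag_name("GRA-TE  -23-0733.PV") == "default"
mocked.assert_called_once()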
Example 3
def test_dataset_from_dict():
    """
    Test ability to create a Dataset from a config element
    """
    element_str = """
        name: ct-23-0002
        dataset:
          resolution: 2T
          tag_list:
            - GRA-YE  -23-0751X.PV
            - GRA-TE  -23-0698.PV
            - GRA-PIT -23-0619B.PV
          train_start_date: 2011-05-20T01:00:04+02:00
          train_end_date: 2018-05-10T15:05:50+02:00
    """
    dataset_config = get_dict_from_yaml(StringIO(element_str))["dataset"]
    dataset = TimeSeriesDataset.from_dict(dataset_config.copy())
    asdict = dataset.to_dict()
    assert asdict["tag_list"] == [
        "GRA-YE  -23-0751X.PV",
        "GRA-TE  -23-0698.PV",
        "GRA-PIT -23-0619B.PV",
    ]
    assert asdict["resolution"] == "2T"
    assert asdict["train_start_date"] == "2011-05-20T01:00:04+02:00"
    assert asdict["train_end_date"] == "2018-05-10T15:05:50+02:00"
Example 4
def local_build(
    config_str: str,
) -> Iterable[Tuple[Union[BaseEstimator, None], Machine]]:
    """
    Build model(s) from a bare Gordo config file locally.

    This follows much the same steps as the normal workflow generation and
    subsequent Gordo deployment process. It should help when developing locally,
    and gives a good indication that your config is valid for deployment
    with Gordo.

    Parameters
    ----------
    config_str: str
        The raw yaml config file in string format.

    Examples
    --------
    >>> import numpy as np
    >>> from gordo.dependencies import configure_once
    >>> configure_once()
    >>> config = '''
    ... machines:
    ...       - dataset:
    ...           tags:
    ...             - SOME-TAG1
    ...             - SOME-TAG2
    ...           target_tag_list:
    ...             - SOME-TAG3
    ...             - SOME-TAG4
    ...           train_end_date: '2019-03-01T00:00:00+00:00'
    ...           train_start_date: '2019-01-01T00:00:00+00:00'
    ...           asset: asgb
    ...           data_provider:
    ...             type: RandomDataProvider
    ...         metadata:
    ...           information: Some sweet information about the model
    ...         model:
    ...           gordo.machine.model.anomaly.diff.DiffBasedAnomalyDetector:
    ...             base_estimator:
    ...               sklearn.pipeline.Pipeline:
    ...                 steps:
    ...                 - sklearn.decomposition.PCA
    ...                 - sklearn.multioutput.MultiOutputRegressor:
    ...                     estimator: sklearn.linear_model.LinearRegression
    ...         name: crazy-sweet-name
    ... '''
    >>> models_n_metadata = local_build(config)
    >>> assert len(list(models_n_metadata)) == 1

    Returns
    -------
    Iterable[Tuple[Union[BaseEstimator, None], Machine]]
        A generator yielding tuples of models and their metadata.
    """

    config = get_dict_from_yaml(io.StringIO(config_str))
    normed = NormalizedConfig(config, project_name="local-build")
    for machine in normed.machines:
        yield ModelBuilder(machine=machine).build()
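Because local_build returns a generator, nothing is built until the iterable is consumed; each model is trained only when its tuple is requested. A minimal usage sketch:

# Consuming the generator one machine at a time; building happens lazily here
for model, machine in local_build(config_str):
    print(machine.name, type(model).__name__)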
Example 5
def unique_tag_list_cli(machine_config: str, output_file_tag_list: str):
    """
    Collect the unique set of dataset tags across all machines in the config,
    writing them to output_file_tag_list if given, otherwise to stdout.
    """

    yaml_content = wg.get_dict_from_yaml(machine_config)

    machines = NormalizedConfig(yaml_content, project_name="test-proj-name").machines

    tag_list = set(tag for machine in machines for tag in machine.dataset.tag_list)

    if output_file_tag_list:
        with open(output_file_tag_list, "w") as output_file:
            for tag in tag_list:
                output_file.write(f"{tag.name}\n")
    else:
        for tag in tag_list:
            print(tag.name)
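One caveat: tag_list is a set, so the output order can differ between runs. A minimal variant that sorts by tag name to make the output deterministic:

for tag in sorted(tag_list, key=lambda tag: tag.name):
    print(tag.name)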
Example 6
def workflow_generator_cli(gordo_ctx, **ctx):
    """
    Machine Configuration to Argo Workflow
    """

    context: Dict[Any, Any] = ctx.copy()
    yaml_content = wg.get_dict_from_yaml(context["machine_config"])

    try:
        log_level = yaml_content["globals"]["runtime"]["log_level"]
    except KeyError:
        log_level = os.getenv("GORDO_LOG_LEVEL", gordo_ctx.obj["log_level"])

    logging.getLogger("gordo").setLevel(log_level.upper())
    context["log_level"] = log_level.upper()

    # Create normalized config
    config = NormalizedConfig(yaml_content, project_name=context["project_name"])

    context["max_server_replicas"] = (
        context.pop("n_servers") or len(config.machines) * 10
    )

    # We know these exist since we set them in the default globals
    builder_resources = config.globals["runtime"]["builder"]["resources"]
    context["model_builder_resources_requests_memory"] = builder_resources["requests"]["memory"]
    context["model_builder_resources_requests_cpu"] = builder_resources["requests"]["cpu"]
    context["model_builder_resources_limits_memory"] = builder_resources["limits"]["memory"]
    context["model_builder_resources_limits_cpu"] = builder_resources["limits"]["cpu"]

    context["server_resources"] = config.globals["runtime"]["server"]["resources"]

    # These are also set in the default globals, and guaranteed to exist
    client_resources = config.globals["runtime"]["client"]["resources"]
    context["client_resources_requests_memory"] = client_resources["requests"]["memory"]
    context["client_resources_requests_cpu"] = client_resources["requests"]["cpu"]
    context["client_resources_limits_memory"] = client_resources["limits"]["memory"]
    context["client_resources_limits_cpu"] = client_resources["limits"]["cpu"]

    context["client_max_instances"] = config.globals["runtime"]["client"]["max_instances"]

    influx_resources = config.globals["runtime"]["influx"]["resources"]
    context["influx_resources_requests_memory"] = influx_resources["requests"]["memory"]
    context["influx_resources_requests_cpu"] = influx_resources["requests"]["cpu"]
    context["influx_resources_limits_memory"] = influx_resources["limits"]["memory"]
    context["influx_resources_limits_cpu"] = influx_resources["limits"]["cpu"]

    nr_of_models_with_clients = len([
        machine for machine in config.machines
        if machine.runtime.get("influx", {}).get("enable", True)
    ])
    context["client_total_instances"] = nr_of_models_with_clients

    # Should we start up influx/grafana at all, i.e. is there at least one
    # request for it?
    enable_influx = nr_of_models_with_clients > 0
    context["enable_influx"] = enable_influx

    context["postgres_host"] = f"gordo-postgres-{config.project_name}"

    # If influx is enabled, we set up a postgres reporter to send metadata,
    # allowing the machine to be queried from grafana
    if enable_influx:
        pg_reporter = {
            "gordo.reporters.postgres.PostgresReporter": {
                "host": context["postgres_host"]
            }
        }
        for machine in config.machines:
            machine.runtime["reporters"].append(pg_reporter)

    # Determine if MlFlowReporter should be enabled per machine
    for machine in config.machines:
        try:
            enabled = machine.runtime["builder"]["remote_logging"]["enable"]
        except KeyError:
            continue
        else:
            if enabled:
                machine.runtime["reporters"].append(
                    "gordo.reporters.mlflow.MlFlowReporter")

    context["machines"] = config.machines

    # Context requiring pre-processing
    context["target_names"] = [machine.name for machine in config.machines]

    # Json dump owner_references, if not None, otherwise pop it out of the context
    if context["owner_references"]:
        context["owner_references"] = json.dumps(context["owner_references"])
    else:
        context.pop("owner_references")

    builder_exceptions_report_level = get_builder_exceptions_report_level(config)
    context["builder_exceptions_report_level"] = builder_exceptions_report_level.name
    if builder_exceptions_report_level != ReportLevel.EXIT_CODE:
        context["builder_exceptions_report_file"] = "/tmp/exception.json"

    if context["workflow_template"]:
        template = wg.load_workflow_template(context["workflow_template"])
    else:
        workflow_template = pkg_resources.resource_filename(
            "gordo.workflow.workflow_generator.resources",
            "argo-workflow.yml.template")
        template = wg.load_workflow_template(workflow_template)

    # Clear output file
    if context["output_file"]:
        open(context["output_file"], "w").close()  # type: ignore
    for i in range(0, len(config.machines), context["split_workflows"]):  # type: ignore
        logger.info(
            f"Generating workflow for machines {i} to {i + context['split_workflows']}"
        )
        context["machines"] = config.machines[i:i + context["split_workflows"]]

        if context["output_file"]:
            s = template.stream(**context)
            with open(context["output_file"], "a") as f:  # type: ignore
                if i != 0:
                    f.write("\n---\n")
                s.dump(f)
        else:
            output = template.render(**context)
            if i != 0:
                print("\n---\n")
            print(output)
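When machines are split across several workflows, the output file holds multiple YAML documents separated by "---". A minimal sketch of reading such a file back with PyYAML (the filename is hypothetical):

import yaml

# safe_load_all iterates over every document in a multi-document YAML stream
with open("workflow.yml") as f:
    workflows = list(yaml.safe_load_all(f))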
Example 7
def test_machine_from_config(default_globals: dict):
    """
    Test ability to create a Machine from a config element.
    """

    element_str = """
        name: ct-23-0001-machine
        data_provider:
          threads: 10
        dataset:
          tags: [GRA-TE  -23-0733.PV, GRA-TT  -23-0719.PV, GRA-YE  -23-0751X.PV]
          target_tag_list: [GRA-TE -123-456]
          train_start_date: 2018-01-01T09:00:30Z
          train_end_date: 2018-01-02T09:00:30Z
        model:
          sklearn.pipeline.Pipeline:
            steps:
              - sklearn.preprocessing.data.MinMaxScaler
              - gordo.machine.model.models.KerasAutoEncoder:
                  kind: feedforward_hourglass
        evaluation:
            scoring_scaler: Null
        metadata:
          id: special-id
    """
    element = get_dict_from_yaml(StringIO(element_str))
    machine = Machine.from_config(element,
                                  project_name="test-project-name",
                                  config_globals=default_globals)
    logger.info(f"{machine}")
    assert isinstance(machine, Machine)
    assert len(machine.dataset.tag_list) == 3

    # The metadata of machine should be json serializable
    json.dumps(machine.to_dict()["metadata"])

    # The metadata of machine should be ast.literal_eval-able when cast as a str
    assert (
        ast.literal_eval(str(machine.to_dict()["metadata"]))
        == machine.to_dict()["metadata"]
    )
    # Expected dictionary representation of the machine:
    expected = {
        "dataset": {
            "aggregation_methods": "mean",
            "asset": "global-asset",
            "data_provider": {
                "dl_service_auth_str": None,
                "interactive": False,
                "storename": "dataplatformdlsprod",
                "type": "DataLakeProvider",
            },
            "default_asset": None,
            "n_samples_threshold": 0,
            "resolution": "10T",
            "row_filter": "",
            "row_filter_buffer_size": 0,
            "tag_list": [
                "GRA-TE  -23-0733.PV",
                "GRA-TT  -23-0719.PV",
                "GRA-YE  -23-0751X.PV",
            ],
            "target_tag_list": ["GRA-TE -123-456"],
            "train_end_date": "2018-01-02T09:00:30+00:00",
            "train_start_date": "2018-01-01T09:00:30+00:00",
            "type": "TimeSeriesDataset",
        },
        "evaluation": {
            "cv_mode": "full_build",
            "metrics": [
                "explained_variance_score",
                "r2_score",
                "mean_squared_error",
                "mean_absolute_error",
            ],
            "scoring_scaler": None,
        },
        "metadata": {
            "build_metadata": {
                "model": {
                    "cross_validation": {
                        "cv_duration_sec": None,
                        "scores": {},
                        "splits": {},
                    },
                    "model_builder_version": __version__,
                    "model_creation_date": None,
                    "model_meta": {},
                    "model_offset": 0,
                    "model_training_duration_sec": None,
                },
                "dataset": {"query_duration_sec": None, "dataset_meta": {}},
            },
            "user_defined": {
                "global-metadata": {},
                "machine-metadata": {"id": "special-id"},
            },
        },
        "model": {
            "sklearn.pipeline.Pipeline": {
                "steps": [
                    "sklearn.preprocessing.data.MinMaxScaler",
                    {
                        "gordo.machine.model.models.KerasAutoEncoder": {
                            "kind": "feedforward_hourglass"
                        }
                    },
                ]
            }
        },
        "name": "ct-23-0001-machine",
        "project_name": "test-project-name",
        "runtime": {
            "reporters": [],
            "server": {
                "resources": {
                    "limits": {"cpu": 4, "memory": 3},
                    "requests": {"cpu": 2, "memory": 1},
                }
            },
        },
    }
    assert machine.to_dict() == expected
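The two serializability assertions above hold because the metadata contains only plain Python literals; a self-contained sketch of that round-trip property on a toy dict:

import ast
import json

# Dicts of literals survive both str()/ast.literal_eval and JSON round-trips
meta = {"id": "special-id", "nested": {"n": 0, "none": None}}
assert ast.literal_eval(str(meta)) == meta
assert json.loads(json.dumps(meta)) == meta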