Example #1
def test_build_cv_mode_cross_val_cache(
    tmpdir,
    should_save_model: bool,
    cv_mode_1: str,
    cv_mode_2: str,
    runner: CliRunner,
    machine: Machine,
):
    """
    Checks that cv_scores uses the cache when run after a full build: the same model is
    loaded and its cv_scores can be printed.
    """
    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")

    machine.evaluation = cv_mode_1  # type: ignore
    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()),
                       OUTPUT_DIR=str(tmpdir)):
        runner.invoke(cli.gordo, ["build"])

    machine.evaluation = cv_mode_2  # type: ignore
    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()),
                       OUTPUT_DIR=str(tmpdir)):
        runner.invoke(cli.gordo, ["build"])

    if should_save_model:
        assert len(os.listdir(tmpdir)) > 0
    else:
        assert len(os.listdir(tmpdir)) == 0
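
The temp_env_vars context manager is used here but not defined in the snippet. A minimal sketch of what such a helper typically looks like, assuming it simply patches os.environ and restores the previous values on exit (the actual gordo test utility may differ):

import contextlib
import os

@contextlib.contextmanager
def temp_env_vars(**kwargs):
    """Set environment variables for the duration of the block, then restore them."""
    original = {key: os.environ.get(key) for key in kwargs}
    os.environ.update(kwargs)
    try:
        yield
    finally:
        for key, value in original.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value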
Example #2
def log_machine(mlflow_client: MlflowClient, run_id: str, machine: Machine):
    """
    Send logs to the configured MLflow backend

    Parameters
    ----------
    mlflow_client: MlflowClient
        Client instance to call logging methods from.
    run_id: str
        Unique ID of the MLflow Run to log to.
    machine: Machine
        Machine to log with MlflowClient.
    """
    # Log machine metrics and params
    for batch_kwargs in batch_log_items(*get_machine_log_items(machine)):
        mlflow_client.log_batch(run_id, **batch_kwargs)

    # Send configs as JSON artifacts
    try:
        with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
            fp = os.path.join(tmp_dir, "metadata.json")
            with open(fp, "w") as fh:
                json.dump(machine.to_dict(), fh, cls=MachineEncoder)
            mlflow_client.log_artifacts(run_id=run_id, local_dir=tmp_dir)
    # Map to MlflowLoggingError for coding errors in the model builder
    except Exception as e:
        raise MlflowLoggingError(e)
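
A hedged usage sketch for log_machine against a local MLflow backend; the experiment name is illustrative, and `machine` is assumed to be built as in Example #4 below:

from mlflow.tracking import MlflowClient

client = MlflowClient()  # defaults to the local ./mlruns tracking store
experiment_id = client.create_experiment("gordo-machines")  # illustrative name
run = client.create_run(experiment_id)

# `machine` built as in Example #4
log_machine(client, run.info.run_id, machine)
client.set_terminated(run.info.run_id)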
Example #3
    def report(self, machine: GordoMachine):
        """
        Log a machine to Postgres, where the top-level keys 'name', 'dataset', 'model',
        and 'metadata' map to BinaryJSON fields.

        Parameters
        ----------
        machine: gordo.machine.Machine

        Returns
        -------
        None
        """
        try:
            with self.db.atomic():
                logger.info(
                    f"Inserting machine {machine.name} in sql")  # type: ignore

                # Ensure it's serializable using MachineEncoder
                record = json.loads(
                    json.dumps(machine.to_dict(), cls=MachineEncoder))
                model = dict_to_model(Machine, record, ignore_unknown=True)
                try:
                    Machine.get(Machine.name == machine.name)
                except peewee.DoesNotExist:
                    model.save()
                else:
                    query = Machine.update(**model_to_dict(model)).where(
                        Machine.name == machine.name)
                    query.execute()

        except Exception as exc:
            raise PostgresReporterException(exc)
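
The get/DoesNotExist/update flow above is a hand-rolled peewee upsert. A self-contained sketch of the same pattern against an in-memory SQLite database (Record and its fields are illustrative, not gordo types):

import peewee

db = peewee.SqliteDatabase(":memory:")  # stand-in; the reporter targets Postgres

class Record(peewee.Model):
    name = peewee.CharField(unique=True)
    payload = peewee.TextField()

    class Meta:
        database = db

db.create_tables([Record])

def upsert(name: str, payload: str) -> None:
    """Insert a new row, or update the existing row with the same name."""
    try:
        Record.get(Record.name == name)
    except peewee.DoesNotExist:
        Record.create(name=name, payload=payload)
    else:
        Record.update(payload=payload).where(Record.name == name).execute()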
Example #4
    def __init__(self, machine: Machine):
        """
        Build a model for a given :class:`gordo.workflow.config_elements.machine.Machine`

        Parameters
        ----------
        machine: Machine

        Example
        -------
        >>> from gordo_dataset.sensor_tag import SensorTag
        >>> from gordo.machine import Machine
        >>> from gordo.dependencies import configure_once
        >>> configure_once()
        >>> machine = Machine(
        ...     name="special-model-name",
        ...     model={"sklearn.decomposition.PCA": {"svd_solver": "auto"}},
        ...     dataset={
        ...         "type": "RandomDataset",
        ...         "train_start_date": "2017-12-25 06:00:00Z",
        ...         "train_end_date": "2017-12-30 06:00:00Z",
        ...         "tag_list": [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
        ...         "target_tag_list": [SensorTag("Tag 3", None), SensorTag("Tag 4", None)]
        ...     },
        ...     project_name='test-proj',
        ... )
        >>> builder = ModelBuilder(machine=machine)
        >>> model, machine = builder.build()
        """
        # Avoid mutating the passed machine; copy.copy doesn't work if it holds
        # a reference to a loaded Tensorflow model, so .to_dict() serializes it
        # to a primitive dict representation instead.
        self.machine = Machine(**machine.to_dict())
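
The constructor's copy-by-serialization trick generalizes: round-tripping through a primitive dict yields an independent copy without carrying over live object references. A toy illustration (Config is illustrative, not a gordo type):

class Config:
    """Toy stand-in for Machine; only primitive state survives the round trip."""

    def __init__(self, name: str, params: dict):
        self.name = name
        self.params = params

    def to_dict(self) -> dict:
        return {"name": self.name, "params": dict(self.params)}

original = Config("special-model-name", {"svd_solver": "auto"})
clone = Config(**original.to_dict())  # independent copy, safe to mutate
assert clone.params == original.params and clone.params is not original.params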
Example #5
def test_build_cv_mode(tmpdir, runner: CliRunner, should_save_model: bool,
                       cv_mode: str, machine: Machine):
    """
    Tests build with cv_mode set to full and cross_val_only. Checks that cv scores are
    printed and that the model is only saved when using the default (full) value.
    """
    machine.model = MODEL_CONFIG_WITH_PREDICT
    machine.evaluation = cv_mode  # type: ignore

    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")

    tmp_model_dir = os.path.join(tmpdir, "tmp")
    os.makedirs(tmp_model_dir, exist_ok=True)

    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()),
                       OUTPUT_DIR=tmp_model_dir):
        result = runner.invoke(cli.gordo, ["build", "--print-cv-scores"])
        assert result.exit_code == 0
        # Checks whether the model directory is empty, depending on the mode.
        if should_save_model:
            assert len(os.listdir(tmp_model_dir)) != 0
        else:
            assert len(os.listdir(tmp_model_dir)) == 0

        # Checks that the output contains the cross-validation metric names
        assert "r2-score" in result.output
        assert "mean-squared-error" in result.output
        assert "mean-absolute-error" in result.output
        assert "explained-variance-score" in result.output
Example #6
def test_client_get_dataset(gordo_project, metadata, ml_server):
    data_provider = providers.RandomDataProvider(min_size=10)
    client = Client(project=gordo_project, data_provider=data_provider)
    start = isoparse("2016-01-01T00:00:00+00:00")
    end = isoparse("2016-01-01T12:00:00+00:00")
    machine = Machine(**metadata)
    assert type(machine.dataset) is TimeSeriesDataset
    machine.dataset.row_filter_buffer_size = 12
    machine.dataset.n_samples_threshold = 10
    client_machine = ClientMachine(**machine.to_dict())
    dataset = client._get_dataset(client_machine, start, end)
    assert dataset.row_filter_buffer_size == 0
    assert dataset.n_samples_threshold == 0
    assert dataset.low_threshold is None
    assert dataset.high_threshold is None
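
isoparse comes from python-dateutil and turns ISO 8601 strings with an offset into timezone-aware datetimes, which the start/end bounds above rely on:

from dateutil.parser import isoparse

start = isoparse("2016-01-01T00:00:00+00:00")
end = isoparse("2016-01-01T12:00:00+00:00")
assert start.tzinfo is not None                    # offsets yield aware datetimes
assert (end - start).total_seconds() == 12 * 3600  # a 12-hour query window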
Example #7
def test_build_cv_mode_build_only(tmpdir, runner: CliRunner, machine: Machine):
    """
    Tests build with cv_mode set to build_only. Checks that a model is saved to
    OUTPUT_DIR, and that the metadata reports cv_duration_sec as None and an
    empty cross-validation scores mapping.
    """

    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")
    machine.evaluation = {"cv_mode": "build_only"}

    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()),
                       OUTPUT_DIR=str(tmpdir)):

        metadata_file = os.path.join(tmpdir, "metadata.json")
        runner.invoke(cli.gordo, ["build"])

        # A model has been saved
        assert len(os.listdir(tmpdir)) != 0
        with open(metadata_file) as f:
            metadata_json = json.loads(f.read())
            assert (metadata_json["metadata"]["build_metadata"]["model"]
                    ["cross_validation"]["cv_duration_sec"] is None)
            assert (metadata_json["metadata"]["build_metadata"]["model"]
                    ["cross_validation"]["scores"] == {})