Example #1
def test_builder_metadata(raw_model_config):
    """
    Ensure the builder works with various model configs and that each has
    expected/valid metadata results.
    """
    model_config = yaml.load(raw_model_config, Loader=yaml.FullLoader)
    data_config = get_random_data()
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    model, machine_out = ModelBuilder(machine).build()
    # Check metadata, and only verify 'history' if it's a *Keras* type model
    machine_check(machine_out, "Keras" in raw_model_config)
Example #2
def test_provide_saved_model_simple_happy_path(tmpdir):
    """
    Test provide_saved_model with no caching
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "model")
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    ModelBuilder(machine).build(output_dir=output_dir)

    # Assert the model was saved at the location
    # Should be model file, and the metadata
    assert len(os.listdir(output_dir)) == 2
Example #3
def test_output_scores_metadata():
    data_config = get_random_data()
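    # Nested config: a DiffBasedAnomalyDetector whose base estimator is a
    # TransformedTargetRegressor wrapping a pipeline that ends in a KerasAutoEncoder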
    raw_model_config = f"""
            gordo.machine.model.anomaly.diff.DiffBasedAnomalyDetector:
                scaler: sklearn.preprocessing.MinMaxScaler
                base_estimator:
                    sklearn.compose.TransformedTargetRegressor:
                        transformer: sklearn.preprocessing.MinMaxScaler
                        regressor:
                            sklearn.pipeline.Pipeline:
                                steps:
                                - sklearn.preprocessing.MinMaxScaler
                                - gordo.machine.model.models.KerasAutoEncoder:
                                    kind: feedforward_hourglass
                                    batch_size: 3
                                    compression_factor: 0.5
                                    encoding_layers: 1
                                    func: tanh
                                    out_func: linear
                                    epochs: 1
            """

    model_config = yaml.load(raw_model_config, Loader=yaml.FullLoader)
    machine = Machine(name="model-name",
                      dataset=data_config,
                      model=model_config,
                      project_name="test")
    model, machine_out = ModelBuilder(machine).build()
    scores_metadata = machine_out.metadata.build_metadata.model.cross_validation.scores
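    # Each aggregate score (e.g. "r2-score") should equal the mean of its per-tag scores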
    assert (scores_metadata["explained-variance-score-Tag-1"]["fold-mean"] +
            scores_metadata["explained-variance-score-Tag-2"]["fold-mean"]
            ) / 2 == pytest.approx(
                scores_metadata["explained-variance-score"]["fold-mean"])

    assert (
        scores_metadata["r2-score-Tag-1"]["fold-mean"] +
        scores_metadata["r2-score-Tag-2"]["fold-mean"]) / 2 == pytest.approx(
            scores_metadata["r2-score"]["fold-mean"])

    assert (scores_metadata["mean-squared-error-Tag-1"]["fold-mean"] +
            scores_metadata["mean-squared-error-Tag-2"]["fold-mean"]
            ) / 2 == pytest.approx(
                scores_metadata["mean-squared-error"]["fold-mean"])

    assert (scores_metadata["mean-absolute-error-Tag-1"]["fold-mean"] +
            scores_metadata["mean-absolute-error-Tag-2"]["fold-mean"]
            ) / 2 == pytest.approx(
                scores_metadata["mean-absolute-error"]["fold-mean"])
Example #4
def test_get_metadata_helper(model: BaseEstimator, expect_empty_dict: bool):
    """
    Ensure the builder works with various model configs and that each has
    expected/valid metadata results.
    """

    X, y = np.random.random((1000, 4)), np.random.random((1000, ))

    model.fit(X, y)

    metadata = ModelBuilder._extract_metadata_from_model(model)

    # All the metadata we've implemented so far is 'history', so we'll check that
    if not expect_empty_dict:
        assert "history" in metadata
        assert all(name in metadata["history"]
                   for name in ("params", "loss", "accuracy"))
    else:
        assert dict() == metadata
Example #5
def test_provide_saved_model_caching_handle_existing_different_register(tmpdir):
    """If the model exists in the model register, but the output_dir is not where
    the model is, the model is copied to the new location, unless the new location
    already exists. If it does then return it"""
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir1 = os.path.join(tmpdir, "model1")
    output_dir2 = os.path.join(tmpdir, "model2")

    registry_dir = os.path.join(tmpdir, "registry")
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    builder = ModelBuilder(machine)
    builder.build(output_dir=output_dir1, model_register_dir=registry_dir)

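    # Building again with a different output_dir should copy the registered model there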
    builder.build(output_dir=output_dir2, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir2

    builder.build(output_dir=output_dir2, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir2
Example #6
def test_get_metrics_dict_scaler(scaler, mock):
    mock_model = mock
    metrics_list = [sklearn.metrics.mean_squared_error]
    # make the features in y be in different scales
    y = pd.DataFrame(
        np.array([[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]]) * [1, 100],
        columns=["Tag 1", "Tag 2"],
    )
    metrics_dict = ModelBuilder.build_metrics_dict(metrics_list, y, scaler=scaler)
    metric_func = metrics_dict["mean-squared-error"]

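    # Simulate a model that is 20% off on only one of the two features at a time;
    # when y is scaled before scoring, both cases should give comparable errors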
    mock_model.predict = lambda _y: _y * [0.8, 1]
    mse_feature_one_wrong = metric_func(mock_model, y, y)
    mock_model.predict = lambda _y: _y * [1, 0.8]
    mse_feature_two_wrong = metric_func(mock_model, y, y)

    if scaler:
        assert np.isclose(mse_feature_one_wrong, mse_feature_two_wrong)
    else:
        assert not np.isclose(mse_feature_one_wrong, mse_feature_two_wrong)
Example #7
def test_provide_saved_model_caching_handle_existing_same_dir(tmpdir):
    """If the model exists in the model register, and the path there is the
    same as output_dir, output_dir is returned"""
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "model")
    registry_dir = os.path.join(tmpdir, "registry")
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    builder = ModelBuilder(machine)
    builder.build(output_dir=output_dir, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir

    # Saving to same output_dir as the one saved in the registry just returns the output_dir
    builder.build(output_dir=output_dir, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir
Example #8
def test_provide_saved_model_caching(
    should_be_equal: bool,
    metadata: Optional[Metadata],
    tag_list: Optional[List[SensorTag]],
    replace_cache,
    tmpdir,
):
    """
    Test provide_saved_model with caching and possible cache busting if tag_list or replace_cache is set.

    Builds two models and checks whether their model-creation-dates are the same,
    which they will be if and only if caching is in effect.

    Parameters
    ----------
    should_be_equal : bool
        Whether we expect the two generated models to share the same creation date,
        i.e. whether we expect caching.
    metadata : Optional[Metadata]
        Optional metadata to use for the second model.
    tag_list : Optional[List[SensorTag]]
        Optional list of sensor tags to use as the tag_list in the dataset for the
        second model.
    replace_cache : bool
        Whether to force a model cache replacement.

    """

    if tag_list is None:
        tag_list = []
    if metadata is None:
        metadata = Metadata()

    model_config = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "model")
    registry_dir = os.path.join(tmpdir, "registry")
    machine = Machine(name="model-name",
                      dataset=data_config,
                      model=model_config,
                      project_name="test")
    _, first_machine = ModelBuilder(machine).build(
        output_dir=output_dir, model_register_dir=registry_dir)

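    # A non-empty tag_list changes the dataset config, which should bust the cache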
    if tag_list:
        data_config["tag_list"] = tag_list

    new_output_dir = os.path.join(tmpdir, "model2")
    _, second_machine = ModelBuilder(machine=Machine(
        name="model-name",
        dataset=data_config,
        model=model_config,
        metadata=metadata,
        project_name="test",
        runtime={"something": True},
    )).build(
        output_dir=new_output_dir,
        model_register_dir=registry_dir,
        replace_cache=replace_cache,
    )

    model1_creation_date = (
        first_machine.metadata.build_metadata.model.model_creation_date)
    model2_creation_date = (
        second_machine.metadata.build_metadata.model.model_creation_date)
    assert "something" in second_machine.runtime

    if should_be_equal:
        assert model1_creation_date == model2_creation_date
    else:
        assert model1_creation_date != model2_creation_date

    if metadata is not None:
        assert metadata.user_defined == second_machine.metadata.user_defined