def test_postgres_reporter(postgresdb, metadata):
    """
    Check logging of a machine into postgres
    """
    reporter1 = PostgresReporter(host="localhost")
    machine1 = Machine(**metadata)

    # Before inserting, the machine does not exist.
    with pytest.raises(peewee.DoesNotExist):
        PostgresMachine.get(PostgresMachine.name == machine1.name)

    reporter1.report(machine1)
    record = PostgresMachine.get(PostgresMachine.name == machine1.name)
    assert record.name == machine1.name

    # Create another reporter to ensure nothing happened to the DB
    reporter2 = PostgresReporter(host="localhost")
    machine2 = Machine(**metadata)
    machine2.name = "another-machine"
    reporter2.report(machine2)

    # The first machine is still there
    record = PostgresMachine.get(PostgresMachine.name == machine1.name)
    assert record.name == machine1.name

    # And the second
    record = PostgresMachine.get(PostgresMachine.name == machine2.name)
    assert record.name == machine2.name

def __init__(self, machine: Machine):
    """
    Build a model for a given :class:`gordo.workflow.config_elements.machine.Machine`

    Parameters
    ----------
    machine: Machine

    Example
    -------
    >>> from gordo_dataset.sensor_tag import SensorTag
    >>> from gordo.machine import Machine
    >>> from gordo.dependencies import configure_once
    >>> configure_once()
    >>> machine = Machine(
    ...     name="special-model-name",
    ...     model={"sklearn.decomposition.PCA": {"svd_solver": "auto"}},
    ...     dataset={
    ...         "type": "RandomDataset",
    ...         "train_start_date": "2017-12-25 06:00:00Z",
    ...         "train_end_date": "2017-12-30 06:00:00Z",
    ...         "tag_list": [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
    ...         "target_tag_list": [SensorTag("Tag 3", None), SensorTag("Tag 4", None)]
    ...     },
    ...     project_name='test-proj',
    ... )
    >>> builder = ModelBuilder(machine=machine)
    >>> model, machine = builder.build()
    """
    # Avoid overwriting the passed machine; copying doesn't work if it holds a
    # reference to a loaded Tensorflow model, so .to_dict() serializes it to a
    # primitive dict representation first.
    self.machine = Machine(**machine.to_dict())

def machine():
    return Machine(
        name="test-model",
        model=MODEL_CONFIG,
        dataset=DATA_CONFIG,
        project_name="project-name",
    )

def test_model_builder_metrics_list(metrics_: Optional[List[str]]):
    model_config = {
        "sklearn.multioutput.MultiOutputRegressor": {
            "estimator": "sklearn.linear_model.LinearRegression"
        }
    }
    data_config = get_random_data()

    evaluation_config: Dict[str, Any] = {"cv_mode": "full_build"}
    if metrics_:
        evaluation_config.update({"metrics": metrics_})

    machine = Machine(
        name="model-name",
        dataset=data_config,
        model=model_config,
        evaluation=evaluation_config,
        project_name="test",
    )
    _model, machine = ModelBuilder(machine).build()

    expected_metrics = metrics_ or [
        "sklearn.metrics.explained_variance_score",
        "sklearn.metrics.r2_score",
        "sklearn.metrics.mean_squared_error",
        "sklearn.metrics.mean_absolute_error",
    ]

    assert all(
        metric.split(".")[-1].replace("_", "-")
        in machine.metadata.build_metadata.model.cross_validation.scores
        for metric in expected_metrics
    )

def test_setting_seed(seed, model_config):
    """
    Test that we can set the seed and get the same results.
    """
    data_config = get_random_data()
    evaluation_config = {"cv_mode": "full_build", "seed": seed}

    # Training two instances without a seed should result in different scores,
    # while doing it with a seed should result in the same scores.
    machine = Machine(
        name="model-name",
        dataset=data_config,
        model=model_config,
        evaluation=evaluation_config,
        project_name="test",
    )
    _model, machine1 = ModelBuilder(machine).build()
    _model, machine2 = ModelBuilder(machine).build()

    df1 = pd.DataFrame.from_dict(
        machine1.metadata.build_metadata.model.cross_validation.scores
    )
    df2 = pd.DataFrame.from_dict(
        machine2.metadata.build_metadata.model.cross_validation.scores
    )

    # Equality depends on the seed being set.
    if seed:
        assert df1.equals(df2)
    else:
        assert not df1.equals(df2)

def test_n_splits_from_config(mocked_pipeline_from_definition, cv):
    """
    Test that we can set arbitrary splitters and parameters in the config file,
    which is then passed to the serializer.
    """
    data_config = get_random_data()
    evaluation_config = {"cv_mode": "full_build"}
    if cv:
        evaluation_config["cv"] = cv

    model_config = {
        "sklearn.multioutput.MultiOutputRegressor": {
            "estimator": "sklearn.ensemble.forest.RandomForestRegressor"
        }
    }

    machine = Machine(
        name="model-name",
        dataset=data_config,
        model=model_config,
        evaluation=evaluation_config,
        project_name="test",
    )

    ModelBuilder(machine).build()

    if cv:
        mocked_pipeline_from_definition.assert_called_with(cv)
    else:
        mocked_pipeline_from_definition.assert_called_with(
            {"sklearn.model_selection.TimeSeriesSplit": {"n_splits": 3}}
        )

def test_scores_metadata(raw_model_config):
    data_config = get_random_data()
    model_config = yaml.load(raw_model_config, Loader=yaml.FullLoader)
    machine = Machine(
        dataset=data_config, model=model_config, name="model-name", project_name="test"
    )
    model, machine_out = ModelBuilder(machine).build()
    machine_check(machine_out, False)

def test_get_machine_log_items(metadata):
    """
    Test that dicts are correctly converted to MLflow types or errors raised
    """
    metrics, params = mlu.get_machine_log_items(Machine(**metadata))

    assert all(type(m) == Metric for m in metrics)
    assert all(type(p) == Param for p in params)

def test_builder_calls_machine_report(mocked_report_method, metadata):
    """
    When building a machine, ModelBuilder.build should call Machine.report()
    so that it can run any reporters in the Machine's runtime.
    """
    machine = Machine(**metadata)
    ModelBuilder(machine).build()
    mocked_report_method.assert_called_once()

def test_overwrite_report(postgresdb, metadata):
    """
    Ensure saving the same machine twice is ok.
    """
    reporter1 = PostgresReporter(host="localhost")
    reporter2 = PostgresReporter(host="localhost")

    machine1 = Machine(**metadata)
    machine2 = Machine(**metadata)

    reporter1.report(machine1)

    # Reporting twice should be ok.
    reporter2.report(machine2)

    results = PostgresMachine.select().where(PostgresMachine.name == machine1.name)
    assert len(list(results)) == 1

def _machine_from_server(self, name: str, revision: str) -> Machine:
    resp = self.session.get(
        f"{self.base_url}/gordo/v0/{self.project_name}/{name}/metadata",
        params={"revision": revision},
    )
    metadata = _handle_response(
        resp=resp, resource_name=f"Machine metadata for {name}"
    )
    if isinstance(metadata, dict) and metadata.get("metadata", None):
        return Machine(**metadata.get("metadata", None))
    else:
        raise NotFound(f"Machine {name} not found")

def test_builder_metadata(raw_model_config):
    """
    Ensure the builder works with various model configs and that each has
    expected/valid metadata results.
    """
    model_config = yaml.load(raw_model_config, Loader=yaml.FullLoader)
    data_config = get_random_data()
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    model, machine_out = ModelBuilder(machine).build()
    # Check metadata, and only verify 'history' if it's a *Keras* type model
    machine_check(machine_out, "Keras" in raw_model_config)

def test_builder_with_reporter(postgresdb, metadata):
    """
    Verify that a machine can take a reporter and that .report() will run
    any given reporters.
    """
    reporter = PostgresReporter(host="localhost")
    metadata["runtime"]["reporters"].append(reporter.to_dict())

    machine = Machine(**metadata)

    with pytest.raises(peewee.DoesNotExist):
        PostgresMachine.get(PostgresMachine.name == machine.name)
    machine.report()
    PostgresMachine.get(PostgresMachine.name == machine.name)

def test_client_get_dataset(gordo_project, metadata, ml_server):
    data_provider = providers.RandomDataProvider(min_size=10)
    client = Client(project=gordo_project, data_provider=data_provider)

    start = isoparse("2016-01-01T00:00:00+00:00")
    end = isoparse("2016-01-01T12:00:00+00:00")

    machine = Machine(**metadata)
    assert type(machine.dataset) is TimeSeriesDataset

    machine.dataset.row_filter_buffer_size = 12
    machine.dataset.n_samples_threshold = 10

    dataset = client._get_dataset(machine, start, end)
    assert dataset.row_filter_buffer_size == 0
    assert dataset.n_samples_threshold == 0
    assert dataset.low_threshold is None
    assert dataset.high_threshold is None

def test_mlflow_context_log_error(MockClient, metadata):
    """
    Test that an error while logging metadata as an artifact raises
    MlflowLoggingError.
    """
    metadata = Machine(**metadata)
    mock_client = MockClient()
    mock_client.log_artifacts.side_effect = Exception("Some unknown exception!")

    with pytest.raises(mlu.MlflowLoggingError):
        with mlu.mlflow_context("returns metadata", "unique_key", {}, {}) as (
            mlflow_client,
            run_id,
        ):
            mlu.log_machine(mlflow_client, run_id, metadata)

def test_provide_saved_model_simple_happy_path(tmpdir):
    """
    Test provide_saved_model with no caching
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "model")

    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    ModelBuilder(machine).build(output_dir=output_dir)

    # Assert the model was saved at the location;
    # should be the model file and the metadata
    assert len(os.listdir(output_dir)) == 2

def test_output_scores_metadata():
    data_config = get_random_data()
    raw_model_config = f"""
    gordo.machine.model.anomaly.diff.DiffBasedAnomalyDetector:
        scaler: sklearn.preprocessing.MinMaxScaler
        base_estimator:
            sklearn.compose.TransformedTargetRegressor:
                transformer: sklearn.preprocessing.MinMaxScaler
                regressor:
                    sklearn.pipeline.Pipeline:
                        steps:
                          - sklearn.preprocessing.MinMaxScaler
                          - gordo.machine.model.models.KerasAutoEncoder:
                                kind: feedforward_hourglass
                                batch_size: 3
                                compression_factor: 0.5
                                encoding_layers: 1
                                func: tanh
                                out_func: linear
                                epochs: 1
    """

    model_config = yaml.load(raw_model_config, Loader=yaml.FullLoader)
    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    model, machine_out = ModelBuilder(machine).build()
    scores_metadata = machine_out.metadata.build_metadata.model.cross_validation.scores

    assert (
        scores_metadata["explained-variance-score-Tag-1"]["fold-mean"]
        + scores_metadata["explained-variance-score-Tag-2"]["fold-mean"]
    ) / 2 == pytest.approx(scores_metadata["explained-variance-score"]["fold-mean"])

    assert (
        scores_metadata["r2-score-Tag-1"]["fold-mean"]
        + scores_metadata["r2-score-Tag-2"]["fold-mean"]
    ) / 2 == pytest.approx(scores_metadata["r2-score"]["fold-mean"])

    assert (
        scores_metadata["mean-squared-error-Tag-1"]["fold-mean"]
        + scores_metadata["mean-squared-error-Tag-2"]["fold-mean"]
    ) / 2 == pytest.approx(scores_metadata["mean-squared-error"]["fold-mean"])

    assert (
        scores_metadata["mean-absolute-error-Tag-1"]["fold-mean"]
        + scores_metadata["mean-absolute-error-Tag-2"]["fold-mean"]
    ) / 2 == pytest.approx(scores_metadata["mean-absolute-error"]["fold-mean"])

def test_provide_saved_model_caching_handle_existing_same_dir(tmpdir):
    """
    If the model exists in the model register, and the path there is the same
    as output_dir, output_dir is returned.
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "model")
    registry_dir = os.path.join(tmpdir, "registry")

    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    builder = ModelBuilder(machine)
    builder.build(output_dir=output_dir, model_register_dir=registry_dir)

    assert builder.cached_model_path == output_dir

    # Saving to the same output_dir as the one saved in the registry just returns the output_dir
    builder.build(output_dir=output_dir, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir

def test_mlflow_context_log_metadata(MockClient, tmpdir, metadata):
    """
    Test that a call to the wrapped function initiates MLflow logging or
    throws a warning.
    """
    metadata = Machine(**metadata)
    mlflow.set_tracking_uri(f"file:{tmpdir}")

    mock_client = MockClient()
    mock_client.log_batch.return_value = "test"

    # Function with a metadata dict returned
    with mlu.mlflow_context("returns metadata", "unique_key", {}, {}) as (
        mlflow_client,
        run_id,
    ):
        mlu.log_machine(mlflow_client, run_id, metadata)

    assert mock_client.log_batch.called

def test_output_dir(tmpdir):
    """
    Test that building a model will create subdirectories for model saving if needed.
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "some", "sub", "directories")

    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    builder = ModelBuilder(machine)
    model, machine_out = builder.build()
    machine_check(machine_out, False)

    builder._save_model(model=model, machine=machine_out, output_dir=output_dir)

    # Assert the model was saved at the location;
    # should be the model file and the metadata
    assert len(os.listdir(output_dir)) == 2

def test_provide_saved_model_caching_handle_existing_different_register(tmpdir):
    """
    If the model exists in the model register, but output_dir is not where the
    model is, the model is copied to the new location, unless the new location
    already exists. If it does, then return it.
    """
    model_config = {"sklearn.decomposition.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir1 = os.path.join(tmpdir, "model1")
    output_dir2 = os.path.join(tmpdir, "model2")
    registry_dir = os.path.join(tmpdir, "registry")

    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    builder = ModelBuilder(machine)
    builder.build(output_dir=output_dir1, model_register_dir=registry_dir)
    builder.build(output_dir=output_dir2, model_register_dir=registry_dir)

    assert builder.cached_model_path == output_dir2

    builder.build(output_dir=output_dir2, model_register_dir=registry_dir)
    assert builder.cached_model_path == output_dir2

def _build(self) -> Tuple[sklearn.base.BaseEstimator, Machine]:
    """
    Build the model using the current state of the Builder

    Returns
    -------
    Tuple[sklearn.base.BaseEstimator, Machine]
    """
    # Enforce a random seed of 0 if not specified.
    self.set_seed(seed=self.machine.evaluation.get("seed", 0))

    # Get the dataset from config
    logger.debug(
        f"Initializing Dataset with config {self.machine.dataset.to_dict()}"
    )
    dataset = _get_dataset(self.machine.dataset.to_dict())

    logger.debug("Fetching training data")
    start = time.time()
    X, y = dataset.get_data()
    time_elapsed_data = time.time() - start

    # Get the model from config
    logger.debug(f"Initializing Model with config: {self.machine.model}")
    model = serializer.from_definition(self.machine.model)

    cv_duration_sec = None

    machine: Machine = Machine(
        name=self.machine.name,
        dataset=self.machine.dataset.to_dict(),
        metadata=self.machine.metadata,
        model=self.machine.model,
        project_name=self.machine.project_name,
        evaluation=self.machine.evaluation,
        runtime=self.machine.runtime,
    )

    split_metadata: Dict[str, Any] = dict()
    scores: Dict[str, Any] = dict()
    if self.machine.evaluation["cv_mode"].lower() in (
        "cross_val_only",
        "full_build",
    ):
        # Build up a metrics list.
        metrics_list = self.metrics_from_list(self.machine.evaluation.get("metrics"))

        # Cross validate
        if hasattr(model, "predict"):
            logger.debug("Starting cross validation")
            start = time.time()

            scaler = self.machine.evaluation.get("scoring_scaler")
            metrics_dict = self.build_metrics_dict(metrics_list, y, scaler=scaler)

            split_obj = serializer.from_definition(
                self.machine.evaluation.get(
                    "cv",
                    {"sklearn.model_selection.TimeSeriesSplit": {"n_splits": 3}},
                )
            )
            # Generate metadata about CV train, test splits
            split_metadata = ModelBuilder.build_split_dict(X, split_obj)

            cv_kwargs = dict(
                X=X, y=y, scoring=metrics_dict, return_estimator=True, cv=split_obj
            )
            if hasattr(model, "cross_validate"):
                cv = model.cross_validate(**cv_kwargs)
            else:
                cv = cross_validate(model, **cv_kwargs)

            for metric, test_metric in map(lambda k: (k, f"test_{k}"), metrics_dict):
                val = {
                    "fold-mean": cv[test_metric].mean(),
                    "fold-std": cv[test_metric].std(),
                    "fold-max": cv[test_metric].max(),
                    "fold-min": cv[test_metric].min(),
                }
                val.update(
                    {
                        f"fold-{i + 1}": raw_value
                        for i, raw_value in enumerate(cv[test_metric].tolist())
                    }
                )
                scores.update({metric: val})

            cv_duration_sec = time.time() - start
        else:
            logger.debug("Unable to score model, has no attribute 'predict'.")

        # If cross_val_only, return without fitting to the whole dataset
        if self.machine.evaluation["cv_mode"] == "cross_val_only":
            machine.metadata.build_metadata = BuildMetadata(
                model=ModelBuildMetadata(
                    cross_validation=CrossValidationMetaData(
                        cv_duration_sec=cv_duration_sec,
                        scores=scores,
                        splits=split_metadata,
                    )
                ),
                dataset=DatasetBuildMetadata(
                    query_duration_sec=time_elapsed_data,
                    dataset_meta=dataset.get_metadata(),
                ),
            )
            return model, machine

    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    # Build specific metadata
    machine.metadata.build_metadata = BuildMetadata(
        model=ModelBuildMetadata(
            model_offset=self._determine_offset(model, X),
            model_creation_date=str(
                datetime.datetime.now(datetime.timezone.utc).astimezone()
            ),
            model_builder_version=__version__,
            model_training_duration_sec=time_elapsed_model,
            cross_validation=CrossValidationMetaData(
                cv_duration_sec=cv_duration_sec,
                scores=scores,
                splits=split_metadata,
            ),
            model_meta=self._extract_metadata_from_model(model),
        ),
        dataset=DatasetBuildMetadata(
            query_duration_sec=time_elapsed_data,
            dataset_meta=dataset.get_metadata(),
        ),
    )

    return model, machine

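# For orientation only: an illustrative sketch (values are made up, not from a real
# training run) of the ``scores`` mapping assembled in ``_build`` above and exposed
# as machine.metadata.build_metadata.model.cross_validation.scores. The key names
# mirror those asserted in the tests in this section; the variable name below is
# purely hypothetical.
example_cross_validation_scores = {
    "explained-variance-score": {
        # Aggregate statistics over the CV folds, as computed in ``_build``.
        "fold-mean": 0.91,
        "fold-std": 0.02,
        "fold-max": 0.93,
        "fold-min": 0.89,
        # One entry per fold, enumerated as fold-1, fold-2, ...
        "fold-1": 0.93,
        "fold-2": 0.89,
        "fold-3": 0.91,
    },
    # ... plus one entry per metric and per output tag, e.g. "r2-score-Tag-1"
}
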
def test_provide_saved_model_caching(
    should_be_equal: bool,
    metadata: Optional[Metadata],
    tag_list: Optional[List[SensorTag]],
    replace_cache,
    tmpdir,
):
    """
    Test provide_saved_model with caching and possible cache busting if
    tag_list or replace_cache is set.

    Builds two models and checks if their model-creation-dates are the same,
    which they will be if and only if there is caching.

    Parameters
    ----------
    should_be_equal: bool
        Do we expect the two generated models to be at the same location or not?
        I.e. do we expect caching?
    metadata: Optional[Metadata]
        Optional metadata which will be used as metadata for the second model.
    tag_list: Optional[List[SensorTag]]
        Optional list of sensor tags which will be used as the tag_list in the
        dataset for the second model.
    replace_cache: bool
        Should we force a model cache replacement?
    """
    if tag_list is None:
        tag_list = []
    if metadata is None:
        metadata = Metadata()

    model_config = {"sklearn.decomposition.pca.PCA": {"svd_solver": "auto"}}
    data_config = get_random_data()
    output_dir = os.path.join(tmpdir, "model")
    registry_dir = os.path.join(tmpdir, "registry")

    machine = Machine(
        name="model-name", dataset=data_config, model=model_config, project_name="test"
    )
    _, first_machine = ModelBuilder(machine).build(
        output_dir=output_dir, model_register_dir=registry_dir
    )

    if tag_list:
        data_config["tag_list"] = tag_list

    new_output_dir = os.path.join(tmpdir, "model2")
    _, second_machine = ModelBuilder(
        machine=Machine(
            name="model-name",
            dataset=data_config,
            model=model_config,
            metadata=metadata,
            project_name="test",
            runtime={"something": True},
        )
    ).build(
        output_dir=new_output_dir,
        model_register_dir=registry_dir,
        replace_cache=replace_cache,
    )

    model1_creation_date = (
        first_machine.metadata.build_metadata.model.model_creation_date
    )
    model2_creation_date = (
        second_machine.metadata.build_metadata.model.model_creation_date
    )
    assert "something" in second_machine.runtime

    if should_be_equal:
        assert model1_creation_date == model2_creation_date
    else:
        assert model1_creation_date != model2_creation_date

    if metadata is not None:
        assert metadata.user_defined == second_machine.metadata.user_defined

def build(
    self,
    output_dir: Optional[Union[os.PathLike, str]] = None,
    model_register_dir: Optional[Union[os.PathLike, str]] = None,
    replace_cache=False,
) -> Tuple[sklearn.base.BaseEstimator, Machine]:
    """
    Always return a model and its metadata.

    If ``output_dir`` is supplied, the model will be saved there.
    ``model_register_dir`` points to the model cache directory from which the
    model will be read, if present. Supplying both has the combined effect of
    reading from the cache and saving that cached model to the new output
    directory.

    Parameters
    ----------
    output_dir: Optional[Union[os.PathLike, str]]
        A path to where the model will be deposited.
    model_register_dir: Optional[Union[os.PathLike, str]]
        A path to a register, see :func:`gordo.util.disk_registry`.
        If this is None then always build the model, otherwise try to
        resolve the model from the registry.
    replace_cache: bool
        Forces a rebuild of the model, and replaces the entry in the cache
        with the new model.

    Returns
    -------
    Tuple[sklearn.base.BaseEstimator, Machine]
        Built model and an updated ``Machine``
    """
    if not model_register_dir:
        model, machine = self._build()
    else:
        logger.debug(
            f"Model caching activated, attempting to read model-location with key "
            f"{self.cache_key} from register {model_register_dir}"
        )
        self.cached_model_path = self.check_cache(model_register_dir)

        if replace_cache:
            logger.info("replace_cache=True, deleting any existing cache entry")
            disk_registry.delete_value(model_register_dir, self.cache_key)
            self.cached_model_path = None

        # Load the model from the previously cached directory
        if self.cached_model_path:
            model = serializer.load(self.cached_model_path)
            metadata = serializer.load_metadata(self.cached_model_path)
            metadata["metadata"]["user_defined"] = self.machine.metadata.user_defined
            metadata["runtime"] = self.machine.runtime
            machine = Machine(**metadata)

        # Otherwise build and cache the model
        else:
            model, machine = self._build()
            self.cached_model_path = self._save_model(
                model=model, machine=machine, output_dir=output_dir  # type: ignore
            )
            logger.info(f"Built model, and deposited at {self.cached_model_path}")
            logger.info("Writing model-location to model registry")
            disk_registry.write_key(  # type: ignore
                model_register_dir, self.cache_key, self.cached_model_path
            )

    # Save model to disk, if we're not building for cv only purposes.
    if output_dir and (self.machine.evaluation.get("cv_mode") != "cross_val_only"):
        self.cached_model_path = self._save_model(
            model=model, machine=machine, output_dir=output_dir
        )
    return model, machine

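# A minimal usage sketch of the caching behaviour described in ``build`` above,
# not part of gordo itself. The function name is hypothetical; the Machine config
# values are assumptions borrowed from the tests in this section.
def example_build_with_cache(tmpdir):
    machine = Machine(
        name="model-name",
        dataset=get_random_data(),
        model={"sklearn.decomposition.PCA": {"svd_solver": "auto"}},
        project_name="test",
    )
    registry_dir = os.path.join(tmpdir, "registry")

    builder = ModelBuilder(machine)
    # First build: trains the model, saves it to output_dir, and records its
    # location in the registry under builder.cache_key.
    builder.build(
        output_dir=os.path.join(tmpdir, "model1"), model_register_dir=registry_dir
    )
    # Second build with the same config: the model is resolved from the registry
    # instead of being retrained, then deposited at the new output_dir.
    builder.build(
        output_dir=os.path.join(tmpdir, "model2"), model_register_dir=registry_dir
    )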