def test_build_cv_mode_cross_val_cache(
    tmpdir,
    should_save_model: bool,
    cv_mode_1: str,
    cv_mode_2: str,
    runner: CliRunner,
    machine: Machine,
):
    """
    Checks that cv_scores uses the cache when run after a full build. Loads
    the same model, and can print the cv_scores from it.
    """
    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")

    machine.evaluation = cv_mode_1  # type: ignore
    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()), OUTPUT_DIR=str(tmpdir)):
        runner.invoke(cli.gordo, ["build"])

    machine.evaluation = cv_mode_2  # type: ignore
    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()), OUTPUT_DIR=str(tmpdir)):
        runner.invoke(cli.gordo, ["build"])

    if should_save_model:
        assert len(os.listdir(tmpdir)) > 0
    else:
        assert len(os.listdir(tmpdir)) == 0
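# The builds above rely on a `temp_env_vars` helper (not shown in this
# section) to inject MACHINE and OUTPUT_DIR into the environment. A minimal
# sketch of such a context manager, assuming an implementation on top of
# unittest.mock.patch.dict; the real helper may differ.
import os
from contextlib import contextmanager
from unittest import mock


@contextmanager
def temp_env_vars(**env):
    """Temporarily set environment variables, restoring os.environ on exit."""
    # patch.dict applies the overrides to os.environ and restores the
    # original mapping when the context exits, even on error.
    with mock.patch.dict(os.environ, {k: str(v) for k, v in env.items()}):
        yield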
def log_machine(mlflow_client: MlflowClient, run_id: str, machine: Machine):
    """
    Send logs to the configured MLflow backend.

    Parameters
    ----------
    mlflow_client: MlflowClient
        Client instance to call logging methods from.
    run_id: str
        Unique ID of the MLflow Run to log to.
    machine: Machine
        Machine to log with MlflowClient.
    """
    # Log machine metrics and params
    for batch_kwargs in batch_log_items(*get_machine_log_items(machine)):
        mlflow_client.log_batch(run_id, **batch_kwargs)

    # Send configs as JSON artifacts
    try:
        with tempfile.TemporaryDirectory(dir="./") as tmp_dir:
            fp = os.path.join(tmp_dir, "metadata.json")
            with open(fp, "w") as fh:
                json.dump(machine.to_dict(), fh, cls=MachineEncoder)
            mlflow_client.log_artifacts(run_id=run_id, local_dir=tmp_dir)
    # Map to MlflowLoggingError for coding errors in the model builder
    except Exception as e:
        raise MlflowLoggingError(e) from e
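# `batch_log_items` (not shown here) chunks the metrics and params from
# `get_machine_log_items` so each call stays within MLflow's per-request
# entity limits. A rough, hypothetical sketch; the batch-size constant and
# the exact signature are assumptions, not taken from the gordo source.
from typing import Iterator, List

MAX_ENTITIES_PER_BATCH = 1000  # assumed cap on entities per log_batch call


def batch_log_items(metrics: List, params: List) -> Iterator[dict]:
    """Yield kwargs dicts usable as mlflow_client.log_batch(run_id, **kwargs)."""
    tagged = [("metrics", m) for m in metrics] + [("params", p) for p in params]
    for i in range(0, len(tagged), MAX_ENTITIES_PER_BATCH):
        chunk = tagged[i : i + MAX_ENTITIES_PER_BATCH]
        yield {
            "metrics": [v for kind, v in chunk if kind == "metrics"],
            "params": [v for kind, v in chunk if kind == "params"],
        }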
def report(self, machine: GordoMachine):
    """
    Log a machine to Postgres, where the top-level keys 'name', 'dataset',
    'model', and 'metadata' map to BinaryJSON fields.

    Parameters
    ----------
    machine: gordo.machine.Machine

    Returns
    -------
    None
    """
    try:
        with self.db.atomic():
            logger.info(f"Inserting machine {machine.name} in sql")  # type: ignore
            # Ensure it's serializable using MachineEncoder
            record = json.loads(json.dumps(machine.to_dict(), cls=MachineEncoder))
            model = dict_to_model(Machine, record, ignore_unknown=True)
            try:
                Machine.get(Machine.name == machine.name)
            except peewee.DoesNotExist:
                model.save()
            else:
                query = Machine.update(**model_to_dict(model)).where(
                    Machine.name == machine.name
                )
                query.execute()
    except Exception as exc:
        raise PostgresReporterException(exc) from exc
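# The peewee `Machine` model written to above is assumed to look roughly like
# the following, based on the docstring's mention of BinaryJSON fields; the
# actual schema definition is not shown in this section and may differ.
import peewee
from playhouse.postgres_ext import BinaryJSONField, PostgresqlExtDatabase

db = PostgresqlExtDatabase(None)  # deferred; bound to a real database by the reporter


class Machine(peewee.Model):
    name = peewee.CharField(primary_key=True)
    dataset = BinaryJSONField()
    model = BinaryJSONField()
    metadata = BinaryJSONField()

    class Meta:
        database = db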
def __init__(self, machine: Machine):
    """
    Build a model for a given
    :class:`gordo.workflow.config_elements.machine.Machine`

    Parameters
    ----------
    machine: Machine

    Example
    -------
    >>> from gordo_dataset.sensor_tag import SensorTag
    >>> from gordo.machine import Machine
    >>> from gordo.dependencies import configure_once
    >>> configure_once()
    >>> machine = Machine(
    ...     name="special-model-name",
    ...     model={"sklearn.decomposition.PCA": {"svd_solver": "auto"}},
    ...     dataset={
    ...         "type": "RandomDataset",
    ...         "train_start_date": "2017-12-25 06:00:00Z",
    ...         "train_end_date": "2017-12-30 06:00:00Z",
    ...         "tag_list": [SensorTag("Tag 1", None), SensorTag("Tag 2", None)],
    ...         "target_tag_list": [SensorTag("Tag 3", None), SensorTag("Tag 4", None)],
    ...     },
    ...     project_name="test-proj",
    ... )
    >>> builder = ModelBuilder(machine=machine)
    >>> model, machine = builder.build()
    """
    # Avoid mutating the passed machine: copy doesn't work if it holds a
    # reference to a loaded Tensorflow model, so round-trip through
    # .to_dict(), which serializes it to a primitive dict representation.
    self.machine = Machine(**machine.to_dict())
def test_build_cv_mode(
    tmpdir, runner: CliRunner, should_save_model: bool, cv_mode: str, machine: Machine
):
    """
    Test build with cv_mode set to full and cross_val_only. Checks that
    cv_scores are printed, and that the model is only saved when using the
    default (full) value.
    """
    machine.model = MODEL_CONFIG_WITH_PREDICT
    machine.evaluation = cv_mode  # type: ignore

    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")

    tmp_model_dir = os.path.join(tmpdir, "tmp")
    os.makedirs(tmp_model_dir, exist_ok=True)

    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()), OUTPUT_DIR=tmp_model_dir):
        result = runner.invoke(cli.gordo, ["build", "--print-cv-scores"])
        assert result.exit_code == 0

        # The model directory is only populated when the mode saves a model
        if should_save_model:
            assert len(os.listdir(tmp_model_dir)) != 0
        else:
            assert len(os.listdir(tmp_model_dir)) == 0

        # The printed cv_scores include the standard regression metrics
        assert "r2-score" in result.output
        assert "mean-squared-error" in result.output
        assert "mean-absolute-error" in result.output
        assert "explained-variance-score" in result.output
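# `should_save_model` and `cv_mode` are supplied by parametrization not shown
# in this section. A plausible pairing, sketched as a hypothetical stub: the
# mode strings "full_build" and "cross_val_only" are assumptions based on the
# docstring, with dict values matching the build_only test below.
import pytest


@pytest.mark.parametrize(
    "should_save_model,cv_mode",
    [
        (True, {"cv_mode": "full_build"}),  # default: cross-validate and save
        (False, {"cv_mode": "cross_val_only"}),  # cross-validate only, no model saved
    ],
)
def test_build_cv_mode(tmpdir, runner, should_save_model, cv_mode, machine):
    ...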
def test_client_get_dataset(gordo_project, metadata, ml_server):
    data_provider = providers.RandomDataProvider(min_size=10)
    client = Client(project=gordo_project, data_provider=data_provider)
    start = isoparse("2016-01-01T00:00:00+00:00")
    end = isoparse("2016-01-01T12:00:00+00:00")
    machine = Machine(**metadata)
    assert type(machine.dataset) is TimeSeriesDataset

    # Set non-default filtering options on the source machine's dataset
    machine.dataset.row_filter_buffer_size = 12
    machine.dataset.n_samples_threshold = 10

    # The client resets row filtering and thresholds when building the dataset
    client_machine = ClientMachine(**machine.to_dict())
    dataset = client._get_dataset(client_machine, start, end)
    assert dataset.row_filter_buffer_size == 0
    assert dataset.n_samples_threshold == 0
    assert dataset.low_threshold is None
    assert dataset.high_threshold is None
def test_build_cv_mode_build_only(tmpdir, runner: CliRunner, machine: Machine):
    """
    Test build with cv_mode set to build_only. Checks that OUTPUT_DIR gets a
    model saved to it, and that the metadata contains cv_duration_sec=None
    and an empty cross-validation scores dict.
    """
    logger.info(f"MODEL_CONFIG={json.dumps(machine.model)}")
    machine.evaluation = {"cv_mode": "build_only"}

    with temp_env_vars(MACHINE=json.dumps(machine.to_dict()), OUTPUT_DIR=str(tmpdir)):
        metadata_file = os.path.join(tmpdir, "metadata.json")
        runner.invoke(cli.gordo, ["build"])

        # A model has been saved
        assert len(os.listdir(tmpdir)) != 0
        with open(metadata_file) as f:
            metadata_json = json.loads(f.read())
            cross_validation = metadata_json["metadata"]["build_metadata"]["model"][
                "cross_validation"
            ]
            assert cross_validation["cv_duration_sec"] is None
            assert cross_validation["scores"] == {}