def __set__(self, instance, value):
    if getattr(instance, "_strict", True):
        try:
            from_definition(value)
        except Exception as e:
            raise ValueError(f"Pipeline from definition failed: {e}")
    instance.__dict__[self.name] = value
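# Hedged usage sketch (not taken from the source): a minimal descriptor class built
# around the __set__ above, plus a hypothetical owner class, to show how assigning an
# invalid model definition raises immediately when `_strict` is enabled. The class
# names `ModelDefinition` and `MachineSpec` are illustrative assumptions; the import
# path for the serializer is assumed from the other snippets in this file.
from gordo import serializer


class ModelDefinition:
    def __set_name__(self, owner, name):
        self.name = name

    def __set__(self, instance, value):
        # Validate the definition up front when the owner is strict
        if getattr(instance, "_strict", True):
            try:
                serializer.from_definition(value)
            except Exception as e:
                raise ValueError(f"Pipeline from definition failed: {e}")
        instance.__dict__[self.name] = value


class MachineSpec:
    model = ModelDefinition()

    def __init__(self, model, strict=True):
        self._strict = strict
        self.model = model  # assignment triggers validation via from_definition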
def test_validation_error():
    config = """
    os.rmdir:
        path: /
    """
    definition = yaml.safe_load(config)
    with pytest.raises(ValueError):
        serializer.from_definition(definition)
def test_imputer_from_definition(config_str: str):
    """
    Ensure it plays well with the gordo serializer
    """
    config = yaml.safe_load(config_str)
    model = serializer.from_definition(config)
    if isinstance(model, Pipeline):
        assert isinstance(model.steps[-1][1], InfImputer)
    else:
        assert isinstance(model, InfImputer)
    serializer.from_definition(serializer.into_definition(model))
def __call__(self):
    """Build Keras model from specification"""
    if not all(k in self.kind for k in self._expected_keys):
        raise ValueError(
            f"Expected spec to have keys: {self._expected_keys}, but found {self.kind.keys()}"
        )
    logger.debug(f"Building model from spec: {self.kind}")

    model = serializer.from_definition(self.kind["spec"])

    # Load any compile kwargs as well, such as compile.optimizer which may map to class obj
    kwargs = serializer.from_definition(self.kind["compile"])
    model.compile(**kwargs)
    return model
def test_raw_keras_part_of_pipeline():
    """
    It should play well when tucked into a sklearn.pipeline.Pipeline
    """
    X, y = np.random.random((100, 4)), np.random.random((100, 1))
    config_str = """
    sklearn.pipeline.Pipeline:
        steps:
          - sklearn.decomposition.pca.PCA:
              n_components: 4
          - gordo.machine.model.models.KerasRawModelRegressor:
              kind:
                compile:
                  loss: mse
                  optimizer: adam
                spec:
                  tensorflow.keras.models.Sequential:
                    layers:
                      - tensorflow.keras.layers.Dense:
                          units: 4
                      - tensorflow.keras.layers.Dense:
                          units: 1
    """
    config = yaml.safe_load(config_str)
    pipe = serializer.from_definition(config)
    assert isinstance(pipe, Pipeline)

    pipe.fit(X, y)
    out = pipe.predict(X)
    assert len(out) == len(y)
def test_from_definition_test_model():
    config = """
    tests.gordo.serializer.definition_test_model.DefinitionTestModel:
        depth: "300"
    """
    definition = yaml.safe_load(config)
    model = serializer.from_definition(definition)
    assert type(model) == DefinitionTestModel
    assert model.depth == 300
def test_into_definition(variations_of_same_pipeline):
    expected_definition = """
    sklearn.pipeline.Pipeline:
        memory: null
        steps:
          - sklearn.decomposition._pca.PCA:
              copy: true
              iterated_power: auto
              n_components: 2
              random_state: null
              svd_solver: auto
              tol: 0.0
              whiten: false
          - sklearn.pipeline.FeatureUnion:
              n_jobs: null
              transformer_list:
                - sklearn.decomposition._pca.PCA:
                    copy: true
                    iterated_power: auto
                    n_components: 3
                    random_state: null
                    svd_solver: auto
                    tol: 0.0
                    whiten: false
                - sklearn.pipeline.Pipeline:
                    memory: null
                    steps:
                      - sklearn.preprocessing._data.MinMaxScaler:
                          copy: true
                          feature_range:
                            - 0
                            - 1
                      - sklearn.decomposition._truncated_svd.TruncatedSVD:
                          algorithm: randomized
                          n_components: 2
                          n_iter: 5
                          random_state: null
                          tol: 0.0
                    verbose: false
              transformer_weights: null
              verbose: false
          - gordo.machine.model.models.KerasAutoEncoder:
              kind: feedforward_hourglass
        verbose: false
    """
    expected_definition = yaml.safe_load(expected_definition)
    for pipe in variations_of_same_pipeline:
        definition = into_definition(from_definition(into_definition(pipe)))
        assert json.dumps(definition) == json.dumps(expected_definition), (
            f"Failed output:\n{definition}\nExpected:----------------\n{expected_definition}"
        )
def test_diff_detector_serializability(config):
    """
    Should play well with the gordo serializer
    """
    config = yaml.safe_load(config)
    model = serializer.from_definition(config)
    serializer.into_definition(model)
    serialized_bytes = serializer.dumps(model)
    serializer.loads(serialized_bytes)
def test_load_from_definition(definition):
    """
    Ensure serializer can load models which take other models as parameters.
    """
    X, y = np.random.random((10, 10)), np.random.random((10, 2))
    definition = yaml.load(definition, Loader=yaml.SafeLoader)
    model = serializer.from_definition(definition)
    assert isinstance(model, MultiOutputRegressor)
    model.fit(X, y)
    model.predict(X)
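# Hedged example (an assumption, not taken from the source): one plausible shape for
# the `definition` fixture used by the test above, where a MultiOutputRegressor takes
# another model as its `estimator` parameter. The exact fixture and the nested
# estimator choice are illustrative only.
import yaml
from gordo import serializer

nested_definition = """
sklearn.multioutput.MultiOutputRegressor:
  estimator:
    sklearn.ensemble.RandomForestRegressor:
      n_estimators: 10
"""
model = serializer.from_definition(yaml.safe_load(nested_definition))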
def test_into_from():
    """
    Pass Pipeline into definition, and then from that definition
    """
    from gordo.machine.model.transformer_funcs.general import multiply_by

    factories = register_model_builder.factories
    for model in factories.keys():
        for model_kind in factories[model].keys():
            pipe = Pipeline(
                [
                    ("step_0", PCA(n_components=2)),
                    (
                        "step_1",
                        FeatureUnion(
                            [
                                ("step_0", PCA(n_components=3)),
                                (
                                    "step_1",
                                    Pipeline(
                                        steps=[
                                            ("step_0", MinMaxScaler((0, 1))),
                                            ("step_1", TruncatedSVD(n_components=2)),
                                        ]
                                    ),
                                ),
                            ]
                        ),
                    ),
                    (
                        "step_2",
                        FunctionTransformer(func=multiply_by, kw_args={"factor": 1}),
                    ),
                    (
                        "step_3",
                        pydoc.locate(f"gordo.machine.model.models.{model}")(kind=model_kind),
                    ),
                ]
            )

            from_definition(into_definition(pipe))
def test_from_definition(self):
    for raw_yaml, model, model_kind in self.setup_gen():
        self.assertTrue(model)
        logger.info(raw_yaml)
        config = yaml.safe_load(raw_yaml)
        logger.debug("{}".format(config))

        config_clone = copy.deepcopy(config)  # To ensure no mutation occurs
        pipe = from_definition(config)

        # Test that the original config matches the one passed; no mutation
        self.assertEqual(config, config_clone)

        # Special test that a non-default argument holds, since a bare
        # 'key: ' in the YAML is evaluated to 'key=None'
        if "memory: /tmp" in raw_yaml:
            self.assertEqual(pipe.steps[2][1].transformer_list[1][1].memory, "/tmp")
        self._verify_pipe(pipe, model, model_kind)
def _build(self) -> Tuple[sklearn.base.BaseEstimator, Machine]:
    """
    Build the model using the current state of the Builder

    Returns
    -------
    Tuple[sklearn.base.BaseEstimator, Machine]
    """
    # Enforce random seed to 0 if not specified.
    self.set_seed(seed=self.machine.evaluation.get("seed", 0))

    # Get the dataset from config
    logger.debug(
        f"Initializing Dataset with config {self.machine.dataset.to_dict()}"
    )
    dataset = _get_dataset(self.machine.dataset.to_dict())

    logger.debug("Fetching training data")
    start = time.time()
    X, y = dataset.get_data()
    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {self.machine.model}")
    model = serializer.from_definition(self.machine.model)

    cv_duration_sec = None

    machine: Machine = Machine(
        name=self.machine.name,
        dataset=self.machine.dataset.to_dict(),
        metadata=self.machine.metadata,
        model=self.machine.model,
        project_name=self.machine.project_name,
        evaluation=self.machine.evaluation,
        runtime=self.machine.runtime,
    )

    split_metadata: Dict[str, Any] = dict()
    scores: Dict[str, Any] = dict()
    if self.machine.evaluation["cv_mode"].lower() in (
        "cross_val_only",
        "full_build",
    ):
        # Build up a metrics list.
        metrics_list = self.metrics_from_list(self.machine.evaluation.get("metrics"))

        # Cross validate
        if hasattr(model, "predict"):
            logger.debug("Starting cross validation")
            start = time.time()

            scaler = self.machine.evaluation.get("scoring_scaler")
            metrics_dict = self.build_metrics_dict(metrics_list, y, scaler=scaler)

            split_obj = serializer.from_definition(
                self.machine.evaluation.get(
                    "cv",
                    {"sklearn.model_selection.TimeSeriesSplit": {"n_splits": 3}},
                )
            )
            # Generate metadata about CV train, test splits
            split_metadata = ModelBuilder.build_split_dict(X, split_obj)

            cv_kwargs = dict(
                X=X, y=y, scoring=metrics_dict, return_estimator=True, cv=split_obj
            )
            if hasattr(model, "cross_validate"):
                cv = model.cross_validate(**cv_kwargs)
            else:
                cv = cross_validate(model, **cv_kwargs)

            for metric, test_metric in map(lambda k: (k, f"test_{k}"), metrics_dict):
                val = {
                    "fold-mean": cv[test_metric].mean(),
                    "fold-std": cv[test_metric].std(),
                    "fold-max": cv[test_metric].max(),
                    "fold-min": cv[test_metric].min(),
                }
                val.update(
                    {
                        f"fold-{i + 1}": raw_value
                        for i, raw_value in enumerate(cv[test_metric].tolist())
                    }
                )
                scores.update({metric: val})

            cv_duration_sec = time.time() - start
        else:
            logger.debug("Unable to score model, has no attribute 'predict'.")

        # If cross_val_only, return without fitting to the whole dataset
        if self.machine.evaluation["cv_mode"] == "cross_val_only":
            machine.metadata.build_metadata = BuildMetadata(
                model=ModelBuildMetadata(
                    cross_validation=CrossValidationMetaData(
                        cv_duration_sec=cv_duration_sec,
                        scores=scores,
                        splits=split_metadata,
                    )
                ),
                dataset=DatasetBuildMetadata(
                    query_duration_sec=time_elapsed_data,
                    dataset_meta=dataset.get_metadata(),
                ),
            )
            return model, machine

    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    # Build specific metadata
    machine.metadata.build_metadata = BuildMetadata(
        model=ModelBuildMetadata(
            model_offset=self._determine_offset(model, X),
            model_creation_date=str(
                datetime.datetime.now(datetime.timezone.utc).astimezone()
            ),
            model_builder_version=__version__,
            model_training_duration_sec=time_elapsed_model,
            cross_validation=CrossValidationMetaData(
                cv_duration_sec=cv_duration_sec,
                scores=scores,
                splits=split_metadata,
            ),
            model_meta=self._extract_metadata_from_model(model),
        ),
        dataset=DatasetBuildMetadata(
            query_duration_sec=time_elapsed_data,
            dataset_meta=dataset.get_metadata(),
        ),
    )
    return model, machine
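# Hedged illustration (an assumption, not taken from the source): an `evaluation`
# mapping shaped the way _build above reads it -- cv_mode, seed, metrics, cv, and
# scoring_scaler. The particular values, and the exact form expected by
# metrics_from_list, are assumptions for demonstration only.
evaluation = {
    "cv_mode": "full_build",  # or "cross_val_only" to skip the final fit
    "seed": 0,                # forwarded to set_seed
    "metrics": ["sklearn.metrics.r2_score"],
    "scoring_scaler": "sklearn.preprocessing.MinMaxScaler",
    # Cross-validation splitter as a serializer definition; _build falls back to
    # TimeSeriesSplit(n_splits=3) when this key is omitted.
    "cv": {"sklearn.model_selection.TimeSeriesSplit": {"n_splits": 3}},
}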
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild.
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    """
    if model_parameter and isinstance(machine_config["model"], str):
        parameters = dict(model_parameter)  # convert list of tuples to dict
        machine_config["model"] = expand_model(machine_config["model"], parameters)

    machine: Machine = Machine.from_config(
        machine_config, project_name=machine_config["project_name"]
    )
    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Register dir: {model_register_dir}")

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    machine.model = serializer.into_definition(
        serializer.from_definition(machine.model)
    )
    logger.info(f"Fully expanded model config: {machine.model}")

    builder = ModelBuilder(machine=machine)

    try:
        _, machine_out = builder.build(output_dir, model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        machine_out.report()
        logger.debug("Finished reporting.")

        if print_cv_scores:
            for score in get_all_score_strings(machine_out):
                print(score)

    except Exception as e:
        exit_code = EXCEPTION_TO_EXITCODE.get(e.__class__, 1)
        traceback.print_exc()
        sys.exit(exit_code)
    else:
        return 0
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
    exceptions_reporter_file: str,
    exceptions_report_level: str,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild.
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    exceptions_reporter_file: str
        JSON output file for exception information
    exceptions_report_level: str
        Details level for exception reporting
    """
    try:
        if model_parameter and isinstance(machine_config["model"], str):
            parameters = dict(model_parameter)  # convert list of tuples to dict
            machine_config["model"] = expand_model(machine_config["model"], parameters)

        machine: Machine = Machine.from_config(
            machine_config, project_name=machine_config["project_name"]
        )
        logger.info(f"Building, output will be at: {output_dir}")
        logger.info(f"Register dir: {model_register_dir}")

        # Convert the config into a pipeline, and back into definition to ensure
        # all default parameters are part of the config.
        logger.debug("Ensuring the passed model config is fully expanded.")
        machine.model = serializer.into_definition(
            serializer.from_definition(machine.model)
        )
        logger.info(f"Fully expanded model config: {machine.model}")

        builder = ModelBuilder(machine=machine)

        _, machine_out = builder.build(output_dir, model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        machine_out.report()
        logger.debug("Finished reporting.")

        if "err" in machine.name:
            raise FileNotFoundError("undefined_file.parquet")

        if print_cv_scores:
            for score in get_all_score_strings(machine_out):
                print(score)

    except Exception:
        traceback.print_exc()
        exc_type, exc_value, exc_traceback = sys.exc_info()
        exit_code = _exceptions_reporter.exception_exit_code(exc_type)
        if exceptions_reporter_file:
            _exceptions_reporter.safe_report(
                cast(
                    ReportLevel,
                    ReportLevel.get_by_name(
                        exceptions_report_level, ReportLevel.EXIT_CODE
                    ),
                ),
                exc_type,
                exc_value,
                exc_traceback,
                exceptions_reporter_file,
                max_message_len=2024 - 500,
            )
        sys.exit(exit_code)
    else:
        return 0
def from_dict(cls, config: Dict[str, Any]) -> "BaseReporter":
    """
    Reconstruct the reporter from a dict representation or a single
    import path if it doesn't require any init parameters.
    """
    return serializer.from_definition(config)
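# Hedged usage sketch (assumptions, not taken from the source): per the docstring
# above, a reporter can be rebuilt either from a full definition dict or from a bare
# import path when no init parameters are needed. It assumes from_dict is exposed as
# a classmethod on BaseReporter, and the reporter paths/parameters below are
# illustrative placeholders rather than confirmed gordo classes.
reporter = BaseReporter.from_dict(
    {"gordo.reporters.postgres.PostgresReporter": {"host": "localhost"}}
)
reporter = BaseReporter.from_dict("gordo.reporters.base.BaseReporter")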
def test_from_into():
    """
    Create pipeline from definition, and create from that definition
    """
    factories = register_model_builder.factories
    for model in factories.keys():
        for model_kind in factories[model].keys():
            definition = f"""
            sklearn.pipeline.Pipeline:
                steps:
                  - sklearn.decomposition.PCA:
                      n_components: 2
                      copy: true
                      whiten: false
                      svd_solver: auto
                      tol: 0.0
                      iterated_power: auto
                      random_state:
                  - sklearn.preprocessing._function_transformer.FunctionTransformer:
                      func: gordo.machine.model.transformer_funcs.general.multiply_by
                      kw_args:
                        factor: 1
                      inverse_func: gordo.machine.model.transformer_funcs.general.multiply_by
                      inv_kw_args:
                        factor: 1
                  - sklearn.pipeline.FeatureUnion:
                      transformer_list:
                        - sklearn.decomposition.PCA:
                            n_components: 3
                            copy: true
                            whiten: false
                            svd_solver: auto
                            tol: 0.0
                            iterated_power: auto
                            random_state:
                        - sklearn.pipeline.Pipeline:
                            steps:
                              - sklearn.preprocessing.MinMaxScaler:
                                  feature_range:
                                    - 0
                                    - 1
                                  copy: true
                              - sklearn.decomposition.truncated_svd.TruncatedSVD:
                                  n_components: 2
                                  algorithm: randomized
                                  n_iter: 5
                                  random_state:
                                  tol: 0.0
                            memory:
                            verbose: false
                      n_jobs: 1
                      transformer_weights:
                      verbose: false
                  - gordo.machine.model.models.{model}:
                      kind: {model_kind}
                memory:
                verbose: false
            """
            definition = yaml.safe_load(definition)
            pipe = from_definition(definition)
            into_definition(pipe)
def build_metrics_dict(
    metrics_list: list,
    y: pd.DataFrame,
    scaler: Optional[Union[TransformerMixin, str]] = None,
) -> dict:
    """
    Given a list of metrics that accept a true_y and pred_y as inputs, this returns a
    dictionary with keys in the form '{score}-{tag_name}' for each given target tag,
    and '{score}' for the average score across all target tags and folds. Values are
    the callable make_scorer(metric_wrapper(score)).

    Note: score in '{score}-{tag_name}' is a sklearn score function name with '_'
    replaced by '-', and tag_name corresponds to the given target tag name with ' '
    replaced by '-'.

    Parameters
    ----------
    metrics_list: list
        List of sklearn score functions
    y: pd.DataFrame
        Target data
    scaler: Optional[Union[TransformerMixin, str]]
        Scaler which will be fitted on y, and used to transform the data before
        scoring. Useful when the metrics are sensitive to the amplitude of the data,
        and you have multiple targets.

    Returns
    -------
    dict
    """
    if scaler:
        if isinstance(scaler, str) or isinstance(scaler, dict):
            scaler = serializer.from_definition(scaler)
        logger.debug("Fitting scaler for scoring purpose")
        scaler.fit(y)

    def _score_factory(metric_func=metrics.r2_score, col_index=0):
        def _score_per_tag(y_true, y_pred):
            # This function extracts the score for each given target_tag to
            # use as scoring argument in sklearn cross_validate, as the scoring
            # must return a single value.
            if hasattr(y_true, "values"):
                y_true = y_true.values
            if hasattr(y_pred, "values"):
                y_pred = y_pred.values
            return metric_func(y_true[:, col_index], y_pred[:, col_index])

        return _score_per_tag

    metrics_dict = {}
    for metric in metrics_list:
        for index, col in enumerate(y.columns):
            metric_str = metric.__name__.replace("_", "-")
            metrics_dict.update(
                {
                    metric_str + f'-{col.replace(" ", "-")}': metrics.make_scorer(
                        metric_wrapper(
                            _score_factory(metric_func=metric, col_index=index),
                            scaler=scaler,
                        )
                    )
                }
            )
        # Average score across all target tags for this metric
        metrics_dict.update(
            {metric_str: metrics.make_scorer(metric_wrapper(metric, scaler=scaler))}
        )
    return metrics_dict
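# Hedged illustration (an assumption, not taken from the source): the key naming
# scheme described in the docstring above, for a target frame with two tag columns
# and sklearn's r2_score. It assumes build_metrics_dict is reachable as a static
# method on ModelBuilder, as its use via self.build_metrics_dict in _build suggests.
# Values are make_scorer(...) callables; only the keys are checked here.
import pandas as pd
from sklearn.metrics import r2_score

y = pd.DataFrame([[0.1, 0.2], [0.3, 0.4]], columns=["Tag 1", "Tag 2"])
scorers = ModelBuilder.build_metrics_dict([r2_score], y)
# "r2-score-Tag-1" / "r2-score-Tag-2" are per-tag scorers; "r2-score" is the average
assert set(scorers) == {"r2-score-Tag-1", "r2-score-Tag-2", "r2-score"}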