def test_into_definition(variations_of_same_pipeline):
    """
    Round-trip each pipeline variation through the serializer and check that
    the result equals the fully expanded expected definition.
    """
    # The YAML must keep its block structure (one key per line, nested by
    # indentation); collapsed onto a single line it cannot be parsed.
    expected_definition = """
        sklearn.pipeline.Pipeline:
            memory: null
            steps:
                - sklearn.decomposition._pca.PCA:
                    copy: true
                    iterated_power: auto
                    n_components: 2
                    random_state: null
                    svd_solver: auto
                    tol: 0.0
                    whiten: false
                - sklearn.pipeline.FeatureUnion:
                    n_jobs: null
                    transformer_list:
                        - sklearn.decomposition._pca.PCA:
                            copy: true
                            iterated_power: auto
                            n_components: 3
                            random_state: null
                            svd_solver: auto
                            tol: 0.0
                            whiten: false
                        - sklearn.pipeline.Pipeline:
                            memory: null
                            steps:
                                - sklearn.preprocessing._data.MinMaxScaler:
                                    copy: true
                                    feature_range:
                                        - 0
                                        - 1
                                - sklearn.decomposition._truncated_svd.TruncatedSVD:
                                    algorithm: randomized
                                    n_components: 2
                                    n_iter: 5
                                    random_state: null
                                    tol: 0.0
                            verbose: false
                    transformer_weights: null
                    verbose: false
                - gordo.machine.model.models.KerasAutoEncoder:
                    kind: feedforward_hourglass
            verbose: false
        """
    expected_definition = yaml.safe_load(expected_definition)

    for pipe in variations_of_same_pipeline:
        # into -> from -> into: the definition must survive a full round trip.
        definition = into_definition(from_definition(into_definition(pipe)))

        # Both sides are compared as JSON text (no key sorting), so the order
        # of keys matters as well as their values.
        assert json.dumps(definition) == json.dumps(
            expected_definition
        ), f"Failed output:\n{definition}\nExpected:----------------\n{expected_definition}"
def test_diff_detector_serializability(config):
    """
    Should play well with the gordo serializer
    """
    # safe_load instead of yaml.load: calling yaml.load without an explicit
    # Loader is deprecated (and rejected by newer PyYAML releases), and the
    # config only needs plain YAML types.
    config = yaml.safe_load(config)

    # Definition -> model -> definition
    model = serializer.from_definition(config)
    serializer.into_definition(model)

    # Model -> bytes -> model
    serialized_bytes = serializer.dumps(model)
    serializer.loads(serialized_bytes)
def test_from_definition_test_model():
    """
    Serializing a ``DefinitionTestModel(300)`` should give a single-entry
    definition keyed by the model's import path.
    """
    definition = into_definition(DefinitionTestModel(300))

    model_path = "tests.gordo.serializer.definition_test_model.DefinitionTestModel"
    expected_definition = {model_path: {"depth": 300}}

    assert expected_definition == definition
def to_dict(self) -> dict:
    """
    Return the dict representation of this object.

    The conversion is delegated entirely to ``serializer.into_definition``.

    Returns
    -------
    dict
    """
    definition = serializer.into_definition(self)
    return definition
def test_imputer_from_definition(config_str: str):
    """
    The imputer config should load through the gordo serializer and survive
    a definition round trip.
    """
    model = serializer.from_definition(yaml.safe_load(config_str))

    # When the config describes a pipeline, the object under test is the
    # estimator of its last step; otherwise it is the model itself.
    imputer = model.steps[-1][1] if isinstance(model, Pipeline) else model
    assert isinstance(imputer, InfImputer)

    round_tripped_definition = serializer.into_definition(model)
    serializer.from_definition(round_tripped_definition)
def test_captures_kwarg_to_init():
    """
    Our models accept extra kwargs which are passed on to the underlying
    keras model or used to construct it. Check that into_definition captures
    such kwargs even though they are not part of the __init__ signature.
    """
    ae = KerasAutoEncoder(kind="feedforward_hourglass", some_fancy_param="Howdy!")

    # The definition is keyed by the fully qualified class path
    model_path = f"{KerasAutoEncoder.__module__}.{KerasAutoEncoder.__name__}"
    parameters = into_definition(ae)[model_path]

    assert "some_fancy_param" in parameters
    assert parameters["some_fancy_param"] == "Howdy!"

    # The captured parameters must be enough to construct the model again
    KerasAutoEncoder(**parameters)
def test_into_from():
    """
    For every registered model / kind pair: build a Pipeline, turn it into a
    definition, and build a pipeline again from that definition.
    """
    from gordo.machine.model.transformer_funcs.general import multiply_by

    for model, kinds in register_model_builder.factories.items():
        for model_kind in kinds:
            # Innermost pipeline, used as one branch of the feature union
            scale_then_svd = Pipeline(
                steps=[
                    ("step_0", MinMaxScaler((0, 1))),
                    ("step_1", TruncatedSVD(n_components=2)),
                ]
            )
            union = FeatureUnion(
                [
                    ("step_0", PCA(n_components=3)),
                    ("step_1", scale_then_svd),
                ]
            )
            multiplier = FunctionTransformer(func=multiply_by, kw_args={"factor": 1})

            # Resolve the model class by name and instantiate it for this kind
            model_class = pydoc.locate(f"gordo.machine.model.models.{model}")
            estimator = model_class(kind=model_kind)

            pipe = Pipeline(
                [
                    ("step_0", PCA(n_components=2)),
                    ("step_1", union),
                    ("step_2", multiplier),
                    ("step_3", estimator),
                ]
            )

            from_definition(into_definition(pipe))
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
    exceptions_reporter_file: str,
    exceptions_report_level: str,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    exceptions_reporter_file: str
        JSON output file for exception information
    exceptions_report_level: str
        Details level for exception reporting
    """
    # NOTE(review): the docstring is kept close to the original because it is
    # presumably rendered as CLI help text (note the "\b" marker) - confirm
    # before extending it.
    #
    # The whole build runs inside one try block so that any failure, including
    # config parsing, gets an exit code and (optionally) a report file.
    try:
        if model_parameter and isinstance(machine_config["model"], str):
            parameters = dict(model_parameter)  # convert list of tuples to dict
            machine_config["model"] = expand_model(machine_config["model"], parameters)

        machine: Machine = Machine.from_config(
            machine_config, project_name=machine_config["project_name"]
        )

        logger.info(f"Building, output will be at: {output_dir}")
        logger.info(f"Register dir: {model_register_dir}")

        # Convert the config into a pipeline, and back into definition to ensure
        # all default parameters are part of the config.
        logger.debug("Ensuring the passed model config is fully expanded.")
        machine.model = serializer.into_definition(
            serializer.from_definition(machine.model)
        )
        logger.info(f"Fully expanded model config: {machine.model}")

        builder = ModelBuilder(machine=machine)

        _, machine_out = builder.build(output_dir, model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        machine_out.report()
        logger.debug("Finished reporting.")

        # A previous revision raised FileNotFoundError here for every machine
        # whose name contained "err". That made successfully built machines
        # fail purely because of their name, so it has been removed.

        if print_cv_scores:
            for score in get_all_score_strings(machine_out):
                print(score)

    except Exception:
        traceback.print_exc()
        exc_type, exc_value, exc_traceback = sys.exc_info()

        # The exit code is derived from the exception type
        exit_code = _exceptions_reporter.exception_exit_code(exc_type)
        if exceptions_reporter_file:
            _exceptions_reporter.safe_report(
                cast(
                    ReportLevel,
                    # ReportLevel.EXIT_CODE is passed as the second argument,
                    # presumably the fallback for unknown level names
                    ReportLevel.get_by_name(
                        exceptions_report_level, ReportLevel.EXIT_CODE
                    ),
                ),
                exc_type,
                exc_value,
                exc_traceback,
                exceptions_reporter_file,
                max_message_len=2024 - 500,
            )
        sys.exit(exit_code)
    else:
        return 0
def test_from_into():
    """
    Create pipeline from definition, and create from that definition
    """
    factories = register_model_builder.factories
    for model in factories.keys():
        for model_kind in factories[model].keys():
            # The YAML must keep its block structure to be parseable.
            # TruncatedSVD is referenced through the public
            # ``sklearn.decomposition`` namespace (like PCA below) rather than
            # the legacy private module ``sklearn.decomposition.truncated_svd``.
            definition = f"""
                sklearn.pipeline.Pipeline:
                    steps:
                        - sklearn.decomposition.PCA:
                            n_components: 2
                            copy: true
                            whiten: false
                            svd_solver: auto
                            tol: 0.0
                            iterated_power: auto
                            random_state:
                        - sklearn.preprocessing._function_transformer.FunctionTransformer:
                            func: gordo.machine.model.transformer_funcs.general.multiply_by
                            kw_args:
                                factor: 1
                            inverse_func: gordo.machine.model.transformer_funcs.general.multiply_by
                            inv_kw_args:
                                factor: 1
                        - sklearn.pipeline.FeatureUnion:
                            transformer_list:
                                - sklearn.decomposition.PCA:
                                    n_components: 3
                                    copy: true
                                    whiten: false
                                    svd_solver: auto
                                    tol: 0.0
                                    iterated_power: auto
                                    random_state:
                                - sklearn.pipeline.Pipeline:
                                    steps:
                                        - sklearn.preprocessing.MinMaxScaler:
                                            feature_range:
                                                - 0
                                                - 1
                                            copy: true
                                        - sklearn.decomposition.TruncatedSVD:
                                            n_components: 2
                                            algorithm: randomized
                                            n_iter: 5
                                            random_state:
                                            tol: 0.0
                                    memory:
                                    verbose: false
                            n_jobs: 1
                            transformer_weights:
                            verbose: false
                        - gordo.machine.model.models.{model}:
                            kind: {model_kind}
                    memory:
                    verbose: false
                """
            definition = yaml.safe_load(definition)

            # Definition -> pipeline -> definition; passes if neither step raises
            pipe = from_definition(definition)
            into_definition(pipe)
def build(
    machine_config: dict,
    output_dir: str,
    model_register_dir: click.Path,
    print_cv_scores: bool,
    model_parameter: List[Tuple[str, Any]],
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    machine_config: dict
        A dict loadable by :class:`gordo.machine.Machine.from_config`
    output_dir: str
        Directory to save model & metadata to.
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple[str, Any]]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    """
    model_is_template = isinstance(machine_config["model"], str)
    if model_parameter and model_is_template:
        # model_parameter arrives as a list of (key, value) tuples
        machine_config["model"] = expand_model(
            machine_config["model"], dict(model_parameter)
        )

    machine: Machine = Machine.from_config(
        machine_config, project_name=machine_config["project_name"]
    )

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Register dir: {model_register_dir}")

    # Round-trip the model config through a pipeline object so that every
    # default parameter ends up in the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    pipeline = serializer.from_definition(machine.model)
    machine.model = serializer.into_definition(pipeline)
    logger.info(f"Fully expanded model config: {machine.model}")

    builder = ModelBuilder(machine=machine)

    # Only the build, reporting and score printing are mapped to exit codes;
    # errors raised above propagate as-is.
    try:
        _, built_machine = builder.build(output_dir, model_register_dir)  # type: ignore

        logger.debug("Reporting built machine.")
        built_machine.report()
        logger.debug("Finished reporting.")

        if print_cv_scores:
            for score_line in get_all_score_strings(built_machine):
                print(score_line)

    except Exception as error:
        # Exception classes missing from the table fall back to exit code 1
        exit_code = EXCEPTION_TO_EXITCODE.get(error.__class__, 1)
        traceback.print_exc()
        sys.exit(exit_code)
    else:
        return 0