def test_imputer_from_definition(config_str: str):
    """
    Ensure it plays well with the gordo serializer
    """
    config = yaml.safe_load(config_str)
    model = serializer.pipeline_from_definition(config)
    if isinstance(model, Pipeline):
        assert isinstance(model.steps[-1][1], InfImputer)
    else:
        assert isinstance(model, InfImputer)
    serializer.pipeline_from_definition(serializer.pipeline_into_definition(model))
def test_into_from(self):
    """
    Pass Pipeline into definition, and then from that definition
    """
    from gordo_components.model.transformer_funcs.general import multiply_by

    self.factories = register_model_builder.factories
    for model in self.factories.keys():
        for model_kind in self.factories[model].keys():
            pipe = Pipeline(
                [
                    ("step_0", PCA(n_components=2)),
                    (
                        "step_1",
                        FeatureUnion(
                            [
                                ("step_0", PCA(n_components=3)),
                                (
                                    "step_1",
                                    Pipeline(
                                        steps=[
                                            ("step_0", MinMaxScaler((0, 1))),
                                            (
                                                "step_1",
                                                TruncatedSVD(n_components=2),
                                            ),
                                        ]
                                    ),
                                ),
                            ]
                        ),
                    ),
                    (
                        "step_2",
                        FunctionTransformer(
                            func=multiply_by, kw_args={"factor": 1}
                        ),
                    ),
                    (
                        "step_3",
                        pydoc.locate(f"gordo_components.model.models.{model}")(
                            kind=model_kind
                        ),
                    ),
                ]
            )

            pipeline_from_definition(pipeline_into_definition(pipe))
def trained_model_directory(sensors: List[SensorTag]):
    """
    Fixture: Train a basic AutoEncoder and save it to a given directory;
    will also save some metadata with the model
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        definition = ruamel.yaml.load(
            """
            sklearn.pipeline.Pipeline:
                steps:
                    - sklearn.preprocessing.data.MinMaxScaler
                    - gordo_components.model.models.KerasAutoEncoder:
                        kind: feedforward_hourglass
                memory:
            """,
            Loader=ruamel.yaml.Loader,
        )
        model = serializer.pipeline_from_definition(definition)
        X = np.random.random(size=len(sensors) * 10).reshape(10, len(sensors))
        model.fit(X, X)
        serializer.dump(
            model,
            tmp_dir,
            metadata={
                "dataset": {"tag_list": sensors, "resolution": "10T"},
                "user-defined": {
                    "model-name": "test-model",
                    "machine-name": "machine-1",
                },
            },
        )
        yield tmp_dir
def test_raw_keras_part_of_pipeline():
    """
    It should play well when tucked into a sklearn.pipeline.Pipeline
    """
    X, y = np.random.random((100, 4)), np.random.random((100, 1))

    config_str = """
    sklearn.pipeline.Pipeline:
        steps:
            - sklearn.decomposition.pca.PCA:
                n_components: 4
            - gordo_components.model.models.KerasRawModelRegressor:
                kind:
                    compile:
                        loss: mse
                        optimizer: adam
                    spec:
                        tensorflow.keras.models.Sequential:
                            layers:
                                - tensorflow.keras.layers.Dense:
                                    units: 4
                                - tensorflow.keras.layers.Dense:
                                    units: 1
    """
    config = yaml.safe_load(config_str)
    pipe = serializer.pipeline_from_definition(config)
    assert isinstance(pipe, Pipeline)

    pipe.fit(X, y)
    out = pipe.predict(X)
    assert len(out) == len(y)
def build_model(resampled_dataframe, epochs=5, batch_size=10):
    config = yaml.safe_load(
        f"""
        sklearn.pipeline.Pipeline:
            steps:
                - sklearn.preprocessing.data.MinMaxScaler
                - gordo_components.model.models.KerasAutoEncoder:
                    kind: feedforward_hourglass
                    epochs: {epochs}
                    batch_size: {batch_size}
        """
    )
    pipe = serializer.pipeline_from_definition(config)

    print("Fit model to first part of data")
    train_until = int(len(resampled_dataframe) / 2)
    model = pipe.fit(resampled_dataframe.iloc[:train_until])

    print("Run data through model for prediction")
    predicted_data = model.predict(resampled_dataframe)

    # Inverse transform the model pipeline, since the autoencoders are a bit weird
    # with regards to their output (currently)
    predicted_data = model_io.get_inverse_transformed_input(model, predicted_data)

    anomalies = make_anomalies(resampled_dataframe, predicted_data)
    anomalies = pd.DataFrame(anomalies, index=resampled_dataframe.index)
    anomalies_mean_training = anomalies.iloc[:train_until].mean()[0]

    return anomalies, anomalies_mean_training, predicted_data, train_until
def test_from_into(self):
    """
    Create a pipeline from a definition, and then convert it back into a definition
    """
    self.factories = register_model_builder.factories
    for model in self.factories.keys():
        for model_kind in self.factories[model].keys():
            definition = f"""
                sklearn.pipeline.Pipeline:
                    steps:
                        - sklearn.decomposition.pca.PCA:
                            n_components: 2
                            copy: true
                            whiten: false
                            svd_solver: auto
                            tol: 0.0
                            iterated_power: auto
                            random_state:
                        - sklearn.preprocessing._function_transformer.FunctionTransformer:
                            func: gordo_components.model.transformer_funcs.general.multiply_by
                            kw_args:
                                factor: 1
                            inverse_func: gordo_components.model.transformer_funcs.general.multiply_by
                            inv_kw_args:
                                factor: 1
                        - sklearn.pipeline.FeatureUnion:
                            transformer_list:
                                - sklearn.decomposition.pca.PCA:
                                    n_components: 3
                                    copy: true
                                    whiten: false
                                    svd_solver: auto
                                    tol: 0.0
                                    iterated_power: auto
                                    random_state:
                                - sklearn.pipeline.Pipeline:
                                    steps:
                                        - sklearn.preprocessing.data.MinMaxScaler:
                                            feature_range:
                                                - 0
                                                - 1
                                            copy: true
                                        - sklearn.decomposition.truncated_svd.TruncatedSVD:
                                            n_components: 2
                                            algorithm: randomized
                                            n_iter: 5
                                            random_state:
                                            tol: 0.0
                                    memory:
                            n_jobs: 1
                            transformer_weights:
                        - gordo_components.model.models.{model}:
                            kind: {model_kind}
                    memory:
            """
            definition = ruamel.yaml.load(definition, Loader=ruamel.yaml.Loader)
            pipe = pipeline_from_definition(definition)
            pipeline_into_definition(pipe)
def test_load_from_definition(definition):
    """
    Ensure serializer can load models which take other models as parameters.
    """
    X, y = np.random.random((10, 10)), np.random.random((10, 2))
    definition = yaml.load(definition, Loader=yaml.SafeLoader)
    model = serializer.pipeline_from_definition(definition)
    assert isinstance(model, MultiOutputRegressor)
    model.fit(X, y)
    model.predict(X)
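# The `definition` fixture is parametrized and not shown here. As an illustration
# only (not taken from the source), one plausible shape for such a definition is a
# MultiOutputRegressor whose 'estimator' parameter is itself a nested model
# definition, in the same nested-YAML style used by the other tests:
example_definition = """
sklearn.multioutput.MultiOutputRegressor:
    estimator:
        sklearn.linear_model.LinearRegression:
            fit_intercept: true
"""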
def test_diff_detector_serializability(config):
    """
    Should play well with the gordo serializer
    """
    config = yaml.safe_load(config)

    model = serializer.pipeline_from_definition(config)
    serializer.pipeline_into_definition(model)
    serialized_bytes = serializer.dumps(model)
    serializer.loads(serialized_bytes)
def keras_from_spec(spec: dict):
    _expected_keys = ("spec", "compile")
    if not all(k in spec for k in _expected_keys):
        raise ValueError(
            f"Expected spec to have keys: {_expected_keys}, but found {spec.keys()}"
        )
    logger.debug(f"Building model from spec: {spec}")

    model = serializer.pipeline_from_definition(spec["spec"])
    model.compile(**spec["compile"])
    return model
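# A minimal sketch of a spec accepted by keras_from_spec, mirroring the 'kind'
# mapping of the KerasRawModelRegressor config shown earlier; the layer sizes
# here are illustrative, not taken from the source.
spec = {
    "compile": {"loss": "mse", "optimizer": "adam"},
    "spec": {
        "tensorflow.keras.models.Sequential": {
            "layers": [
                {"tensorflow.keras.layers.Dense": {"units": 4}},
                {"tensorflow.keras.layers.Dense": {"units": 1}},
            ]
        }
    },
}
model = keras_from_spec(spec)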
def trained_model_directory(
    gordo_project: str, gordo_name: str, sensors: List[SensorTag]
):
    """
    Fixture: Train a basic AutoEncoder and save it to a given directory;
    will also save some metadata with the model
    """
    with tempfile.TemporaryDirectory() as model_dir:

        # This is a model collection directory
        collection_dir = os.path.join(model_dir, gordo_project)

        # Model specific to the model being trained here
        model_dir = os.path.join(collection_dir, gordo_name)
        os.makedirs(model_dir, exist_ok=True)

        definition = ruamel.yaml.load(
            """
            gordo_components.model.anomaly.diff.DiffBasedAnomalyDetector:
                base_estimator:
                    sklearn.pipeline.Pipeline:
                        steps:
                            - sklearn.preprocessing.data.MinMaxScaler
                            - gordo_components.model.models.KerasAutoEncoder:
                                kind: feedforward_hourglass
                        memory:
            """,
            Loader=ruamel.yaml.Loader,
        )
        model = serializer.pipeline_from_definition(definition)
        X = np.random.random(size=len(sensors) * 10).reshape(10, len(sensors))
        model.fit(X, X)
        serializer.dump(
            model,
            model_dir,
            metadata={
                "dataset": {
                    "tag_list": sensors,
                    "resolution": "10T",
                    "target_tag_list": sensors,
                },
                "name": "machine-1",
                "model": {"model-offset": 0},
                "user-defined": {"model-name": "test-model"},
            },
        )
        yield collection_dir
def test_pipeline_from_definition(self):
    for raw_yaml, model, model_kind in self.setup_gen():
        self.assertTrue(model)
        logger.info(raw_yaml)
        config = yaml.safe_load(raw_yaml)
        logger.debug("{}".format(config))

        config_clone = copy.deepcopy(config)  # To ensure no mutation occurs

        pipe = pipeline_from_definition(config)

        # Test that the original config matches the one passed; no mutation
        self.assertEqual(config, config_clone)

        # Special test that defining a non-default argument holds, since a bare
        # 'key: ' entry is evaluated as 'key=None'
        if "memory: /tmp" in raw_yaml:
            self.assertEqual(pipe.steps[2][1].transformer_list[1][1].memory, "/tmp")

        self._verify_pipe(pipe, model, model_kind)
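# The 'key: ' behaviour referenced in the comment above comes straight from YAML
# parsing: a bare key with no value loads as None, which is why 'memory:' in a
# definition is treated as memory=None while 'memory: /tmp' sets it to "/tmp".
import yaml

assert yaml.safe_load("memory: ") == {"memory": None}
assert yaml.safe_load("memory: /tmp") == {"memory": "/tmp"}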
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also contain kwarg
        'type' which references the dataset to use. ie. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """
    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines
    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)

    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug("Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))

    model_location = provide_saved_model(
        name, model_config, data_config, metadata, output_dir, model_register_dir
    )

    # If the model is cached but without CV scores then we force a rebuild. We do this
    # by deleting the entry in the cache and then rerun `provide_saved_model`
    # (leaving the old model laying around)
    if print_cv_scores:
        saved_metadata = load_metadata(model_location)
        all_scores = get_all_score_strings(saved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )
            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)

        for score in all_scores:
            print(score)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
):
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are
        to be used in its initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as model_config.
    metadata: dict
        Mapping of arbitrary metadata.

    Returns
    -------
    Tuple[sklearn.base.BaseEstimator, dict]
    """
    # Get the dataset from config
    logger.debug(f"Initializing Dataset with config {data_config}")

    dataset = (
        data_config
        if isinstance(data_config, GordoBaseDataset)
        else _get_dataset(data_config)
    )

    logger.debug("Fetching training data")
    start = time.time()
    X, y = dataset.get_data()
    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    # Cross validate
    logger.debug("Starting to do cross validation")
    start = time.time()

    scores: Dict[str, Any]
    if hasattr(model, "score"):
        cv_scores = cross_val_score(model, X, y, cv=TimeSeriesSplit(n_splits=3))
        scores = {
            "explained-variance": {
                "mean": cv_scores.mean(),
                "std": cv_scores.std(),
                "max": cv_scores.max(),
                "min": cv_scores.min(),
                "raw-scores": cv_scores.tolist(),
            }
        }
    else:
        logger.debug("Unable to score model, has no attribute 'score'.")
        scores = dict()

    cv_duration_sec = time.time() - start

    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {"cv-duration-sec": cv_duration_sec, "scores": scores},
    }

    gordobase_final_step = _get_final_gordo_base_step(model)
    if gordobase_final_step:
        metadata["model"].update(gordobase_final_step.get_metadata())

    return model, metadata
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
    evaluation_config: dict = {"cv_mode": "full_build"},
) -> Tuple[Union[BaseEstimator, None], dict]:
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are
        to be used in its initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as model_config.
    metadata: dict
        Mapping of arbitrary metadata.
    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, represented as a key value
                in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value
                Example::

                    {"cv_mode": "cross_val_only"}

    Returns
    -------
    Tuple[Optional[sklearn.base.BaseEstimator], dict]
    """
    # Get the dataset from config
    logger.debug(f"Initializing Dataset with config {data_config}")

    dataset = (
        data_config
        if isinstance(data_config, GordoBaseDataset)
        else _get_dataset(data_config)
    )

    logger.debug("Fetching training data")
    start = time.time()
    X, y = dataset.get_data()
    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    cv_duration_sec = None

    if evaluation_config["cv_mode"].lower() in ("cross_val_only", "full_build"):
        metrics_list = [
            explained_variance_score,
            r2_score,
            mean_squared_error,
            mean_absolute_error,
        ]

        # Cross validate
        logger.debug("Starting cross validation")
        start = time.time()
        scores: Dict[str, Any] = dict()
        if hasattr(model, "predict"):
            metrics_dict = get_metrics_dict(metrics_list, y)

            cv = cross_validate(
                model,
                X,
                y,
                scoring=metrics_dict,
                return_estimator=True,
                cv=TimeSeriesSplit(n_splits=3),
            )
            for metric, test_metric in map(lambda k: (k, f"test_{k}"), metrics_dict):
                val = {
                    "fold-mean": cv[test_metric].mean(),
                    "fold-std": cv[test_metric].std(),
                    "fold-max": cv[test_metric].max(),
                    "fold-min": cv[test_metric].min(),
                }
                val.update(
                    {
                        f"fold-{i + 1}": raw_value
                        for i, raw_value in enumerate(cv[test_metric].tolist())
                    }
                )
                scores.update({metric: val})
        else:
            logger.debug("Unable to score model, has no attribute 'predict'.")
            scores = dict()

        cv_duration_sec = time.time() - start

        # If cross_val_only, return the cv_scores and empty model.
        if evaluation_config["cv_mode"] == "cross_val_only":
            metadata["model"] = {
                "cross-validation": {
                    "cv-duration-sec": cv_duration_sec,
                    "scores": scores,
                }
            }
            return None, metadata
    else:
        # Setting cv scores to an empty dict when not used.
        scores = dict()

    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-offset": _determine_offset(model, X),
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {"cv-duration-sec": cv_duration_sec, "scores": scores},
    }
    metadata["model"].update(_get_metadata(model))
    return model, metadata
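# The three cv_mode values accepted by build_model above, per its docstring;
# pass one of these dicts as evaluation_config when calling it.
evaluation_config = {"cv_mode": "cross_val_only"}  # only cross validate; returns (None, metadata)
evaluation_config = {"cv_mode": "build_only"}      # skip cross validation; train and return the model
evaluation_config = {"cv_mode": "full_build"}      # default: cross validation plus a full training run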
def build(
    name,
    output_dir,
    model_config,
    data_config,
    data_provider,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    evaluation_config,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be used
        in initializing the model. Should also contain key 'type' which references
        the model to use. ie. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also contain kwarg
        'type' which references the dataset to use. ie. InfluxBackedDataset
    data_provider: str
        A quoted data provider configuration in JSON/YAML format.
        Should also contain key 'type' which references the data provider to use.

        Example::

          '{"type": "DataLakeProvider", "storename" : "example_store"}'

    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations,
        used for re-using old models instead of rebuilding them. If omitted then
        always rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the model
        config wherever there is a jinja variable with the key.
    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, represented as a key value
                in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value

                Example::

                    {"cv_mode": "cross_val_only"}
    """
    data_config["tag_list"] = data_config.pop("tags")

    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = data_provider

    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)
    data_config["tag_list"] = tag_list

    # Normalize target tag list if present
    if "target_tag_list" in data_config:
        target_tag_list = normalize_sensor_tags(data_config["target_tag_list"], asset)
        data_config["target_tag_list"] = target_tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
logger.debug(f"Ensuring the passed model config is fully expanded.") model_config = pipeline_into_definition( pipeline_from_definition(model_config)) logger.debug(f"Fully expanded model config: {model_config}") if evaluation_config["cv_mode"] == "cross_val_only": cache_model_location = None if model_register_dir is not None: cache_key = calculate_model_key(name, model_config, data_config, evaluation_config, metadata=metadata) cache_model_location = check_cache(model_register_dir, cache_key) if cache_model_location: metadata = load_metadata(cache_model_location) else: _, metadata = build_model(name, model_config, data_config, metadata, evaluation_config) else: model_location = provide_saved_model( name, model_config, data_config, metadata, output_dir, model_register_dir, evaluation_config=evaluation_config, ) metadata = load_metadata(model_location) # If the model is cached but without CV scores then we force a rebuild. We do this # by deleting the entry in the cache and then rerun `provide_saved_model` # (leaving the old model laying around) if print_cv_scores: retrieved_metadata = metadata all_scores = get_all_score_strings(retrieved_metadata) if not all_scores: logger.warning( "Found that loaded model does not have cross validation values " "even though we were asked to print them, clearing cache and " "rebuilding model") model_location = provide_saved_model( name, model_config, data_config, metadata, output_dir, model_register_dir, replace_cache=True, evaluation_config=evaluation_config, ) saved_metadata = load_metadata(model_location) all_scores = get_all_score_strings(saved_metadata) for score in all_scores: print(score) return 0