def test_imputer_from_definition(config_str: str):
    """
    Ensure it plays well with the gordo serializer
    """
    config = yaml.safe_load(config_str)
    model = serializer.pipeline_from_definition(config)

    if isinstance(model, Pipeline):
        assert isinstance(model.steps[-1][1], InfImputer)
    else:
        assert isinstance(model, InfImputer)

    serializer.pipeline_from_definition(serializer.pipeline_into_definition(model))
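For reference, configs exercising both branches of the test above might look like the following (a hedged sketch; the exact InfImputer import path is an assumption and may differ between gordo versions):

# Hypothetical parametrizations for test_imputer_from_definition; the
# InfImputer path below is assumed, not taken from the snippet itself.
IMPUTER_CONFIGS = [
    # Bare transformer: `model` will be an InfImputer instance
    "gordo_components.model.transformers.imputer.InfImputer",
    # Wrapped in a Pipeline: the imputer must be the last step
    """
    sklearn.pipeline.Pipeline:
        steps:
            - sklearn.preprocessing.data.MinMaxScaler
            - gordo_components.model.transformers.imputer.InfImputer
    """,
]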
    def test_into_from(self):
        """
        Pass Pipeline into definition, and then from that definition
        """
        from gordo_components.model.transformer_funcs.general import multiply_by

        self.factories = register_model_builder.factories
        for model in self.factories.keys():

            for model_kind in self.factories[model].keys():
                pipe = Pipeline(
                    [
                        ("step_0", PCA(n_components=2)),
                        (
                            "step_1",
                            FeatureUnion(
                                [
                                    ("step_0", PCA(n_components=3)),
                                    (
                                        "step_1",
                                        Pipeline(
                                            steps=[
                                                ("step_0", MinMaxScaler((0, 1))),
                                                (
                                                    "step_1",
                                                    TruncatedSVD(n_components=2),
                                                ),
                                            ]
                                        ),
                                    ),
                                ]
                            ),
                        ),
                        (
                            "step_2",
                            FunctionTransformer(
                                func=multiply_by, kw_args={"factor": 1}
                            ),
                        ),
                        (
                            "step_3",
                            pydoc.locate(f"gordo_components.model.models.{model}")(
                                kind=model_kind
                            ),
                        ),
                    ]
                )

                pipeline_from_definition(pipeline_into_definition(pipe))
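The same round trip works for a plain sklearn pipeline without any gordo model factories; a minimal sketch, using only the sklearn classes already imported for the test above:

# Minimal round-trip sketch: pipeline -> definition -> pipeline.
pipe = Pipeline([("step_0", MinMaxScaler((0, 1))), ("step_1", PCA(n_components=2))])
definition = pipeline_into_definition(pipe)
rebuilt = pipeline_from_definition(definition)
assert isinstance(rebuilt, Pipeline)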
Example #3
def trained_model_directory(sensors: List[SensorTag]):
    """
    Fixture: Train a basic AutoEncoder and save it to a given directory
    will also save some metadata with the model
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        definition = ruamel.yaml.load(
            """
            sklearn.pipeline.Pipeline:
                steps:
                    - sklearn.preprocessing.data.MinMaxScaler
                    - gordo_components.model.models.KerasAutoEncoder:
                        kind: feedforward_hourglass
                memory:
            """,
            Loader=ruamel.yaml.Loader,
        )
        model = serializer.pipeline_from_definition(definition)
        X = np.random.random(size=len(sensors) * 10).reshape(10, len(sensors))
        model.fit(X, X)
        serializer.dump(
            model,
            tmp_dir,
            metadata={
                "dataset": {
                    "tag_list": sensors,
                    "resolution": "10T"
                },
                "user-defined": {
                    "model-name": "test-model",
                    "machine-name": "machine-1",
                },
            },
        )
        yield tmp_dir
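The dumped directory can later be reloaded by the serializer; a sketch of a test consuming this fixture, assuming the serializer.load / serializer.load_metadata API names of this gordo version:

# Sketch: reading back what the fixture dumped (API names assumed).
def test_can_reload(trained_model_directory):
    model = serializer.load(trained_model_directory)
    metadata = serializer.load_metadata(trained_model_directory)
    assert hasattr(model, "predict")
    assert metadata["dataset"]["resolution"] == "10T"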
Example #4
def test_raw_keras_part_of_pipeline():
    """
    It should play well when tucked into a sklearn.pipeline.Pipeline
    """
    X, y = np.random.random((100, 4)), np.random.random((100, 1))

    config_str = """
    sklearn.pipeline.Pipeline:
        steps:
            - sklearn.decomposition.pca.PCA:
                n_components: 4
            - gordo_components.model.models.KerasRawModelRegressor:
                kind:
                    compile:
                        loss: mse
                        optimizer: adam
                    spec:
                        tensorflow.keras.models.Sequential:
                            layers:
                                - tensorflow.keras.layers.Dense:
                                    units: 4
                                - tensorflow.keras.layers.Dense:
                                    units: 1
    """
    config = yaml.safe_load(config_str)
    pipe = serializer.pipeline_from_definition(config)
    assert isinstance(pipe, Pipeline)

    pipe.fit(X, y)
    out = pipe.predict(X)
    assert len(out) == len(y)
Example #5
def build_model(resampled_dataframe, epochs=5, batch_size=10):
    config = yaml.safe_load(
        f"""
        sklearn.pipeline.Pipeline:
            steps:
              - sklearn.preprocessing.data.MinMaxScaler
              - gordo_components.model.models.KerasAutoEncoder:
                  kind: feedforward_hourglass
                  epochs: {epochs}
                  batch_size: {batch_size}


        """
    )
    pipe = serializer.pipeline_from_definition(config)

    print("Fit model to first part of data")
    train_until = int(len(resampled_dataframe) / 2)
    model = pipe.fit(resampled_dataframe.iloc[:train_until])

    print("Run data through model for prediction")
    predicted_data = model.predict(resampled_dataframe)
    # Inverse transform the model pipeline, since the autoencoders are a bit
    # weird with regard to their output (currently)
    predicted_data = model_io.get_inverse_transformed_input(model, predicted_data)
    anomalies = make_anomalies(resampled_dataframe, predicted_data)
    anomalies = pd.DataFrame(anomalies, index=resampled_dataframe.index)
    anomalies_mean_training = anomalies.iloc[:train_until].mean()[0]

    return (anomalies, anomalies_mean_training, predicted_data, train_until)
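make_anomalies is defined elsewhere in this example; a minimal sketch of such a helper, assuming the anomaly score is the per-row distance between input and reconstruction:

import numpy as np

# Hypothetical stand-in for the make_anomalies call above: per-row
# Euclidean reconstruction error between original and predicted data.
def make_anomalies(original_df, predicted):
    return np.linalg.norm(original_df.values - predicted, axis=1)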
 def test_from_into(self):
     """
     Create pipeline from definition, and create from that definition
     """
     self.factories = register_model_builder.factories
     for model in self.factories.keys():
         for model_kind in self.factories[model].keys():
             definition = f"""
                 sklearn.pipeline.Pipeline:
                     steps:
                         - sklearn.decomposition.pca.PCA:
                             n_components: 2
                             copy: true
                             whiten: false
                             svd_solver: auto
                             tol: 0.0
                             iterated_power: auto
                             random_state:
                         - sklearn.preprocessing._function_transformer.FunctionTransformer:
                             func: gordo_components.model.transformer_funcs.general.multiply_by
                             kw_args:
                                 factor: 1
                             inverse_func: gordo_components.model.transformer_funcs.general.multiply_by
                             inv_kw_args:
                                 factor: 1
                         - sklearn.pipeline.FeatureUnion:
                             transformer_list:
                             - sklearn.decomposition.pca.PCA:
                                 n_components: 3
                                 copy: true
                                 whiten: false
                                 svd_solver: auto
                                 tol: 0.0
                                 iterated_power: auto
                                 random_state:
                             - sklearn.pipeline.Pipeline:
                                 steps:
                                 - sklearn.preprocessing.data.MinMaxScaler:
                                     feature_range:
                                     - 0
                                     - 1
                                     copy: true
                                 - sklearn.decomposition.truncated_svd.TruncatedSVD:
                                     n_components: 2
                                     algorithm: randomized
                                     n_iter: 5
                                     random_state:
                                     tol: 0.0
                                 memory:
                             n_jobs: 1
                             transformer_weights:
                         - gordo_components.model.models.{model}:
                             kind: {model_kind}
                     memory:
                 """
             definition = ruamel.yaml.load(definition, Loader=ruamel.yaml.Loader)
             pipe = pipeline_from_definition(definition)
             pipeline_into_definition(pipe)
def test_load_from_definition(definition):
    """
    Ensure serializer can load models which take other models as parameters.
    """
    X, y = np.random.random((10, 10)), np.random.random((10, 2))
    definition = yaml.load(definition, Loader=yaml.SafeLoader)
    model = serializer.pipeline_from_definition(definition)
    assert isinstance(model, MultiOutputRegressor)
    model.fit(X, y)
    model.predict(X)
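A definition satisfying this test nests one estimator's config inside another's parameters; a hedged example, using the older sklearn import paths seen elsewhere in these snippets:

# Hypothetical `definition` value: MultiOutputRegressor takes another
# serializer-built model via its `estimator` parameter.
definition = """
sklearn.multioutput.MultiOutputRegressor:
    estimator:
        sklearn.ensemble.forest.RandomForestRegressor:
            n_estimators: 10
"""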
Example #8
def test_diff_detector_serializability(config):
    """
    Should play well with the gordo serializer
    """
    config = yaml.safe_load(config)

    model = serializer.pipeline_from_definition(config)
    serializer.pipeline_into_definition(model)
    serialized_bytes = serializer.dumps(model)
    serializer.loads(serialized_bytes)
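The `config` parametrizing this test presumably resembles the DiffBasedAnomalyDetector definition used in the fixture further below; for reference:

# Example `config` value, mirroring the anomaly-detector definition
# shown later in these snippets.
config = """
gordo_components.model.anomaly.diff.DiffBasedAnomalyDetector:
    base_estimator:
        sklearn.pipeline.Pipeline:
            steps:
                - sklearn.preprocessing.data.MinMaxScaler
                - gordo_components.model.models.KerasAutoEncoder:
                    kind: feedforward_hourglass
"""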
    def keras_from_spec(spec: dict):
        _expected_keys = ("spec", "compile")
        if not all(k in spec for k in _expected_keys):
            raise ValueError(
                f"Expected spec to have keys: {_expected_keys}, but found {spec.keys()}"
            )
        logger.debug(f"Building model from spec: {spec}")

        model = serializer.pipeline_from_definition(spec["spec"])
        model.compile(**spec["compile"])
        return model
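A spec that passes the key check above can be read straight off the KerasRawModelRegressor config from the earlier example:

# Sketch: a spec dict with the two required keys ('spec' and 'compile'),
# mirroring the `kind` mapping in the KerasRawModelRegressor example above.
spec = {
    "compile": {"loss": "mse", "optimizer": "adam"},
    "spec": {
        "tensorflow.keras.models.Sequential": {
            "layers": [
                {"tensorflow.keras.layers.Dense": {"units": 4}},
                {"tensorflow.keras.layers.Dense": {"units": 1}},
            ]
        }
    },
}
model = keras_from_spec(spec)  # a compiled tf.keras model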
def trained_model_directory(gordo_project: str, gordo_name: str,
                            sensors: List[SensorTag]):
    """
    Fixture: Train a basic AutoEncoder and save it to a given directory
    will also save some metadata with the model
    """
    with tempfile.TemporaryDirectory() as model_dir:

        # This is a model collection directory
        collection_dir = os.path.join(model_dir, gordo_project)

        # Model specific to the model being trained here
        model_dir = os.path.join(collection_dir, gordo_name)
        os.makedirs(model_dir, exist_ok=True)

        definition = ruamel.yaml.load(
            """
            gordo_components.model.anomaly.diff.DiffBasedAnomalyDetector:
                base_estimator:
                    sklearn.pipeline.Pipeline:
                        steps:
                            - sklearn.preprocessing.data.MinMaxScaler
                            - gordo_components.model.models.KerasAutoEncoder:
                                kind: feedforward_hourglass
                        memory:
            """,
            Loader=ruamel.yaml.Loader,
        )
        model = serializer.pipeline_from_definition(definition)
        X = np.random.random(size=len(sensors) * 10).reshape(10, len(sensors))
        model.fit(X, X)
        serializer.dump(
            model,
            model_dir,
            metadata={
                "dataset": {
                    "tag_list": sensors,
                    "resolution": "10T",
                    "target_tag_list": sensors,
                },
                "name": "machine-1",
                "model": {
                    "model-offset": 0
                },
                "user-defined": {
                    "model-name": "test-model"
                },
            },
        )
        yield collection_dir
    def test_pipeline_from_definition(self):

        for raw_yaml, model, model_kind in self.setup_gen():
            self.assertTrue(model)
            logger.info(raw_yaml)
            config = yaml.safe_load(raw_yaml)
            logger.debug("{}".format(config))

            config_clone = copy.deepcopy(config)  # To ensure no mutation occurs
            pipe = pipeline_from_definition(config)

            # Test that the original config matches the one passed; no mutation
            self.assertEqual(config, config_clone)

            # Special test: a non-default argument defined in the config holds,
            # and an empty 'key:' is evaluated to 'key=None'
            if "memory: /tmp" in raw_yaml:
                self.assertEqual(pipe.steps[2][1].transformer_list[1][1].memory, "/tmp")
            self._verify_pipe(pipe, model, model_kind)
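The 'key:' to 'key=None' behaviour asserted above is plain YAML semantics, independent of gordo:

# An empty YAML value parses to None, so 'memory:' becomes memory=None,
# while 'memory: /tmp' becomes memory='/tmp'.
assert yaml.safe_load("memory:") == {"memory": None}
assert yaml.safe_load("memory: /tmp") == {"memory": "/tmp"}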
Example #12
def build(
    name,
    output_dir,
    model_config,
    data_config,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    model_location_file,
    data_provider_threads,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be
        used in initializing the model. Should also contain the key 'type'
        which references the model to use, e.g. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also contain
        the kwarg 'type' which references the dataset to use, e.g. InfluxBackedDataset
    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the
        model config wherever there is a jinja variable with the key.
    model_location_file: str/path
        Path to a file to open and write the location of the serialized model to.
    data_provider_threads: int
        Number of threads to use for the data provider when fetching data.
    """

    # TODO: Move all data related input from environment variable to data_config,
    # TODO: thereby removing all these data_config['variable'] lines

    data_config["tag_list"] = data_config.pop("tags")

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date")
    )

    # TODO: Move parsing from here, into the InfluxDataSet class
    data_config["to_ts"] = dateutil.parser.isoparse(data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = DataLakeProvider(threads=data_provider_threads)
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)

    data_config["tag_list"] = tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(pipeline_from_definition(model_config))

    model_location = provide_saved_model(
        name, model_config, data_config, metadata, output_dir, model_register_dir
    )
    # If the model is cached but without CV scores then we force a rebuild. We do
    # this by deleting the entry in the cache and rerunning `provide_saved_model`
    # (leaving the old model lying around)
    if print_cv_scores:
        saved_metadata = load_metadata(model_location)
        all_scores = get_all_score_strings(saved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model"
            )

            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)

        for score in all_scores:
            print(score)

    # Write out the model location to this file.
    model_location_file.write(model_location)
    return 0
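expand_model is what injects the --model-parameter values into jinja variables in the raw config; a hedged illustration of the intent (the exact expand_model signature is assumed):

# Sketch of the parameter-injection step used above (signature assumed):
# each jinja variable in the raw YAML is replaced by its model_parameter value.
raw = "gordo_components.model.models.KerasAutoEncoder:\n    kind: {{ kind }}"
expanded = expand_model(raw, {"kind": "feedforward_hourglass"})
# expanded now contains: "kind: feedforward_hourglass"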
Example #13
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
):
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are to be used in its initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as model_config.
    metadata: dict
        Mapping of arbitrary metadata.

    Returns
    -------
        Tuple[sklearn.base.BaseEstimator, dict]
    """
    # Get the dataset from config
    logger.debug(f"Initializing Dataset with config {data_config}")

    dataset = (data_config if isinstance(data_config, GordoBaseDataset) else
               _get_dataset(data_config))

    logger.debug("Fetching training data")
    start = time.time()

    X, y = dataset.get_data()

    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    # Cross validate
    logger.debug(f"Starting to do cross validation")
    start = time.time()

    scores: Dict[str, Any]
    if hasattr(model, "score"):
        cv_scores = cross_val_score(model,
                                    X,
                                    y,
                                    cv=TimeSeriesSplit(n_splits=3))
        scores = {
            "explained-variance": {
                "mean": cv_scores.mean(),
                "std": cv_scores.std(),
                "max": cv_scores.max(),
                "min": cv_scores.min(),
                "raw-scores": cv_scores.tolist(),
            }
        }
    else:
        logger.debug("Unable to score model, has no attribute 'score'.")
        scores = dict()

    cv_duration_sec = time.time() - start

    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {
            "cv-duration-sec": cv_duration_sec,
            "scores": scores
        },
    }

    gordobase_final_step = _get_final_gordo_base_step(model)
    if gordobase_final_step:
        metadata["model"].update(gordobase_final_step.get_metadata())

    return model, metadata
def build_model(
    name: str,
    model_config: dict,
    data_config: Union[GordoBaseDataset, dict],
    metadata: dict,
    evaluation_config: dict = {"cv_mode": "full_build"},
) -> Tuple[Union[BaseEstimator, None], dict]:
    """
    Build a model and serialize to a directory for later serving.

    Parameters
    ----------
    name: str
        Name of model to be built
    model_config: dict
        Mapping of Model to initialize and any additional kwargs which are to be used in its initialization.
        Example::

          {'type': 'KerasAutoEncoder',
           'kind': 'feedforward_hourglass'}

    data_config: dict
        Mapping of the Dataset to initialize, following the same logic as model_config.
    metadata: dict
        Mapping of arbitrary metadata.
    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, passed as a key-value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value
                Example::

                    {"cv_mode": "cross_val_only"}


    Returns
    -------
        Tuple[Optional[sklearn.base.BaseEstimator], dict]
    """
    # Get the dataset from config
    logger.debug(f"Initializing Dataset with config {data_config}")

    dataset = (data_config if isinstance(data_config, GordoBaseDataset) else
               _get_dataset(data_config))

    logger.debug("Fetching training data")
    start = time.time()

    X, y = dataset.get_data()

    time_elapsed_data = time.time() - start

    # Get the model and dataset
    logger.debug(f"Initializing Model with config: {model_config}")
    model = serializer.pipeline_from_definition(model_config)

    cv_duration_sec = None

    if evaluation_config["cv_mode"].lower() in ("cross_val_only",
                                                "full_build"):
        metrics_list = [
            explained_variance_score,
            r2_score,
            mean_squared_error,
            mean_absolute_error,
        ]
        # Cross validate
        logger.debug("Starting cross validation")
        start = time.time()
        scores: Dict[str, Any] = dict()
        if hasattr(model, "predict"):

            metrics_dict = get_metrics_dict(metrics_list, y)

            cv = cross_validate(
                model,
                X,
                y,
                scoring=metrics_dict,
                return_estimator=True,
                cv=TimeSeriesSplit(n_splits=3),
            )
            for metric, test_metric in map(lambda k: (k, f"test_{k}"),
                                           metrics_dict):
                val = {
                    "fold-mean": cv[test_metric].mean(),
                    "fold-std": cv[test_metric].std(),
                    "fold-max": cv[test_metric].max(),
                    "fold-min": cv[test_metric].min(),
                }
                val.update({
                    f"fold-{i + 1}": raw_value
                    for i, raw_value in enumerate(cv[test_metric].tolist())
                })
                scores.update({metric: val})

        else:
            logger.debug("Unable to score model, has no attribute 'predict'.")
            scores = dict()

        cv_duration_sec = time.time() - start

        # If cross_val_only, return the cv scores and no model.
        if evaluation_config["cv_mode"] == "cross_val_only":
            metadata["model"] = {
                "cross-validation": {
                    "cv-duration-sec": cv_duration_sec,
                    "scores": scores,
                }
            }
            return None, metadata
    else:
        # Set cv scores to an empty dict when cross validation is not used.
        scores = dict()
    # Train
    logger.debug("Starting to train model.")
    start = time.time()
    model.fit(X, y)
    time_elapsed_model = time.time() - start

    metadata = {"user-defined": metadata}
    metadata["name"] = name
    metadata["dataset"] = dataset.get_metadata()
    utc_dt = datetime.datetime.now(datetime.timezone.utc)
    metadata["model"] = {
        "model-offset": _determine_offset(model, X),
        "model-creation-date": str(utc_dt.astimezone()),
        "model-builder-version": __version__,
        "model-config": model_config,
        "data-query-duration-sec": time_elapsed_data,
        "model-training-duration-sec": time_elapsed_model,
        "cross-validation": {
            "cv-duration-sec": cv_duration_sec,
            "scores": scores
        },
    }

    metadata["model"].update(_get_metadata(model))
    return model, metadata
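get_metrics_dict is defined elsewhere; a minimal sketch of such a helper under the assumption that it simply wraps each metric function as a named sklearn scorer (the real helper may also use `y`, e.g. for multi-output scaling):

from sklearn.metrics import make_scorer

# Hypothetical sketch of get_metrics_dict: map each metric function's name
# to a scorer usable with cross_validate(scoring=...).
def get_metrics_dict(metrics_list, y):
    return {metric.__name__: make_scorer(metric) for metric in metrics_list}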
Example #15
def build(
    name,
    output_dir,
    model_config,
    data_config,
    data_provider,
    metadata,
    model_register_dir,
    print_cv_scores,
    model_parameter,
    evaluation_config,
):
    """
    Build a model and deposit it into 'output_dir' given the appropriate config
    settings.

    \b
    Parameters
    ----------
    name: str
        Name given to the model to build
    output_dir: str
        Directory to save model & metadata to.
    model_config: str
        String containing a yaml which will be parsed to a dict which will be
        used in initializing the model. Should also contain the key 'type'
        which references the model to use, e.g. KerasAutoEncoder
    data_config: dict
        kwargs to be used in initializing the dataset. Should also contain
        the kwarg 'type' which references the dataset to use, e.g. InfluxBackedDataset
    data_provider: str
        A quoted data provider configuration in JSON/YAML format.
        Should also contain key 'type' which references the data provider to use.

        Example::

          '{"type": "DataLakeProvider", "storename" : "example_store"}'

    metadata: dict
        Any additional metadata to save under the key 'user-defined'
    model_register_dir: path
        Path to a directory which will index existing models and their locations, used
        for re-using old models instead of rebuilding them. If omitted then always
        rebuild
    print_cv_scores: bool
        Print cross validation scores to stdout
    model_parameter: List[Tuple]
        List of model key-values, where the values will be injected into the
        model config wherever there is a jinja variable with the key.

    evaluation_config: dict
        Dict of parameters which are exposed to build_model.
            - cv_mode: str
                String which enables three different modes, passed as a key-value in evaluation_config:
                * cross_val_only: Only perform cross validation
                * build_only: Skip cross validation and only build the model
                * full_build: Cross validation and full build of the model, default value
                Example::

                    {"cv_mode": "cross_val_only"}
    """

    data_config["tag_list"] = data_config.pop("tags")

    data_config["from_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_start_date"))

    data_config["to_ts"] = dateutil.parser.isoparse(
        data_config.pop("train_end_date"))

    # Set default data provider for data config
    data_config["data_provider"] = data_provider
    asset = data_config.get("asset", None)
    tag_list = normalize_sensor_tags(data_config["tag_list"], asset)

    data_config["tag_list"] = tag_list

    # Normalize target tag list if present
    if "target_tag_list" in data_config:
        target_tag_list = normalize_sensor_tags(data_config["target_tag_list"],
                                                asset)
        data_config["target_tag_list"] = target_tag_list

    logger.info(f"Building, output will be at: {output_dir}")
    logger.info(f"Raw model config: {model_config}")
    logger.info(f"Data config: {data_config}")
    logger.info(f"Register dir: {model_register_dir}")

    model_parameter = dict(model_parameter)
    model_config = expand_model(model_config, model_parameter)
    model_config = yaml.full_load(model_config)

    # Convert the config into a pipeline, and back into definition to ensure
    # all default parameters are part of the config.
    logger.debug(f"Ensuring the passed model config is fully expanded.")
    model_config = pipeline_into_definition(
        pipeline_from_definition(model_config))
    logger.debug(f"Fully expanded model config: {model_config}")

    if evaluation_config["cv_mode"] == "cross_val_only":

        cache_model_location = None
        if model_register_dir is not None:
            cache_key = calculate_model_key(name,
                                            model_config,
                                            data_config,
                                            evaluation_config,
                                            metadata=metadata)
            cache_model_location = check_cache(model_register_dir, cache_key)

        if cache_model_location:
            metadata = load_metadata(cache_model_location)
        else:
            _, metadata = build_model(name, model_config, data_config,
                                      metadata, evaluation_config)

    else:
        model_location = provide_saved_model(
            name,
            model_config,
            data_config,
            metadata,
            output_dir,
            model_register_dir,
            evaluation_config=evaluation_config,
        )
        metadata = load_metadata(model_location)

    # If the model is cached but without CV scores then we force a rebuild. We do
    # this by deleting the entry in the cache and rerunning `provide_saved_model`
    # (leaving the old model lying around)
    if print_cv_scores:
        retrieved_metadata = metadata
        all_scores = get_all_score_strings(retrieved_metadata)
        if not all_scores:
            logger.warning(
                "Found that loaded model does not have cross validation values "
                "even though we were asked to print them, clearing cache and "
                "rebuilding model")

            model_location = provide_saved_model(
                name,
                model_config,
                data_config,
                metadata,
                output_dir,
                model_register_dir,
                replace_cache=True,
                evaluation_config=evaluation_config,
            )
            saved_metadata = load_metadata(model_location)
            all_scores = get_all_score_strings(saved_metadata)

        for score in all_scores:
            print(score)

    return 0