Beispiel #1
0
    def __init__(
        self,
        base_estimator: BaseEstimator = KerasAutoEncoder(kind="feedforward_hourglass"),
        scaler: TransformerMixin = RobustScaler(),
    ):
        """
        Anomaly detector wrapping a ``base_estimator`` and using the
        difference (error) between its output and the expected target.

        The ``scaler`` is fitted to the target **after** training and is used
        purely for the error calculations. The wrapped ``base_estimator``
        itself is trained with the original, unscaled, ``y``.

        Parameters
        ----------
        base_estimator: sklearn.base.BaseEstimator
            The model whose regular ``.fit`` and ``.predict`` methods are used.
            Defaults to
            :py:class:`gordo_components.model.models.KerasAutoEncoder` with
            ``kind='feedforward_hourglass'``.
        scaler: sklearn.base.TransformerMixin
            Defaults to ``sklearn.preprocessing.RobustScaler``.
            Transforms both the model output and the original ``y`` before the
            difference/error between them is calculated.

        Notes
        -----
        Both default objects are created once, when this method is defined,
        so every instance constructed without explicit arguments receives the
        very same default ``base_estimator`` and ``scaler`` objects.
        """
        # Parameters are stored untouched under their own names.
        self.scaler = scaler
        self.base_estimator = base_estimator
Beispiel #2
0
def test_captures_kwarg_to_init():
    """
    Extra keyword arguments must survive the conversion into a definition.

    Our models accept kwargs which are passed on to the underlying keras model
    or used to construct it. ``pipeline_into_definition`` has to capture such
    kwargs even though they are model parameters that do not appear in the
    ``__init__`` signature.
    """
    autoencoder = KerasAutoEncoder(
        kind="feedforward_hourglass", some_fancy_param="Howdy!"
    )

    # The definition is keyed by the fully qualified class path of the model.
    class_path = f"{KerasAutoEncoder.__module__}.{KerasAutoEncoder.__name__}"
    captured_params = pipeline_into_definition(autoencoder)[class_path]

    assert "some_fancy_param" in captured_params
    assert captured_params["some_fancy_param"] == "Howdy!"

    # The captured parameters must be enough to construct the model again.
    KerasAutoEncoder(**captured_params)
    def test_dump_load_keras_directly(self):
        """
        Dump a fitted ``KerasAutoEncoder`` to a temporary directory, load it
        back and check that both models predict the same values.
        """
        autoencoder = KerasAutoEncoder(kind="feedforward_hourglass")

        # 10x10 random matrix used as both input and target.
        data = np.random.random(size=100).reshape(10, 10)
        autoencoder.fit(data.copy(), data.copy())

        with TemporaryDirectory() as dump_dir:
            serializer.dump(autoencoder, dump_dir)
            restored = serializer.load(dump_dir)

            original_output = autoencoder.predict(data.copy()).flatten()
            restored_output = restored.predict(data.copy()).flatten()
            outputs_match = np.allclose(original_output, restored_output)
            self.assertTrue(outputs_match)
 def setUp(self):
     """
     Store four variations of the same pipeline in
     ``self.variations_of_same_pipeline``.

     Every variation has the same outer layout (``pca1`` -> feature union of
     ``pca2`` and a nested ``pipe`` -> ``ae``); they differ only in how the
     nested pipeline and its ``MinMaxScaler`` are constructed.
     """

     def _nested_steps(minmax):
         # Steps of the nested "pipe" pipeline, around the given scaler.
         return [
             ("minmax", minmax),
             ("truncsvd", TruncatedSVD(n_components=2)),
         ]

     def _full_pipeline(nested):
         # Outer layout shared by all variations; only ``nested`` varies.
         union = FeatureUnion(
             [
                 ("pca2", PCA(n_components=3)),
                 ("pipe", nested),
             ]
         )
         return Pipeline(
             [
                 ("pca1", PCA(n_components=2)),
                 ("fu", union),
                 ("ae", KerasAutoEncoder(kind="feedforward_hourglass")),
             ]
         )

     self.variations_of_same_pipeline = [
         # Normal
         _full_pipeline(Pipeline(_nested_steps(MinMaxScaler()))),
         # MinMax initialized (wrongly) with a list
         _full_pipeline(Pipeline(_nested_steps(MinMaxScaler([0, 1])))),
         # MinMax initialized with tuple
         _full_pipeline(Pipeline(_nested_steps(MinMaxScaler((0, 1))))),
         # Outer pipeline without explicit steps param, nested one with.
         _full_pipeline(Pipeline(steps=_nested_steps(MinMaxScaler((0, 1))))),
     ]
    def test_pipeline_serialization(self):
        """
        Fit a nested pipeline, serialize it and check it can be restored.

        Covers both the directory based ``serializer.dump`` / ``serializer.load``
        (including metadata and the on-disk layout) and the in-memory
        ``serializer.dumps`` / ``serializer.loads`` round trip. In both cases
        the restored pipeline must give the same predictions as the original.
        """

        pipe = Pipeline([
            ("pca1", PCA(n_components=10)),
            (
                "fu",
                FeatureUnion([
                    ("pca2", PCA(n_components=3)),
                    (
                        "pipe",
                        Pipeline([
                            ("minmax", MinMaxScaler()),
                            ("truncsvd", TruncatedSVD(n_components=7)),
                        ]),
                    ),
                ]),
            ),
            ("ae", KerasAutoEncoder(kind="feedforward_hourglass")),
        ])

        # 10x10 random matrix used as both input and target.
        X = np.random.random(size=100).reshape(10, 10)
        pipe.fit(X.copy(), X.copy())

        with TemporaryDirectory() as tmp:

            # Test dump
            metadata = {"key": "value"}
            serializer.dump(pipe, tmp, metadata=metadata)

            # Assert that dirs are created for each step in Pipeline.
            #
            # NOTE(review): an OrderedDict keeps a single entry per key, so
            # wherever a key is repeated below the later value replaces the
            # earlier one ("metadata.json", the FeatureUnion "params.json" and
            # "model.h5" are overwritten). If ``_structure_verifier`` is meant
            # to check those files as well, this needs a different
            # representation -- confirm against ``_structure_verifier``.
            expected_structure = OrderedDict([
                ("n_step=000-class=sklearn.pipeline.Pipeline",
                 "metadata.json"),
                (
                    "n_step=000-class=sklearn.pipeline.Pipeline",
                    OrderedDict([
                        (
                            "n_step=000-class=sklearn.decomposition.pca.PCA",
                            "pca1.pkl.gz",
                        ),
                        (
                            "n_step=001-class=sklearn.pipeline.FeatureUnion",
                            "params.json",
                        ),
                        (
                            "n_step=001-class=sklearn.pipeline.FeatureUnion",
                            OrderedDict([
                                (
                                    "n_step=000-class=sklearn.decomposition.pca.PCA",
                                    "pca2.pkl.gz",
                                ),
                                (
                                    "n_step=001-class=sklearn.pipeline.Pipeline",
                                    OrderedDict([
                                        (
                                            "n_step=000-class=sklearn.preprocessing.data.MinMaxScaler",
                                            "minmax.pkl.gz",
                                        ),
                                        (
                                            "n_step=001-class=sklearn.decomposition.truncated_svd.TruncatedSVD",
                                            "truncsvd.pkl.gz",
                                        ),
                                    ]),
                                ),
                            ]),
                        ),
                        (
                            "n_step=002-class=gordo_components.model.models.KerasAutoEncoder",
                            "model.h5",
                        ),
                        (
                            "n_step=002-class=gordo_components.model.models.KerasAutoEncoder",
                            "params.json",
                        ),
                    ]),
                ),
            ])

            self._structure_verifier(prefix_dir=tmp,
                                     structure=expected_structure)

            # Test load from the serialized pipeline above
            pipe_clone = serializer.load(tmp)
            metadata_clone = serializer.load_metadata(tmp)

            # Ensure the metadata was saved and loaded back
            self.assertEqual(metadata, metadata_clone)

            # Verify same state for both pipelines
            y_hat_pipe1 = pipe.predict(X.copy()).flatten()
            y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
            self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))

            # Now use dumps/loads (in-memory round trip, performed once)
            serialized = serializer.dumps(pipe)
            pipe_clone = serializer.loads(serialized)

            # Verify same state for both pipelines
            y_hat_pipe1 = pipe.predict(X.copy()).flatten()
            y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
            self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))


@pytest.mark.parametrize(
    "model",
    [
        KerasAutoEncoder(kind="feedforward_hourglass"),
        DiffBasedAnomalyDetector(base_estimator=TransformedTargetRegressor(
            regressor=KerasAutoEncoder(kind="feedforward_symmetric"),
            transformer=MinMaxScaler(),
        )),
        TransformedTargetRegressor(regressor=Pipeline(steps=[
            ("stp1", MinMaxScaler()),
            ("stp2", KerasAutoEncoder(kind="feedforward_symmetric")),
        ])),
    ],
)
def test_dump_load_models(model):

    X = np.random.random(size=100).reshape(10, 10)
    model.fit(X.copy(), X.copy())
    model_out = model.predict(X.copy())