def test_load_metadata(tmpdir, location):
    """
    Test load_metadata can look in directory given as well as directory above that
    along with dealing with 'FileNotFoundError' when a non-existent file is given.
    """
    model_dir = os.path.join(tmpdir, "some-model-dir")
    os.mkdir(model_dir)
    if location:
        with open(os.path.join(model_dir, location), "w") as f:
            json.dump(dict(key="value"), f)
        assert serializer.load_metadata(model_dir) == dict(key="value")
    else:
        # Attempting to load a file which doesn't exist will raise FileNotFoundError
        with pytest.raises(FileNotFoundError):
            serializer.load_metadata(tmpdir)
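The ``location`` argument above is supplied by parametrization not shown in this excerpt; a minimal sketch of how it might look (the parameter values are assumptions, chosen to exercise both branches and the parent-directory lookup the docstring mentions):

@pytest.fixture(params=["metadata.json", os.path.join("..", "metadata.json"), None])
def location(request):
    # None exercises the FileNotFoundError branch; "../metadata.json" exercises
    # the lookup in the directory above the model directory
    return request.param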
Example #2

    def test_pipeline_serialization(self):

        pipe = Pipeline(
            [
                ("pca1", PCA(n_components=10)),
                (
                    "fu",
                    FeatureUnion(
                        [
                            ("pca2", PCA(n_components=3)),
                            (
                                "pipe",
                                Pipeline(
                                    [
                                        ("minmax", MinMaxScaler()),
                                        ("truncsvd", TruncatedSVD(n_components=7)),
                                    ]
                                ),
                            ),
                        ]
                    ),
                ),
                ("ae", KerasAutoEncoder(kind="feedforward_hourglass")),
            ]
        )

        X = np.random.random(size=100).reshape(10, 10)
        pipe.fit(X.copy(), X.copy())

        with TemporaryDirectory() as tmp:

            # Test dump
            metadata = {"key": "value"}
            serializer.dump(pipe, tmp, metadata=metadata)

            # Test load from the serialized pipeline above
            pipe_clone = serializer.load(tmp)
            metadata_clone = serializer.load_metadata(tmp)

            # Ensure the metadata was saved and loaded back
            self.assertEqual(metadata, metadata_clone)

            # Verify same state for both pipelines
            y_hat_pipe1 = pipe.predict(X.copy()).flatten()
            y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
            self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))

            # Now use dumps/loads
            serialized = serializer.dumps(pipe)
            pipe_clone = serializer.loads(serialized)

            # Verify same state for both pipelines
            y_hat_pipe1 = pipe.predict(X.copy()).flatten()
            y_hat_pipe2 = pipe_clone.predict(X.copy()).flatten()
            self.assertTrue(np.allclose(y_hat_pipe1, y_hat_pipe2))
Example #3
def _load_compressed_metadata(directory: str, name: str):
    """
    Loads the metadata for model 'name' from directory 'directory', and returns
    it as a zlib-compressed pickle, to use as little space as possible in the
    cache.

    Notes
    -----
    A simple measurement indicated that a typical metadata dict uses ~37 kB in
    memory, ~8 kB pickled, and ~4 kB pickled and compressed.

    """
    metadata = serializer.load_metadata(os.path.join(directory, name))
    return zlib.compress(pickle.dumps(metadata))
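The compressed blob round-trips with the standard-library counterparts; a minimal sketch of the reverse operation (the function name is an assumption, not from the source):

def _decompress_metadata(blob: bytes) -> dict:
    # Reverse of _load_compressed_metadata: decompress, then unpickle
    return pickle.loads(zlib.decompress(blob))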
Example #4
def metadata(trained_model_directories, trained_model_directory):
    return serializer.load_metadata(trained_model_directory)
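A hypothetical consumer of this fixture, to show how it slots into a test (the test body is an assumption; load_metadata returns a dict, as the first example shows):

def test_metadata_loaded(metadata):
    # ``metadata`` is the dict returned by serializer.load_metadata above
    assert isinstance(metadata, dict)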
Example #5
    def build(
        self,
        output_dir: Optional[Union[os.PathLike, str]] = None,
        model_register_dir: Optional[Union[os.PathLike, str]] = None,
        replace_cache=False,
    ) -> Tuple[sklearn.base.BaseEstimator, Machine]:
        """
        Always return a model and its metadata.

        If ``output_dir`` is supplied, the model will be saved there.
        ``model_register_dir`` points to the model cache directory from which
        the model will be read, if a cached entry exists. Supplying both reads
        the model from the cache and saves that cached model to the new output
        directory.

        Parameters
        ----------
        output_dir: Optional[Union[os.PathLike, str]]
            A path to where the model will be deposited.
        model_register_dir: Optional[Union[os.PathLike, str]]
            A path to a register, see :func:`gordo.util.disk_registry`.
            If this is None the model is always built; otherwise the builder
            first tries to resolve the model from the registry.
        replace_cache: bool
            Forces a rebuild of the model, and replaces the entry in the cache
            with the new model.

        Returns
        -------
        Tuple[sklearn.base.BaseEstimator, Machine]
            Built model and an updated ``Machine``
        """
        if not model_register_dir:
            model, machine = self._build()
        else:
            logger.debug(
                f"Model caching activated, attempting to read model-location with key "
                f"{self.cache_key} from register {model_register_dir}")
            self.cached_model_path = self.check_cache(model_register_dir)

            if replace_cache:
                logger.info(
                    "replace_cache=True, deleting any existing cache entry")
                disk_registry.delete_value(model_register_dir, self.cache_key)
                self.cached_model_path = None

            # Load the model from previous cached directory
            if self.cached_model_path:
                model = serializer.load(self.cached_model_path)
                metadata = serializer.load_metadata(self.cached_model_path)
                metadata["metadata"][
                    "user_defined"] = self.machine.metadata.user_defined

                metadata["runtime"] = self.machine.runtime

                machine = Machine(**metadata)

            # Otherwise build and cache the model
            else:
                model, machine = self._build()
                self.cached_model_path = self._save_model(
                    model=model,
                    machine=machine,
                    output_dir=output_dir  # type: ignore
                )
                logger.info(
                    f"Built model, and deposited at {self.cached_model_path}")
                logger.info(f"Writing model-location to model registry")
                disk_registry.write_key(  # type: ignore
                    model_register_dir, self.cache_key, self.cached_model_path)

        # Save model to disk, unless we are building for cross-validation purposes only.
        if output_dir and self.machine.evaluation.get("cv_mode") != "cross_val_only":
            self.cached_model_path = self._save_model(model=model,
                                                      machine=machine,
                                                      output_dir=output_dir)
        return model, machine
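A minimal usage sketch of this method (the ``ModelBuilder`` class name and its constructor argument are assumptions drawn from this method's context, not confirmed by the source):

builder = ModelBuilder(machine)  # hypothetical: whatever class owns ``build``
model, machine = builder.build(
    output_dir="/tmp/model-out",          # model is saved here after building
    model_register_dir="/tmp/registry",   # cache lookup / write-through happens here
    replace_cache=False,
)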