Example #1
    def __init__(self, model_meta: Model, model_impl: Any):
        if not hasattr(model_impl, "predict"):
            raise ClearboxWrapperException(
                "Model implementation is missing required predict method.")
        if not model_meta:
            raise ClearboxWrapperException("Model is missing metadata.")
        self._model_meta = model_meta
        self._model_impl = model_impl
Example #2
def _enforce_schema(pdf: PyFuncInput, input_schema: Schema):
    """
    Enforce column names and types match the input schema.
    For column names, we check there are no missing columns and reorder the columns to match the
    ordering declared in schema if necessary. Any extra columns are ignored.
    For column types, we make sure the types match schema or can be safely converted to match
    the input schema.
    """
    if isinstance(pdf, (list, np.ndarray, dict)):
        try:
            pdf = pd.DataFrame(pdf)
        except Exception as e:
            message = (
                "This model contains a model signature, which suggests a DataFrame"
                " input. There was an error casting the input data to a DataFrame:"
                " {0}".format(str(e)))
            raise ClearboxWrapperException(message)
    if not isinstance(pdf, pd.DataFrame):
        message = ("Expected input to be a DataFrame, list, numpy array or dict."
                   " Found: %s" % type(pdf).__name__)
        raise ClearboxWrapperException(message)

    if input_schema.has_column_names():
        # make sure there are no missing columns
        col_names = input_schema.column_names()
        expected_names = set(col_names)
        actual_names = set(pdf.columns)
        missing_cols = expected_names - actual_names
        extra_cols = actual_names - expected_names
        # Preserve the ordering of the original columns, since missing/extra
        # columns are likely to be in the same order.
        missing_cols = [c for c in col_names if c in missing_cols]
        extra_cols = [c for c in pdf.columns if c in extra_cols]
        if missing_cols:
            message = ("Model input is missing columns {0}."
                       " Note that there were extra columns: {1}".format(
                           missing_cols, extra_cols))
            raise ClearboxWrapperException(message)
    else:
        # The model signature does not specify column names => we can only verify column count.
        if len(pdf.columns) < len(input_schema.columns):
            message = (
                "Model input is missing input columns. The model signature declares "
                "{0} input columns but the provided input only has "
                "{1} columns. Note: the columns were not named in the signature so we can "
                "only verify their count.").format(len(input_schema.columns),
                                                   len(pdf.columns))
            raise ClearboxWrapperException(message)
        col_names = pdf.columns[:len(input_schema.columns)]
    col_types = input_schema.column_types()
    new_pdf = pd.DataFrame()
    for i, x in enumerate(col_names):
        new_pdf[x] = _enforce_type(x, pdf[x], col_types[i])
    return new_pdf
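
A minimal, self-contained sketch of the column check above, using only pandas; the list of expected names stands in for input_schema.column_names() and the data is hypothetical:

import pandas as pd

expected = ["age", "income", "score"]  # stands in for input_schema.column_names()
pdf = pd.DataFrame({"income": [1.0], "age": [30], "extra": ["x"], "score": [0.5]})

missing_cols = [c for c in expected if c not in set(pdf.columns)]
extra_cols = [c for c in pdf.columns if c not in set(expected)]
if missing_cols:
    raise ValueError("Model input is missing columns {}. Extra columns: {}"
                     .format(missing_cols, extra_cols))
# Reorder to the declared ordering; extra columns are ignored.
pdf = pdf[expected]
print(list(pdf.columns))  # ['age', 'income', 'score']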
Example #3
def _serialize_and_save_model(sk_model: Any, output_path: str,
                              serialization_format: str) -> None:
    """Serialize and save a Scikit-Learn model to a local file.

    Parameters
    ----------
    sk_model : Any
        The Scikit-Learn model to serialize.
    output_path : str
        The file path to which to write the serialized model (.pkl).
    serialization_format : str
        The format in which to serialize the model. This should be one of the following:
        SERIALIZATION_FORMAT_PICKLE or SERIALIZATION_FORMAT_CLOUDPICKLE.

    Raises
    ------
    ClearboxWrapperException
        Unrecognized serialization format.
    """

    with open(output_path, "wb") as out:
        if serialization_format == SERIALIZATION_FORMAT_PICKLE:
            pickle.dump(sk_model, out)
        elif serialization_format == SERIALIZATION_FORMAT_CLOUDPICKLE:
            import cloudpickle

            cloudpickle.dump(sk_model, out)
        else:
            raise ClearboxWrapperException(
                "Unrecognized serialization format: {serialization_format}".
                format(serialization_format=serialization_format))
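
The two branches above matter because the formats are not interchangeable. A sketch, assuming cloudpickle is installed, showing that cloudpickle can serialize objects the stdlib pickle rejects, such as a lambda:

import pickle

import cloudpickle

transform = lambda x: x * 2  # defined inline, so pickle cannot look it up by name

try:
    pickle.dumps(transform)
except Exception as exc:
    print("pickle failed:", exc)

payload = cloudpickle.dumps(transform)  # cloudpickle serializes the function by value
print(cloudpickle.loads(payload)(21))  # 42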
Example #4
    def __init__(self, cols: List[ColumnSpec]):
        if not (all(map(lambda x: x.name is None, cols))
                or all(map(lambda x: x.name is not None, cols))):
            raise ClearboxWrapperException(
                "Creating Schema with a combination of named and unnamed columns "
                "is not allowed. Got column names {}".format(
                    [x.name for x in cols]))
        self._cols = cols
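
The constructor above enforces an all-or-nothing rule on column names. A standalone sketch of the same predicate, with ColumnSpec reduced to a hypothetical namedtuple:

from collections import namedtuple

ColumnSpec = namedtuple("ColumnSpec", ["type", "name"])  # stand-in for the real class

def names_are_consistent(cols):
    # Either every column is named or none of them is.
    return (all(c.name is None for c in cols)
            or all(c.name is not None for c in cols))

print(names_are_consistent([ColumnSpec("double", "a"), ColumnSpec("double", "b")]))  # True
print(names_are_consistent([ColumnSpec("double", "a"), ColumnSpec("double", None)]))  # False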
Example #5
def _validate_db_type_string(db_type):
    """Validate that the db_type parsed from the DB URI is supported."""
    if db_type not in DATABASE_ENGINES:
        error_msg = "Invalid database engine: '%s'. %s" % (
            db_type,
            _UNSUPPORTED_DB_TYPE_MSG,
        )
        raise ClearboxWrapperException(error_msg)
Example #6
def _load_clearbox(path):
    """
    Load PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``keras`` flavor.
    """
    import tensorflow as tf

    if os.path.isfile(os.path.join(path, _KERAS_MODULE_SPEC_PATH)):
        with open(os.path.join(path, _KERAS_MODULE_SPEC_PATH), "r") as f:
            keras_module = importlib.import_module(f.read())
    else:
        import keras

        keras_module = keras

    # By default, we assume the save_format is h5 for backwards compatibility
    save_format = "h5"
    save_format_path = os.path.join(path, _KERAS_SAVE_FORMAT_PATH)
    if os.path.isfile(save_format_path):
        with open(save_format_path, "r") as f:
            save_format = f.read()

    # Only compile the model when it was saved in the SavedModel ("tf") format.
    should_compile = save_format == "tf"
    K = importlib.import_module(keras_module.__name__ + ".backend")
    if (keras_module.__name__ == "tensorflow.keras"
            or K.backend() == "tensorflow"):
        if LooseVersion(tf.__version__) < LooseVersion("2.0.0"):
            graph = tf.Graph()
            sess = tf.Session(graph=graph)
            # By default tf backed models depend on the global graph and session.
            # We create and use a new Graph and Session and store them with the
            # model. This way the model is independent of the global state.
            with graph.as_default():
                with sess.as_default():  # pylint:disable=not-context-manager
                    K.set_learning_phase(0)
                    m = _load_model(
                        path,
                        keras_module=keras_module,
                        save_format=save_format,
                        compile=should_compile,
                    )
                    return _KerasModelWrapper(m, graph, sess)
        else:
            K.set_learning_phase(0)
            m = _load_model(
                path,
                keras_module=keras_module,
                save_format=save_format,
                compile=should_compile,
            )
            return _KerasModelWrapper(m, None, None)

    else:
        raise ClearboxWrapperException("Unsupported backend '%s'" % K._BACKEND)
Example #7
    def __init__(
        self,
        model_meta: Model,
        model_impl: Any,
        preprocessing: Any = None,
        data_preparation: Any = None,
    ):
        if not hasattr(model_impl, "predict"):
            raise ClearboxWrapperException(
                "Model implementation is missing required predict method.")
        if not model_meta:
            raise ClearboxWrapperException("Model is missing metadata.")
        if data_preparation is not None and preprocessing is None:
            raise ValueError(
                "Attribute 'preprocessing' is None but attribute "
                "'data_preparation' is not None. If you have a single-step "
                "preprocessing, pass it as the 'preprocessing' attribute.")

        self._model_meta = model_meta
        self._model_impl = model_impl
        self._preprocessing = preprocessing
        self._data_preparation = data_preparation
Example #8
def _load_model(path, **kwargs):
    """
    :param path: The path to a serialized PyTorch model.
    :param kwargs: Additional kwargs to pass to the PyTorch ``torch.load`` function.
    """
    import torch

    if os.path.isdir(path):
        # `path` is a directory containing a serialized PyTorch model and a text file containing
        # information about the pickle module that should be used by PyTorch to load it
        model_path = os.path.join(path, "model.pth")
        pickle_module_path = os.path.join(path, _PICKLE_MODULE_INFO_FILE_NAME)
        with open(pickle_module_path, "r") as f:
            pickle_module_name = f.read()
        if (
            "pickle_module" in kwargs
            and kwargs["pickle_module"].__name__ != pickle_module_name
        ):
            logger.warning(
                "Attempting to load the PyTorch model with a pickle module, '%s', that does not"
                " match the pickle module that was used to save the model: '%s'.",
                kwargs["pickle_module"].__name__,
                pickle_module_name,
            )
        else:
            try:
                kwargs["pickle_module"] = importlib.import_module(pickle_module_name)
            except ImportError as exc:
                raise ClearboxWrapperException(
                    message=(
                        "Failed to import the pickle module that was used to save the PyTorch"
                        " model. Pickle module name: `{pickle_module_name}`".format(
                            pickle_module_name=pickle_module_name
                        )
                    )
                ) from exc

    else:
        model_path = path

    if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"):
        return torch.load(model_path, **kwargs)
    else:
        try:
            # Load the model as an eager model.
            return torch.load(model_path, **kwargs)
        except Exception:
            # If that fails, assume it is a scripted (TorchScript) model.
            return torch.jit.load(model_path)
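
The pickle-module handshake above only needs the module's dotted name, which is why a plain text file is enough to persist it. A stdlib-only sketch of the round trip, with the saved name hypothetical:

import importlib
import pickle

# At save time, only the module name is written to disk.
saved_name = pickle.__name__  # "pickle"

# At load time, the string is turned back into a module object; a mismatch
# with a caller-supplied pickle_module would only trigger a warning above.
pickle_module = importlib.import_module(saved_name)
assert pickle_module is pickle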
Example #9
def _get_flavor_configuration(model_path: str, flavor_name: str) -> Dict:
    """Get the configuration for a specified flavor of a model.

    Parameters
    ----------
    model_path : str
        Path to the model directory.
    flavor_name : str
        Name of the flavor configuration to load.

    Returns
    -------
    Dict
        Flavor configuration as a dictionary.

    Raises
    ------
    ClearboxWrapperException
        If no MLmodel file is found or if the model does not contain
        the specified flavor.
    """
    mlmodel_path = os.path.join(model_path, MLMODEL_FILE_NAME)
    if not os.path.exists(mlmodel_path):
        raise ClearboxWrapperException(
            'Could not find an "{}" configuration file at "{}"'.format(
                MLMODEL_FILE_NAME, model_path
            )
        )

    mlmodel = Model.load(mlmodel_path)
    if flavor_name not in mlmodel.flavors:
        raise ClearboxWrapperException(
            'Model does not have the "{}" flavor'.format(flavor_name)
        )
    flavor_configuration_dict = mlmodel.flavors[flavor_name]
    return flavor_configuration_dict
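
The lookup consumes nothing more than the flavors mapping of the MLmodel file. A sketch, assuming PyYAML and a hypothetical inline document in place of a file on disk:

import yaml

mlmodel_text = """
flavors:
  python_function:
    loader_module: clearbox_wrapper.sklearn
    model_path: model.pkl
  sklearn:
    sklearn_version: 0.24.2
"""

flavors = yaml.safe_load(mlmodel_text)["flavors"]
flavor_name = "sklearn"
if flavor_name not in flavors:
    raise KeyError('Model does not have the "{}" flavor'.format(flavor_name))
print(flavors[flavor_name])  # {'sklearn_version': '0.24.2'}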
Example #10
    def __init__(self,
                 type: DataType,
                 name: Optional[str] = None,
                 has_nans: bool = False):
        self._name = name
        self._has_nans = has_nans
        try:
            self._type = DataType[type] if isinstance(type, str) else type
        except KeyError:
            raise ClearboxWrapperException(
                "Unsupported type '{0}', expected instance of DataType or "
                "one of {1}".format(type, [t.name for t in DataType]))
        if not isinstance(self.type, DataType):
            raise TypeError("Expected DataType or str for the 'type' "
                            "argument, but got {}".format(self.type.__class__))
Example #11
def _infer_numpy_dtype(dtype: np.dtype) -> DataType:
    """Infer DataType from numpy dtype.

    Parameters
    ----------
    dtype : np.dtype
        Numpy dtype

    Returns
    -------
    DataType
        Inferred DataType.

    Raises
    ------
    TypeError
        If type of `dtype` is not numpy.dtype.
    Exception
        If `dtype.kind` == 'O'.
    ClearboxWrapperException
        If `dtype` is unsupported.
    """
    if not isinstance(dtype, np.dtype):
        raise TypeError("Expected numpy.dtype, got '{}'.".format(type(dtype)))
    if dtype.kind == "b":
        return DataType.boolean
    elif dtype.kind == "i" or dtype.kind == "u":
        if dtype.itemsize < 4 or (dtype.kind == "i" and dtype.itemsize == 4):
            return DataType.integer
        elif dtype.itemsize < 8 or (dtype.kind == "i" and dtype.itemsize == 8):
            return DataType.long
    elif dtype.kind == "f":
        if dtype.itemsize <= 4:
            return DataType.float
        elif dtype.itemsize <= 8:
            return DataType.double
    elif dtype.kind == "U":
        return DataType.string
    elif dtype.kind == "S":
        return DataType.binary
    elif dtype.kind == "O":
        raise Exception(
            "Cannot infer np.object without looking at the values; call "
            "_infer_numpy_array instead.")
    raise ClearboxWrapperException(
        "Unsupported numpy data type '{0}', kind '{1}'".format(
            dtype, dtype.kind))
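
The branching above keys entirely on numpy's kind code and itemsize. A quick sketch, using only numpy, of the values those attributes take for the dtypes handled above:

import numpy as np

for dt in (np.dtype(np.int32), np.dtype(np.int64), np.dtype(np.float32),
           np.dtype(np.float64), np.dtype(bool), np.dtype("U10"), np.dtype("S5")):
    print(dt, "kind:", dt.kind, "itemsize:", dt.itemsize)
# int32   -> kind 'i', itemsize 4   (DataType.integer)
# int64   -> kind 'i', itemsize 8   (DataType.long)
# float32 -> kind 'f', itemsize 4   (DataType.float)
# float64 -> kind 'f', itemsize 8   (DataType.double)
# bool    -> kind 'b'               (DataType.boolean)
# <U10 / |S5 -> kind 'U' / 'S'      (string / binary)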
Example #12
def load_model(model_path: str,
               suppress_warnings: bool = False) -> PyFuncModel:
    """Load a model that has python_function flavor.

    Parameters
    ----------
    model_path : str
        Filepath of the model directory.
    suppress_warnings : bool, optional
        If False, non-fatal warning messages associated with the model loading
        process will be emitted, by default False

    Returns
    -------
    PyFuncModel
        A python_function model.

    Raises
    ------
    ClearboxWrapperException
        If the model does not have the python_function flavor.
    """
    mlmodel = Model.load(os.path.join(model_path, MLMODEL_FILE_NAME))
    pyfunc_flavor_configuration = mlmodel.flavors.get(FLAVOR_NAME)

    if pyfunc_flavor_configuration is None:
        raise ClearboxWrapperException(
            'Model does not have the "{flavor_name}" flavor'.format(
                flavor_name=FLAVOR_NAME))

    model_python_version = pyfunc_flavor_configuration.get(PY_VERSION)

    if not suppress_warnings:
        _warn_potentially_incompatible_py_version_if_necessary(
            model_py_version=model_python_version)

    if pyfunc_flavor_configuration.get(CODE):
        code_path = os.path.join(model_path, pyfunc_flavor_configuration[CODE])
        _add_code_to_system_path(code_path=code_path)

    data_path = (os.path.join(model_path, pyfunc_flavor_configuration[DATA]) if
                 (DATA in pyfunc_flavor_configuration) else model_path)

    model_implementation = importlib.import_module(
        pyfunc_flavor_configuration[MAIN])._load_pyfunc(data_path)

    return PyFuncModel(model_meta=mlmodel, model_impl=model_implementation)
Example #13
def get_artifact_repository(self, artifact_uri):
    """Get an artifact repository from the registry based on the scheme of artifact_uri
    :param store_uri: The store URI. This URI is used to select which artifact repository
                      implementation to instantiate and is passed to the
                      constructor of the implementation.
    :return: An instance of `mlflow.store.ArtifactRepository` that fulfills the artifact URI
             requirements.
    """
    scheme = get_uri_scheme(artifact_uri)
    repository = self._registry.get(scheme)
    if repository is None:
        raise ClearboxWrapperException(
            "Could not find a registered artifact repository for: {}. "
            "Currently registered schemes are: {}".format(
                artifact_uri, list(self._registry.keys())))
    return repository(artifact_uri)
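
The registry here is just a scheme-to-constructor map. A minimal standalone sketch of the same dispatch, with the repository classes hypothetical:

from urllib.parse import urlparse

class LocalArtifactRepository:  # hypothetical stand-in
    def __init__(self, artifact_uri):
        self.artifact_uri = artifact_uri

class S3ArtifactRepository:  # hypothetical stand-in
    def __init__(self, artifact_uri):
        self.artifact_uri = artifact_uri

registry = {"": LocalArtifactRepository, "file": LocalArtifactRepository,
            "s3": S3ArtifactRepository}

def get_artifact_repository(artifact_uri):
    scheme = urlparse(artifact_uri).scheme
    repository = registry.get(scheme)
    if repository is None:
        raise KeyError("No artifact repository registered for: {}. "
                       "Registered schemes: {}".format(artifact_uri, sorted(registry)))
    return repository(artifact_uri)

print(type(get_artifact_repository("s3://bucket/model")).__name__)  # S3ArtifactRepository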
Example #14
def extract_db_type_from_uri(db_uri):
    """
    Parse the specified DB URI to extract the database type. Confirm the database type is
    supported. If a driver is specified, confirm it passes a plausible regex.
    """
    scheme = urllib.parse.urlparse(db_uri).scheme
    scheme_plus_count = scheme.count("+")

    if scheme_plus_count == 0:
        db_type = scheme
    elif scheme_plus_count == 1:
        db_type, _ = scheme.split("+")
    else:
        error_msg = "Invalid database URI: '%s'. %s" % (db_uri,
                                                        _INVALID_DB_URI_MSG)
        raise ClearboxWrapperException(error_msg)

    _validate_db_type_string(db_type)

    return db_type
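
SQLAlchemy-style URIs put an optional driver after a '+' inside the scheme, which is what the split above peels off. A stdlib-only sketch:

import urllib.parse

for db_uri in ("postgresql://host/db", "mysql+pymysql://host/db"):
    scheme = urllib.parse.urlparse(db_uri).scheme
    db_type = scheme if "+" not in scheme else scheme.split("+", 1)[0]
    print(db_uri, "->", db_type)
# postgresql://host/db    -> postgresql
# mysql+pymysql://host/db -> mysql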
Example #15
def _load_pyfunc(model_path):
    pyfunc_config = _get_flavor_configuration(model_path=model_path,
                                              flavor_name=FLAVOR_NAME)

    python_model_cloudpickle_version = pyfunc_config.get(
        CONFIG_KEY_CLOUDPICKLE_VERSION, None)
    if python_model_cloudpickle_version is None:
        logger.warning(
            "The version of CloudPickle used to save the model could not be found in"
            " the MLmodel configuration")
    elif python_model_cloudpickle_version != cloudpickle.__version__:
        # CloudPickle does not have a well-defined cross-version compatibility policy. Micro
        # version releases have been known to cause incompatibilities. Therefore, we match
        # on the full library version
        logger.warning(
            "The version of CloudPickle that was used to save the model, `CloudPickle %s`,"
            " differs from the version of CloudPickle that is currently running,"
            " `CloudPickle %s`, and may be incompatible",
            python_model_cloudpickle_version,
            cloudpickle.__version__,
        )

    python_model_subpath = pyfunc_config.get(CONFIG_KEY_PYTHON_MODEL, None)
    if python_model_subpath is None:
        raise ClearboxWrapperException(
            "Python model path was not specified in the model configuration")
    with open(os.path.join(model_path, python_model_subpath), "rb") as f:
        python_model = cloudpickle.load(f)

    artifacts = {}
    for saved_artifact_name, saved_artifact_info in pyfunc_config.get(
            CONFIG_KEY_ARTIFACTS, {}).items():
        artifacts[saved_artifact_name] = os.path.join(
            model_path, saved_artifact_info[CONFIG_KEY_ARTIFACT_RELATIVE_PATH])

    context = PythonModelContext(artifacts=artifacts)
    python_model.load_context(context=context)
    return _PythonModelPyfuncWrapper(python_model=python_model,
                                     context=context)
Example #16
def _load_serialized_model(serialized_model_path: str,
                           serialization_format: str) -> Any:
    """Load a serialized (through pickle or cloudpickle) Scikit-Learn model.

    Parameters
    ----------
    serialized_model_path : str
        File path to the Scikit-Learn serialized model.
    serialization_format : str
        Format in which the model was serialized: SERIALIZATION_FORMAT_PICKLE or
        SERIALIZATION_FORMAT_CLOUDPICKLE

    Returns
    -------
    Any
        A Scikit-Learn model.

    Raises
    ------
    ClearboxWrapperException
        If the serialization format is not recognized.
    """
    # TODO: we could validate the scikit-learn version here
    if serialization_format not in SUPPORTED_SERIALIZATION_FORMATS:
        raise ClearboxWrapperException(
            "Unrecognized serialization format: {serialization_format}. Please specify one"
            " of the following supported formats: {supported_formats}.".format(
                serialization_format=serialization_format,
                supported_formats=SUPPORTED_SERIALIZATION_FORMATS,
            ))
    with open(serialized_model_path, "rb") as f:
        # Models serialized with cloudpickle cannot necessarily be deserialized
        # using pickle, so dispatch on the declared serialization format.
        if serialization_format == SERIALIZATION_FORMAT_PICKLE:
            return pickle.load(f)
        elif serialization_format == SERIALIZATION_FORMAT_CLOUDPICKLE:
            import cloudpickle

            return cloudpickle.load(f)
Example #17
    def predict_proba(self,
                      data: WrapperInput,
                      preprocess: bool = True,
                      prepare_data: bool = True) -> WrapperOutput:
        if not hasattr(self._model_impl, "predict_proba"):
            raise ClearboxWrapperException(
                "This model has no predict_proba method.")

        if prepare_data and self._data_preparation is not None:
            data = self._data_preparation.prepare_data(data)
        elif not prepare_data:
            logger.warning(
                "This model has data preparation and you're bypassing it;"
                " this can lead to unexpected results.")

        if preprocess and self._preprocessing is not None:
            data = self._preprocessing.preprocess(data)
        elif not preprocess:
            logger.warning(
                "This model has preprocessing and you're bypassing it;"
                " this can lead to unexpected results.")

        return self._model_impl.predict_proba(data)
Example #18
def create_and_save_data_preparation(data_preparation_function: Callable,
                                     path: str) -> None:
    """Create, serialize and save a DataPreparation instance.

    Parameters
    ----------
    data_preparation_function : Callable
        A function to use as data preparation. You can use your own custom code for
        data preparation, but it must be wrapped in a single function.

        NOTE: If the data preparation includes any kind of fitting on the training dataset
        (e.g. Scikit Learn transformers), it must be performed outside the final data
        preparation function to save. Fit the transformer(s) outside the function and put
        only the transform method inside it. Furthermore, if the entire data preparation
        is performed with a single Scikit-Learn transformer, you can directly pass it
        (fitted) to this method.
    path : str
        Local path to save the data preparation to.

    Raises
    ------
    TypeError
        If data_preparation_function is not a function (Callable type).
    ClearboxWrapperException
        If the data preparation path already exists.
    """
    if not isinstance(data_preparation_function, Callable):
        raise TypeError(
            "data_preparation_function should be a Callable, got '{}'".format(
                type(data_preparation_function)))
    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Data preparation path '{}' already exists".format(path))

    data_preparation = DataPreparation(data_preparation_function)
    with open(path, "wb") as data_preparation_serialized_file:
        cloudpickle.dump(data_preparation, data_preparation_serialized_file)
Example #19
def save_sklearn_model(
    sk_model: Any,
    path: str,
    conda_env: Optional[Union[str, Dict]] = None,
    mlmodel: Optional[Model] = None,
    serialization_format: str = SERIALIZATION_FORMAT_CLOUDPICKLE,
    signature: Optional[Signature] = None,
    add_clearbox_flavor: bool = False,
    preprocessing_subpath: str = None,
    data_preparation_subpath: str = None,
):
    """Save a Scikit-Learn model. Produces an MLflow Model containing the following flavors:
        * wrapper.sklearn
        * wrapper.pyfunc. NOTE: This flavor is only included for scikit-learn models
          that define at least `predict()`, since `predict()` is required for pyfunc model
          inference.

    Parameters
    ----------
    sk_model : Any
        A Scikit-Learn model to be saved.
    path : str
        Local path to save the model to.
    conda_env : Optional[Union[str, Dict]], optional
        A dictionary representation of a Conda environment or the path to a Conda environment
        YAML file, by default None. This describes the environment this model should be run in.
        If None, the default Conda environment will be added to the model. Example of a
        dictionary representation of a Conda environment:
        {
            'name': 'conda-env',
            'channels': ['defaults'],
            'dependencies': [
                'python=3.7.0',
                'scikit-learn=0.19.2'
            ]
        }
    serialization_format : str, optional
        The format in which to serialize the model. This should be one of the formats listed in
        SUPPORTED_SERIALIZATION_FORMATS. Cloudpickle format, SERIALIZATION_FORMAT_CLOUDPICKLE,
        provides better cross-system compatibility by identifying and packaging code
        dependencies with the serialized model, by default SERIALIZATION_FORMAT_CLOUDPICKLE
    signature : Optional[Signature], optional
        A model signature describes model input schema. It can be inferred from datasets with
        valid model type (e.g. the training dataset with target column omitted), by default None

    Raises
    ------
    ClearboxWrapperException
        If unrecognized serialization format or model path already exists.
    """
    import sklearn

    if serialization_format not in SUPPORTED_SERIALIZATION_FORMATS:
        raise ClearboxWrapperException(
            "Unrecognized serialization format: {serialization_format}. Please specify one"
            " of the following supported formats: {supported_formats}.".format(
                serialization_format=serialization_format,
                supported_formats=SUPPORTED_SERIALIZATION_FORMATS,
            ))

    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Model path '{}' already exists".format(path))

    os.makedirs(path)
    if mlmodel is None:
        mlmodel = Model()

    if signature is not None:
        mlmodel.signature = signature

    model_data_subpath = "model.pkl"

    _serialize_and_save_model(
        sk_model=sk_model,
        output_path=os.path.join(path, model_data_subpath),
        serialization_format=serialization_format,
    )

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_sklearn_conda_env(
            include_cloudpickle=(
                serialization_format == SERIALIZATION_FORMAT_CLOUDPICKLE))
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)

    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    # `PyFuncModel` only works for sklearn models that define `predict()`.
    if hasattr(sk_model, "predict"):
        pyfunc.add_pyfunc_flavor_to_model(
            mlmodel,
            loader_module="clearbox_wrapper.sklearn",
            model_path=model_data_subpath,
            env=conda_env_subpath,
        )

    if add_clearbox_flavor:
        add_clearbox_flavor_to_model(
            mlmodel,
            loader_module="clearbox_wrapper.sklearn",
            model_path=model_data_subpath,
            env=conda_env_subpath,
            preprocessing=preprocessing_subpath,
            data_preparation=data_preparation_subpath,
        )

    mlmodel.add_flavor(
        FLAVOR_NAME,
        model_path=model_data_subpath,
        sklearn_version=sklearn.__version__,
        serialization_format=serialization_format,
    )

    mlmodel.save(os.path.join(path, MLMODEL_FILE_NAME))
Example #20
def save_keras_model(keras_model: Any,
                     path: str,
                     conda_env: Optional[Union[str, Dict]] = None,
                     mlmodel: Optional[Model] = None,
                     signature: Optional[Signature] = None,
                     add_clearbox_flavor: bool = False,
                     preprocessing_subpath: str = None,
                     data_preparation_subpath: str = None,
                     keras_module: str = None,
                     custom_objects=None,
                     **kwargs):
    if keras_module is None:

        def _is_plain_keras(model):
            try:
                import keras

                if LooseVersion(keras.__version__) < LooseVersion("2.2.0"):
                    import keras.engine

                    return isinstance(model, keras.engine.Model)
                else:
                    # NB: Network is the first parent with save method
                    import keras.engine.network

                    return isinstance(model, keras.engine.network.Network)
            except ImportError:
                return False

        def _is_tf_keras(model):
            try:
                # NB: Network is not exposed in tf.keras, we check for Model instead.
                import tensorflow.keras.models

                return isinstance(model, tensorflow.keras.models.Model)
            except ImportError:
                return False

        if _is_plain_keras(keras_model):
            keras_module = importlib.import_module("keras")
        elif _is_tf_keras(keras_model):
            keras_module = importlib.import_module("tensorflow.keras")
        else:
            raise ClearboxWrapperException(
                "Unable to infer keras module from the model, please specify "
                "which keras module ('keras' or 'tensorflow.keras') is to be "
                "used to save and load the model.")
    elif isinstance(keras_module, str):
        keras_module = importlib.import_module(keras_module)

    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Model path '{}' already exists".format(path))

    data_subpath = "data"
    data_path = os.path.join(path, data_subpath)
    os.makedirs(data_path)

    if mlmodel is None:
        mlmodel = Model()
    if signature is not None:
        mlmodel.signature = signature

    if custom_objects is not None:
        _save_custom_objects(data_path, custom_objects)

    # save keras module spec to path/data/keras_module.txt
    with open(os.path.join(data_path, _KERAS_MODULE_SPEC_PATH), "w") as f:
        f.write(keras_module.__name__)

    # Use the SavedModel format if `save_format` is unspecified
    save_format = kwargs.get("save_format", "tf")

    # save keras save_format to path/data/save_format.txt
    with open(os.path.join(data_path, _KERAS_SAVE_FORMAT_PATH), "w") as f:
        f.write(save_format)

    # save keras model
    # To maintain prior behavior, when the format is HDF5, we save
    # with the h5 file extension. Otherwise, model_path is a directory
    # where the saved_model.pb will be stored (for SavedModel format)
    file_extension = ".h5" if save_format == "h5" else ""
    model_subpath = os.path.join(data_subpath, _MODEL_SAVE_PATH)
    model_path = os.path.join(path, model_subpath) + file_extension
    keras_model.save(model_path, **kwargs)

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_keras_conda_env(
            include_cloudpickle=custom_objects is not None,
            keras_module=keras_module)
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)

    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    mlmodel.add_flavor(
        FLAVOR_NAME,
        keras_module=keras_module.__name__,
        keras_version=keras_module.__version__,
        save_format=save_format,
        data=data_subpath,
    )

    pyfunc.add_pyfunc_flavor_to_model(
        mlmodel,
        loader_module="clearbox_wrapper.keras",
        data=data_subpath,
        env=conda_env_subpath,
    )

    if add_clearbox_flavor:
        add_clearbox_flavor_to_model(
            mlmodel,
            loader_module="clearbox_wrapper.keras",
            data=data_subpath,
            env=conda_env_subpath,
            preprocessing=preprocessing_subpath,
            data_preparation=data_preparation_subpath,
        )

    mlmodel.save(os.path.join(path, MLMODEL_FILE_NAME))
Example #21
    def predict_proba(self, dataframe):
        if not hasattr(self.xgb_model, "predict_proba"):
            raise ClearboxWrapperException("This model has no predict_proba method.")
        import xgboost as xgb

        return self.xgb_model.predict_proba(xgb.DMatrix(dataframe))
Example #22
def _save_model_with_class_artifacts_params(
    path,
    python_model,
    artifacts=None,
    conda_env=None,
    code_paths=None,
    mlflow_model=new_model,
):
    """
    :param path: The path to which to save the Python model.
    :param python_model: An instance of a subclass of :class:`~PythonModel`. ``python_model``
                        defines how the model loads artifacts and how it performs inference.
    :param artifacts: A dictionary containing ``<name, artifact_uri>`` entries.
                      Remote artifact URIs
                      are resolved to absolute filesystem paths, producing a dictionary of
                      ``<name, absolute_path>`` entries. ``python_model`` can reference these
                      resolved entries as the ``artifacts`` property of the ``context``
                      attribute. If ``None``, no artifacts are added to the model.
    :param conda_env: Either a dictionary representation of a Conda environment or the
                      path to a Conda environment yaml file. If provided, this describes the
                      environment this model should be run in. At minimum, it should specify
                      the dependencies
                      contained in :func:`get_default_conda_env()`. If ``None``, the default
                      :func:`get_default_conda_env()` environment is added to the model.
    :param code_paths: A list of local filesystem paths to Python file dependencies (or
                       directories containing file dependencies). These files are *prepended*
                       to the system path before the model is loaded.
    :param mlflow_model: The model configuration to which to add the ``mlflow.pyfunc`` flavor.
    """
    custom_model_config_kwargs = {
        CONFIG_KEY_CLOUDPICKLE_VERSION: cloudpickle.__version__,
    }
    if isinstance(python_model, PythonModel):
        saved_python_model_subpath = "python_model.pkl"
        with open(os.path.join(path, saved_python_model_subpath), "wb") as out:
            cloudpickle.dump(python_model, out)
        custom_model_config_kwargs[
            CONFIG_KEY_PYTHON_MODEL] = saved_python_model_subpath
    else:
        raise ClearboxWrapperException(
            "`python_model` must be a subclass of `PythonModel`. Instead, found an"
            " object of type: {python_model_type}".format(
                python_model_type=type(python_model)))

    if artifacts:
        saved_artifacts_config = {}
        with TempDir() as tmp_artifacts_dir:
            tmp_artifacts_config = {}
            saved_artifacts_dir_subpath = "artifacts"
            for artifact_name, artifact_uri in artifacts.items():
                tmp_artifact_path = _download_artifact_from_uri(
                    artifact_uri=artifact_uri,
                    output_path=tmp_artifacts_dir.path())
                tmp_artifacts_config[artifact_name] = tmp_artifact_path
                saved_artifact_subpath = posixpath.join(
                    saved_artifacts_dir_subpath,
                    os.path.relpath(path=tmp_artifact_path,
                                    start=tmp_artifacts_dir.path()),
                )
                saved_artifacts_config[artifact_name] = {
                    CONFIG_KEY_ARTIFACT_RELATIVE_PATH: saved_artifact_subpath,
                    CONFIG_KEY_ARTIFACT_URI: artifact_uri,
                }

            shutil.move(
                tmp_artifacts_dir.path(),
                os.path.join(path, saved_artifacts_dir_subpath),
            )
        custom_model_config_kwargs[
            CONFIG_KEY_ARTIFACTS] = saved_artifacts_config

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_conda_env()
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    saved_code_subpath = None
    if code_paths is not None:
        saved_code_subpath = "code"
        for code_path in code_paths:
            _copy_file_or_tree(src=code_path,
                               dst=path,
                               dst_dir=saved_code_subpath)

    add_pyfunc_flavor_to_model(model=mlflow_model,
                               loader_module=__name__,
                               code=saved_code_subpath,
                               env=conda_env_subpath,
                               **custom_model_config_kwargs)
    mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))
Example #23
    def preprocess_data(self, data: WrapperInput) -> WrapperOutput:
        if self._preprocessing is None:
            raise ClearboxWrapperException("This model has no preprocessing.")
        return self._preprocessing.preprocess(data)
Example #24
    def prepare_data(self, data: WrapperInput) -> WrapperOutput:
        if self._data_preparation is None:
            raise ClearboxWrapperException(
                "This model has no data preparation.")
        return self._data_preparation.prepare_data(data)
Example #25
    def save(self, path: str) -> None:
        if os.path.exists(path):
            raise ClearboxWrapperException(
                "Preprocessing path '{}' already exists".format(path))
        with open(path, "wb") as preprocessing_serialized_file:
            cloudpickle.dump(self, preprocessing_serialized_file)
Example #26
def load_model(model_path: str,
               suppress_warnings: bool = False) -> WrapperModel:
    """Load a model that has python_function flavor.

    Parameters
    ----------
    model_path : str
        Filepath of the model directory.
    suppress_warnings : bool, optional
        If Fatal, non-fatal warning messages associated with the model loading process
        will be emitted, by default True

    Returns
    -------
    PyFuncModel
        A python_function model.

    Raises
    ------
    ClearboxWrapperException
        If the model does not have the python_function flavor.
    """
    preprocessing = None
    data_preparation = None

    mlmodel = Model.load(os.path.join(model_path, MLMODEL_FILE_NAME))
    clearbox_flavor_configuration = mlmodel.flavors.get(FLAVOR_NAME)

    if clearbox_flavor_configuration is None:
        raise ClearboxWrapperException(
            'Model does not have the "{flavor_name}" flavor'.format(
                flavor_name=FLAVOR_NAME))

    model_python_version = clearbox_flavor_configuration.get(PY_VERSION)

    if not suppress_warnings:
        _warn_potentially_incompatible_py_version_if_necessary(
            model_py_version=model_python_version)

    data_path = (os.path.join(model_path, clearbox_flavor_configuration[DATA])
                 if (DATA in clearbox_flavor_configuration) else model_path)

    model_implementation = importlib.import_module(
        clearbox_flavor_configuration[MAIN])._load_clearbox(data_path)

    if PREPROCESSING in clearbox_flavor_configuration:
        preprocessing_path = os.path.join(
            model_path, clearbox_flavor_configuration[PREPROCESSING])
        preprocessing = load_serialized_preprocessing(preprocessing_path)

    if DATA_PREPARATION in clearbox_flavor_configuration:
        data_preparation_path = os.path.join(
            model_path, clearbox_flavor_configuration[DATA_PREPARATION])
        data_preparation = load_serialized_data_preparation(
            data_preparation_path)

    loaded_model = WrapperModel(
        model_meta=mlmodel,
        model_impl=model_implementation,
        preprocessing=preprocessing,
        data_preparation=data_preparation,
    )

    return loaded_model
Example #27
def save_model(
    path: str,
    model: Any,
    input_data: Optional[WrapperInput] = None,
    preprocessing: Optional[Callable] = None,
    data_preparation: Optional[Callable] = None,
    additional_deps: Optional[List] = None,
    zip: bool = True,
) -> None:

    path_check = path + ".zip" if zip else path
    if os.path.exists(path_check):
        raise ClearboxWrapperException(
            "Model path '{}' already exists".format(path))

    mlmodel = Model()
    saved_preprocessing_subpath = None
    saved_data_preparation_subpath = None

    if data_preparation is not None and preprocessing is None:
        raise ValueError(
            "Attribute 'preprocessing' is None but attribute "
            "'data_preparation' is not None. If you have a single-step "
            "preprocessing, pass it as the 'preprocessing' attribute.")

    if data_preparation and preprocessing:
        preparation = DataPreparation(data_preparation)
        data_preprocessing = Preprocessing(preprocessing)
        saved_data_preparation_subpath = "data_preparation.pkl"
        saved_preprocessing_subpath = "preprocessing.pkl"
        if input_data is not None:
            if isinstance(input_data, pd.DataFrame) and input_data.shape[0] > 50:
                input_data = input_data.head(50)
            elif isinstance(input_data, np.ndarray) and input_data.shape[0] > 50:
                input_data = input_data[:50, :]

            data_preparation_output = preparation.prepare_data(input_data)
            preprocessing_output = data_preprocessing.preprocess(
                data_preparation_output)
            data_preparation_signature = infer_signature(
                input_data, data_preparation_output)
            preprocessing_signature = infer_signature(data_preparation_output,
                                                      preprocessing_output)
            model_signature = infer_signature(preprocessing_output)
            mlmodel.preparation_signature = data_preparation_signature
            mlmodel.preprocessing_signature = preprocessing_signature
            mlmodel.model_signature = model_signature
    elif preprocessing:
        data_preprocessing = Preprocessing(preprocessing)
        saved_preprocessing_subpath = "preprocessing.pkl"
        if input_data is not None:
            preprocessing_output = data_preprocessing.preprocess(input_data)
            preprocessing_signature = infer_signature(input_data,
                                                      preprocessing_output)
            model_signature = infer_signature(preprocessing_output)
            mlmodel.preprocessing_signature = preprocessing_signature
            mlmodel.model_signature = model_signature
    elif input_data is not None:
        model_signature = infer_signature(input_data)
        mlmodel.model_signature = model_signature

    conda_env = _check_and_get_conda_env(model, additional_deps)
    model_super_classes = get_super_classes_names(model)

    if any("sklearn" in super_class for super_class in model_super_classes):
        save_sklearn_model(
            model,
            path,
            conda_env=conda_env,
            mlmodel=mlmodel,
            add_clearbox_flavor=True,
            preprocessing_subpath=saved_preprocessing_subpath,
            data_preparation_subpath=saved_data_preparation_subpath,
        )
    elif any("xgboost" in super_class for super_class in model_super_classes):
        save_xgboost_model(
            model,
            path,
            conda_env=conda_env,
            mlmodel=mlmodel,
            add_clearbox_flavor=True,
            preprocessing_subpath=saved_preprocessing_subpath,
            data_preparation_subpath=saved_data_preparation_subpath,
        )
    elif any("keras" in super_class for super_class in model_super_classes):
        save_keras_model(
            model,
            path,
            conda_env=conda_env,
            mlmodel=mlmodel,
            add_clearbox_flavor=True,
            preprocessing_subpath=saved_preprocessing_subpath,
            data_preparation_subpath=saved_data_preparation_subpath,
        )
    elif any("torch" in super_class for super_class in model_super_classes):
        save_pytorch_model(
            model,
            path,
            conda_env=conda_env,
            mlmodel=mlmodel,
            add_clearbox_flavor=True,
            preprocessing_subpath=saved_preprocessing_subpath,
            data_preparation_subpath=saved_data_preparation_subpath,
        )

    if preprocessing:
        data_preprocessing.save(os.path.join(path, saved_preprocessing_subpath))
    if data_preparation:
        preparation.save(os.path.join(path, saved_data_preparation_subpath))
    if zip:
        zip_directory(path)
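
The framework dispatch above relies on the names of the model's superclasses. get_super_classes_names is not shown in these examples; a plausible sketch of it, derived from the MRO, with the estimator class hypothetical:

def get_super_classes_names(obj):
    # Fully qualified names of every class in the instance's MRO.
    return ["{}.{}".format(c.__module__, c.__name__) for c in type(obj).__mro__]

class FakeEstimator:  # hypothetical stand-in for e.g. sklearn.base.BaseEstimator
    pass

names = get_super_classes_names(FakeEstimator())
print(names)  # ['__main__.FakeEstimator', 'builtins.object']
print(any("sklearn" in name for name in names))  # False for this stand-in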
Example #28
    def metadata(self):
        """Model metadata."""
        if self._model_meta is None:
            raise ClearboxWrapperException("Model is missing metadata.")
        return self._model_meta
Example #29
def save_pytorch_model(
    pytorch_model: Any,
    path: str,
    conda_env: Optional[Union[str, Dict]] = None,
    mlmodel: Optional[Model] = None,
    signature: Optional[Signature] = None,
    add_clearbox_flavor: bool = False,
    preprocessing_subpath: str = None,
    data_preparation_subpath: str = None,
    code_paths=None,
    pickle_module=None,
    requirements_file=None,
    extra_files=None,
    **kwargs
):
    import torch

    pickle_module = pickle_module or clearbox_pytorch_pickle_module
    if not isinstance(pytorch_model, torch.nn.Module):
        raise TypeError("Argument 'pytorch_model' should be a torch.nn.Module")
    if code_paths is not None:
        if not isinstance(code_paths, list):
            raise TypeError(
                "Argument code_paths should be a list, not {}".format(type(code_paths))
            )

    if os.path.exists(path):
        raise ClearboxWrapperException("Model path '{}' already exists".format(path))
    os.makedirs(path)

    if mlmodel is None:
        mlmodel = Model()
    if signature is not None:
        mlmodel.signature = signature

    model_data_subpath = "data"
    model_data_path = os.path.join(path, model_data_subpath)
    os.makedirs(model_data_path)

    # Persist the pickle module name as a file in the model's `data` directory. This is
    # necessary because the `data` directory is the only available parameter to
    # `_load_pyfunc`, and it does not contain the MLmodel configuration; therefore,
    # it is not sufficient to place the module name in the MLmodel
    #
    # TODO: Stop persisting this information to the filesystem once we have a mechanism for
    # supplying the MLmodel configuration to `mlflow.pytorch._load_pyfunc`
    pickle_module_path = os.path.join(model_data_path, _PICKLE_MODULE_INFO_FILE_NAME)
    with open(pickle_module_path, "w") as f:
        f.write(pickle_module.__name__)

    # Save pytorch model
    model_path = os.path.join(model_data_path, _SERIALIZED_TORCH_MODEL_FILE_NAME)
    if isinstance(pytorch_model, torch.jit.ScriptModule):
        torch.jit.ScriptModule.save(pytorch_model, model_path)
    else:
        torch.save(pytorch_model, model_path, pickle_module=pickle_module, **kwargs)

    torchserve_artifacts_config = {}

    if requirements_file:
        if not isinstance(requirements_file, str):
            raise TypeError("Path to requirements file should be a string")

        with TempDir() as tmp_requirements_dir:
            # Fetch the requirements file into the temp dir before moving it
            # alongside the model.
            _download_artifact_from_uri(
                artifact_uri=requirements_file,
                output_path=tmp_requirements_dir.path())
            rel_path = os.path.basename(requirements_file)
            torchserve_artifacts_config[_REQUIREMENTS_FILE_KEY] = {"path": rel_path}
            shutil.move(tmp_requirements_dir.path(rel_path), path)

    if extra_files:
        torchserve_artifacts_config[_EXTRA_FILES_KEY] = []
        if not isinstance(extra_files, list):
            raise TypeError("Extra files argument should be a list")

        with TempDir() as tmp_extra_files_dir:
            for extra_file in extra_files:
                # Fetch each extra file into the temp dir; the whole dir is
                # moved next to the model below.
                _download_artifact_from_uri(
                    artifact_uri=extra_file,
                    output_path=tmp_extra_files_dir.path())
                rel_path = posixpath.join(
                    _EXTRA_FILES_KEY,
                    os.path.basename(extra_file),
                )
                torchserve_artifacts_config[_EXTRA_FILES_KEY].append(
                    {"path": rel_path})
            shutil.move(
                tmp_extra_files_dir.path(),
                posixpath.join(path, _EXTRA_FILES_KEY),
            )

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_pytorch_conda_env()
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    if code_paths is not None:
        code_dir_subpath = "code"
        for code_path in code_paths:
            _copy_file_or_tree(src=code_path, dst=path, dst_dir=code_dir_subpath)
    else:
        code_dir_subpath = None

    mlmodel.add_flavor(
        FLAVOR_NAME,
        model_data=model_data_subpath,
        pytorch_version=torch.__version__,
        **torchserve_artifacts_config,
    )

    pyfunc.add_pyfunc_flavor_to_model(
        mlmodel,
        loader_module="clearbox_wrapper.pytorch",
        data=model_data_subpath,
        pickle_module_name=pickle_module.__name__,
        code=code_dir_subpath,
        env=conda_env_subpath,
    )

    if add_clearbox_flavor:
        add_clearbox_flavor_to_model(
            mlmodel,
            loader_module="clearbox_wrapper.pytorch",
            data=model_data_subpath,
            pickle_module_name=pickle_module.__name__,
            code=code_dir_subpath,
            env=conda_env_subpath,
            preprocessing=preprocessing_subpath,
            data_preparation=data_preparation_subpath,
        )

    mlmodel.save(os.path.join(path, MLMODEL_FILE_NAME))
Example #30
def _infer_numpy_array(col: np.ndarray) -> DataType:
    """Infer DataType of a numpy array.

    Parameters
    ----------
    col : np.ndarray
        Column representation as a numpy array.

    Returns
    -------
    DataType
        Inferred datatype.

    Raises
    ------
    TypeError
        If `col` is not a numpy array.
    ClearboxWrapperException
        If `col` is not a 1D array.
    """
    if not isinstance(col, np.ndarray):
        raise TypeError("Expected numpy.ndarray, got '{}'.".format(type(col)))
    if len(col.shape) > 1:
        raise ClearboxWrapperException(
            "Expected 1d array, got array with shape {}".format(col.shape))

    class IsInstanceOrNone(object):
        def __init__(self, *args):
            self.classes = args
            self.seen_instances = 0

        def __call__(self, x):
            if x is None:
                return True
            elif any(map(lambda c: isinstance(x, c), self.classes)):
                self.seen_instances += 1
                return True
            else:
                return False

    if col.dtype.kind == "O":
        is_binary_test = IsInstanceOrNone(bytes, bytearray)
        if all(map(is_binary_test, col)) and is_binary_test.seen_instances > 0:
            return DataType.binary
        is_string_test = IsInstanceOrNone(str)
        if all(map(is_string_test, col)) and is_string_test.seen_instances > 0:
            return DataType.string
        # NB: bool is also an instance of int => the boolean test must precede
        # the integer test.
        is_boolean_test = IsInstanceOrNone(bool)
        if all(map(is_boolean_test, col)) and is_boolean_test.seen_instances > 0:
            return DataType.boolean
        is_long_test = IsInstanceOrNone(int)
        if all(map(is_long_test, col)) and is_long_test.seen_instances > 0:
            return DataType.long
        is_double_test = IsInstanceOrNone(float)
        if all(map(is_double_test, col)) and is_double_test.seen_instances > 0:
            return DataType.double
        else:
            raise ClearboxWrapperException(
                "Unable to map 'np.object' type to MLflow DataType. np.object "
                "can be mapped iff all values have identical data type, which "
                "is one of (string, (bytes or bytearray), int, float).")
    else:
        return _infer_numpy_dtype(col.dtype)
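
For object-typed arrays the element values, not the dtype, decide the result. A short demonstration of the kind == 'O' path, using only numpy:

import numpy as np

col = np.array(["a", None, "b"], dtype=object)

non_null = [x for x in col if x is not None]
# All non-null values are str and at least one exists -> DataType.string.
print(all(isinstance(x, str) for x in non_null) and len(non_null) > 0)  # True

# NB: bool is a subclass of int, so the boolean test must run before the
# integer test, exactly as in the code above.
print(isinstance(True, int))  # True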