def __init__(self, model_meta: Model, model_impl: Any):
    if not hasattr(model_impl, "predict"):
        raise ClearboxWrapperException(
            "Model implementation is missing required predict method.")
    if not model_meta:
        raise ClearboxWrapperException("Model is missing metadata.")
    self._model_meta = model_meta
    self._model_impl = model_impl

def _enforce_schema(pdf: PyFuncInput, input_schema: Schema):
    """Enforce that column names and types match the input schema.

    For column names, check that there are no missing columns and reorder the
    columns to match the ordering declared in the schema if necessary. Any
    extra columns are ignored. For column types, make sure the types match the
    schema or can be safely converted to match the input schema.
    """
    if isinstance(pdf, (list, np.ndarray, dict)):
        try:
            pdf = pd.DataFrame(pdf)
        except Exception as e:
            message = (
                "This model contains a model signature, which suggests a"
                " DataFrame input. There was an error casting the input data"
                " to a DataFrame: {0}".format(str(e)))
            raise ClearboxWrapperException(message)
    if not isinstance(pdf, pd.DataFrame):
        message = ("Expected input to be DataFrame or list. Found: %s" %
                   type(pdf).__name__)
        raise ClearboxWrapperException(message)

    if input_schema.has_column_names():
        # Make sure there are no missing columns.
        col_names = input_schema.column_names()
        expected_names = set(col_names)
        actual_names = set(pdf.columns)
        missing_cols = expected_names - actual_names
        extra_cols = actual_names - expected_names
        # Preserve order from the original columns, since missing/extra
        # columns are likely to be in the same order.
        missing_cols = [c for c in col_names if c in missing_cols]
        extra_cols = [c for c in pdf.columns if c in extra_cols]
        if missing_cols:
            message = ("Model input is missing columns {0}."
                       " Note that there were extra columns: {1}".format(
                           missing_cols, extra_cols))
            raise ClearboxWrapperException(message)
    else:
        # The model signature does not specify column names => we can only
        # verify the column count.
        if len(pdf.columns) < len(input_schema.columns):
            message = (
                "Model input is missing input columns. The model signature"
                " declares {0} input columns but the provided input only has"
                " {1} columns. Note: the columns were not named in the"
                " signature so we can only verify their count.").format(
                    len(input_schema.columns), len(pdf.columns))
            raise ClearboxWrapperException(message)
        col_names = pdf.columns[:len(input_schema.columns)]

    col_types = input_schema.column_types()
    new_pdf = pd.DataFrame()
    for i, x in enumerate(col_names):
        new_pdf[x] = _enforce_type(x, pdf[x], col_types[i])
    return new_pdf

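# Illustrative usage sketch for _enforce_schema (not part of the original
# source). Assumes Schema, ColumnSpec and DataType are the classes defined
# elsewhere in this package, and that _enforce_type accepts int64 columns for
# DataType.long. The function name is hypothetical.
def _example_enforce_schema():
    schema = Schema([ColumnSpec(DataType.long, "a"),
                     ColumnSpec(DataType.long, "b")])
    df = pd.DataFrame({"b": [3, 4], "a": [1, 2], "extra": [0, 0]})
    enforced = _enforce_schema(df, schema)
    # Columns are reordered to ["a", "b"]; "extra" is dropped.
    return enforced
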
def _serialize_and_save_model(sk_model: Any, output_path: str,
                              serialization_format: str) -> None:
    """Serialize and save a Scikit-Learn model to a local file.

    Parameters
    ----------
    sk_model : Any
        The Scikit-Learn model to serialize.
    output_path : str
        The file path to which to write the serialized model (.pkl).
    serialization_format : str
        The format in which to serialize the model. This should be one of the
        following: SERIALIZATION_FORMAT_PICKLE or
        SERIALIZATION_FORMAT_CLOUDPICKLE.

    Raises
    ------
    ClearboxWrapperException
        If the serialization format is unrecognized.
    """
    with open(output_path, "wb") as out:
        if serialization_format == SERIALIZATION_FORMAT_PICKLE:
            pickle.dump(sk_model, out)
        elif serialization_format == SERIALIZATION_FORMAT_CLOUDPICKLE:
            import cloudpickle

            cloudpickle.dump(sk_model, out)
        else:
            raise ClearboxWrapperException(
                "Unrecognized serialization format: {serialization_format}".
                format(serialization_format=serialization_format))

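# Hypothetical usage sketch for _serialize_and_save_model (not part of the
# original source); assumes SERIALIZATION_FORMAT_CLOUDPICKLE is the module
# constant referenced above.
def _example_serialize_model():
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression()
    _serialize_and_save_model(
        sk_model=model,
        output_path="model.pkl",
        serialization_format=SERIALIZATION_FORMAT_CLOUDPICKLE,
    )
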
def __init__(self, cols: List[ColumnSpec]):
    if not (all(map(lambda x: x.name is None, cols))
            or all(map(lambda x: x.name is not None, cols))):
        raise ClearboxWrapperException(
            "Creating Schema with a combination of named and unnamed columns "
            "is not allowed. Got column names {}".format(
                [x.name for x in cols]))
    self._cols = cols

def _validate_db_type_string(db_type):
    """Validate that the db_type parsed from a DB URI is supported."""
    if db_type not in DATABASE_ENGINES:
        error_msg = "Invalid database engine: '%s'. '%s'" % (
            db_type,
            _UNSUPPORTED_DB_TYPE_MSG,
        )
        raise ClearboxWrapperException(error_msg)

def _load_clearbox(path):
    """Load a PyFunc implementation. Called by ``pyfunc.load_pyfunc``.

    :param path: Local filesystem path to the MLflow Model with the ``keras``
                 flavor.
    """
    import tensorflow as tf

    if os.path.isfile(os.path.join(path, _KERAS_MODULE_SPEC_PATH)):
        with open(os.path.join(path, _KERAS_MODULE_SPEC_PATH), "r") as f:
            keras_module = importlib.import_module(f.read())
    else:
        import keras

        keras_module = keras

    # By default, we assume the save_format is h5 for backwards compatibility.
    save_format = "h5"
    save_format_path = os.path.join(path, _KERAS_SAVE_FORMAT_PATH)
    if os.path.isfile(save_format_path):
        with open(save_format_path, "r") as f:
            save_format = f.read()

    # Compile on load only for the SavedModel ("tf") format.
    should_compile = save_format == "tf"
    K = importlib.import_module(keras_module.__name__ + ".backend")
    if keras_module.__name__ == "tensorflow.keras" or K.backend() == "tensorflow":
        if LooseVersion(tf.__version__) < LooseVersion("2.0.0"):
            graph = tf.Graph()
            sess = tf.Session(graph=graph)
            # By default, TF-backed models depend on the global graph and
            # session. We create and use a new Graph and Session and store
            # them with the model. This way the model is independent of the
            # global state.
            with graph.as_default():
                with sess.as_default():  # pylint:disable=not-context-manager
                    K.set_learning_phase(0)
                    m = _load_model(
                        path,
                        keras_module=keras_module,
                        save_format=save_format,
                        compile=should_compile,
                    )
                    return _KerasModelWrapper(m, graph, sess)
        else:
            K.set_learning_phase(0)
            m = _load_model(
                path,
                keras_module=keras_module,
                save_format=save_format,
                compile=should_compile,
            )
            return _KerasModelWrapper(m, None, None)
    else:
        raise ClearboxWrapperException("Unsupported backend '%s'" % K._BACKEND)

def __init__(
    self,
    model_meta: Model,
    model_impl: Any,
    preprocessing: Any = None,
    data_preparation: Any = None,
):
    if not hasattr(model_impl, "predict"):
        raise ClearboxWrapperException(
            "Model implementation is missing required predict method.")
    if not model_meta:
        raise ClearboxWrapperException("Model is missing metadata.")
    if data_preparation is not None and preprocessing is None:
        raise ValueError(
            "Attribute 'preprocessing' is None but attribute "
            "'data_preparation' is not None. If you have a single step "
            "preprocessing, pass it as attribute 'preprocessing'")
    self._model_meta = model_meta
    self._model_impl = model_impl
    self._preprocessing = preprocessing
    self._data_preparation = data_preparation

def _load_model(path, **kwargs):
    """
    :param path: The path to a serialized PyTorch model.
    :param kwargs: Additional kwargs to pass to the PyTorch ``torch.load``
                   function.
    """
    import torch

    if os.path.isdir(path):
        # `path` is a directory containing a serialized PyTorch model and a
        # text file containing information about the pickle module that
        # should be used by PyTorch to load it.
        model_path = os.path.join(path, "model.pth")
        pickle_module_path = os.path.join(path, _PICKLE_MODULE_INFO_FILE_NAME)
        with open(pickle_module_path, "r") as f:
            pickle_module_name = f.read()
        if ("pickle_module" in kwargs
                and kwargs["pickle_module"].__name__ != pickle_module_name):
            logger.warning(
                "Attempting to load the PyTorch model with a pickle module,"
                " '%s', that does not match the pickle module that was used to"
                " save the model: '%s'.",
                kwargs["pickle_module"].__name__,
                pickle_module_name,
            )
        else:
            try:
                kwargs["pickle_module"] = importlib.import_module(
                    pickle_module_name)
            except ImportError as exc:
                raise ClearboxWrapperException(
                    message=(
                        "Failed to import the pickle module that was used to"
                        " save the PyTorch model. Pickle module name:"
                        " `{pickle_module_name}`".format(
                            pickle_module_name=pickle_module_name))) from exc
    else:
        model_path = path

    if LooseVersion(torch.__version__) >= LooseVersion("1.5.0"):
        return torch.load(model_path, **kwargs)
    else:
        try:
            # Load the model as an eager model.
            return torch.load(model_path, **kwargs)
        except Exception:
            # If that fails, assume the model is a scripted model.
            return torch.jit.load(model_path)

def _get_flavor_configuration(model_path: str, flavor_name: str) -> Dict:
    """Get the configuration for a specified flavor of a model.

    Parameters
    ----------
    model_path : str
        Path to the model directory.
    flavor_name : str
        Name of the flavor configuration to load.

    Returns
    -------
    Dict
        Flavor configuration as a dictionary.

    Raises
    ------
    ClearboxWrapperException
        If no MLmodel file is found or the model does not contain the
        specified flavor.
    """
    mlmodel_path = os.path.join(model_path, MLMODEL_FILE_NAME)
    if not os.path.exists(mlmodel_path):
        raise ClearboxWrapperException(
            'Could not find an "{}" configuration file at "{}"'.format(
                MLMODEL_FILE_NAME, model_path))
    mlmodel = Model.load(mlmodel_path)

    if flavor_name not in mlmodel.flavors:
        raise ClearboxWrapperException(
            'Model does not have the "{}" flavor'.format(flavor_name))
    flavor_configuration_dict = mlmodel.flavors[flavor_name]
    return flavor_configuration_dict

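# Illustrative sketch (not part of the original source): reading a flavor
# configuration from a saved model directory, e.g. to discover the
# serialization format chosen at save time. The flavor name "sklearn" and the
# "serialization_format" key are assumptions about this package's flavors.
def _example_get_flavor_configuration(model_path):
    flavor_conf = _get_flavor_configuration(model_path=model_path,
                                            flavor_name="sklearn")
    return flavor_conf.get("serialization_format")
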
def __init__(self,
             type: DataType,
             name: Optional[str] = None,
             has_nans: bool = False):
    self._name = name
    self._has_nans = has_nans
    try:
        self._type = DataType[type] if isinstance(type, str) else type
    except KeyError:
        raise ClearboxWrapperException(
            "Unsupported type '{0}', expected instance of DataType or "
            "one of {1}".format(type, [t.name for t in DataType]))
    if not isinstance(self.type, DataType):
        raise TypeError("Expected DataType or str for the 'type' "
                        "argument, but got {}".format(self.type.__class__))

def _infer_numpy_dtype(dtype: np.dtype) -> DataType:
    """Infer a DataType from a numpy dtype.

    Parameters
    ----------
    dtype : np.dtype
        Numpy dtype.

    Returns
    -------
    DataType
        Inferred DataType.

    Raises
    ------
    TypeError
        If the type of `dtype` is not numpy.dtype.
    Exception
        If `dtype.kind` == 'O'.
    ClearboxWrapperException
        If `dtype` is unsupported.
    """
    if not isinstance(dtype, np.dtype):
        raise TypeError("Expected numpy.dtype, got '{}'.".format(type(dtype)))
    if dtype.kind == "b":
        return DataType.boolean
    elif dtype.kind == "i" or dtype.kind == "u":
        if dtype.itemsize < 4 or (dtype.kind == "i" and dtype.itemsize == 4):
            return DataType.integer
        elif dtype.itemsize < 8 or (dtype.kind == "i" and dtype.itemsize == 8):
            return DataType.long
    elif dtype.kind == "f":
        if dtype.itemsize <= 4:
            return DataType.float
        elif dtype.itemsize <= 8:
            return DataType.double
    elif dtype.kind == "U":
        return DataType.string
    elif dtype.kind == "S":
        return DataType.binary
    elif dtype.kind == "O":
        raise Exception(
            "Can not infer np.object without looking at the values, call "
            "_infer_numpy_array instead.")
    raise ClearboxWrapperException(
        "Unsupported numpy data type '{0}', kind '{1}'".format(
            dtype, dtype.kind))

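# A few worked examples of the mappings above (sketch, not original source;
# the example function name is hypothetical). Each assertion follows directly
# from the branches in _infer_numpy_dtype.
def _example_infer_numpy_dtype():
    assert _infer_numpy_dtype(np.dtype("bool")) == DataType.boolean
    assert _infer_numpy_dtype(np.dtype("int32")) == DataType.integer
    assert _infer_numpy_dtype(np.dtype("int64")) == DataType.long
    assert _infer_numpy_dtype(np.dtype("float32")) == DataType.float
    assert _infer_numpy_dtype(np.dtype("float64")) == DataType.double
    assert _infer_numpy_dtype(np.dtype("<U10")) == DataType.string
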
def load_model(model_path: str, suppress_warnings: bool = False) -> PyFuncModel:
    """Load a model that has the python_function flavor.

    Parameters
    ----------
    model_path : str
        Filepath of the model directory.
    suppress_warnings : bool, optional
        If False, non-fatal warning messages associated with the model
        loading process will be emitted, by default False.

    Returns
    -------
    PyFuncModel
        A python_function model.

    Raises
    ------
    ClearboxWrapperException
        If the model does not have the python_function flavor.
    """
    mlmodel = Model.load(os.path.join(model_path, MLMODEL_FILE_NAME))

    pyfunc_flavor_configuration = mlmodel.flavors.get(FLAVOR_NAME)
    if pyfunc_flavor_configuration is None:
        raise ClearboxWrapperException(
            'Model does not have the "{flavor_name}" flavor'.format(
                flavor_name=FLAVOR_NAME))

    model_python_version = pyfunc_flavor_configuration.get(PY_VERSION)
    if not suppress_warnings:
        _warn_potentially_incompatible_py_version_if_necessary(
            model_py_version=model_python_version)

    if CODE in pyfunc_flavor_configuration and pyfunc_flavor_configuration[CODE]:
        code_path = os.path.join(model_path, pyfunc_flavor_configuration[CODE])
        _add_code_to_system_path(code_path=code_path)

    data_path = (os.path.join(model_path, pyfunc_flavor_configuration[DATA])
                 if DATA in pyfunc_flavor_configuration else model_path)

    model_implementation = importlib.import_module(
        pyfunc_flavor_configuration[MAIN])._load_pyfunc(data_path)

    return PyFuncModel(model_meta=mlmodel, model_impl=model_implementation)

def get_artifact_repository(self, artifact_uri):
    """Get an artifact repository from the registry based on the scheme of
    artifact_uri.

    :param artifact_uri: The artifact URI. This URI is used to select which
                         artifact repository implementation to instantiate
                         and is passed to the constructor of the
                         implementation.
    :return: An instance of `mlflow.store.ArtifactRepository` that fulfills
             the artifact URI requirements.
    """
    scheme = get_uri_scheme(artifact_uri)
    repository = self._registry.get(scheme)
    if repository is None:
        raise ClearboxWrapperException(
            "Could not find a registered artifact repository for: {}. "
            "Currently registered schemes are: {}".format(
                artifact_uri, list(self._registry.keys())))
    return repository(artifact_uri)

def extract_db_type_from_uri(db_uri):
    """Parse the specified DB URI to extract the database type.

    Confirm the database type is supported. If a driver is specified,
    confirm it passes a plausible regex.
    """
    scheme = urllib.parse.urlparse(db_uri).scheme
    scheme_plus_count = scheme.count("+")

    if scheme_plus_count == 0:
        db_type = scheme
    elif scheme_plus_count == 1:
        db_type, _ = scheme.split("+")
    else:
        error_msg = "Invalid database URI: '%s'. %s" % (db_uri,
                                                        _INVALID_DB_URI_MSG)
        raise ClearboxWrapperException(error_msg)

    _validate_db_type_string(db_type)
    return db_type

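# Illustrative sketch (not original source) of how the scheme part of a
# SQLAlchemy-style URI maps to a db_type. Assumes "postgresql" and "mysql"
# are listed in DATABASE_ENGINES; the example function name is hypothetical.
def _example_extract_db_type():
    assert extract_db_type_from_uri("postgresql://user:pw@host/db") == "postgresql"
    # A "+driver" suffix in the scheme is stripped before validation.
    assert extract_db_type_from_uri("mysql+pymysql://user:pw@host/db") == "mysql"
    # More than one '+' in the scheme raises ClearboxWrapperException.
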
def _load_pyfunc(model_path):
    pyfunc_config = _get_flavor_configuration(model_path=model_path,
                                              flavor_name=FLAVOR_NAME)

    python_model_cloudpickle_version = pyfunc_config.get(
        CONFIG_KEY_CLOUDPICKLE_VERSION, None)
    if python_model_cloudpickle_version is None:
        logger.warning(
            "The version of CloudPickle used to save the model could not be"
            " found in the MLmodel configuration")
    elif python_model_cloudpickle_version != cloudpickle.__version__:
        # CloudPickle does not have a well-defined cross-version compatibility
        # policy. Micro version releases have been known to cause
        # incompatibilities. Therefore, we match on the full library version.
        logger.warning(
            "The version of CloudPickle that was used to save the model,"
            " `CloudPickle %s`, differs from the version of CloudPickle that"
            " is currently running, `CloudPickle %s`, and may be incompatible",
            python_model_cloudpickle_version,
            cloudpickle.__version__,
        )

    python_model_subpath = pyfunc_config.get(CONFIG_KEY_PYTHON_MODEL, None)
    if python_model_subpath is None:
        raise ClearboxWrapperException(
            "Python model path was not specified in the model configuration")
    with open(os.path.join(model_path, python_model_subpath), "rb") as f:
        python_model = cloudpickle.load(f)

    artifacts = {}
    for saved_artifact_name, saved_artifact_info in pyfunc_config.get(
            CONFIG_KEY_ARTIFACTS, {}).items():
        artifacts[saved_artifact_name] = os.path.join(
            model_path, saved_artifact_info[CONFIG_KEY_ARTIFACT_RELATIVE_PATH])

    context = PythonModelContext(artifacts=artifacts)
    python_model.load_context(context=context)
    return _PythonModelPyfuncWrapper(python_model=python_model, context=context)

def _load_serialized_model(serialized_model_path: str,
                           serialization_format: str) -> Any:
    """Load a serialized (through pickle or cloudpickle) Scikit-Learn model.

    Parameters
    ----------
    serialized_model_path : str
        File path to the serialized Scikit-Learn model.
    serialization_format : str
        Format in which the model was serialized:
        SERIALIZATION_FORMAT_PICKLE or SERIALIZATION_FORMAT_CLOUDPICKLE.

    Returns
    -------
    Any
        A Scikit-Learn model.

    Raises
    ------
    ClearboxWrapperException
        If the serialization format is unrecognized.
    """
    # TODO: we could validate the scikit-learn version here.
    if serialization_format not in SUPPORTED_SERIALIZATION_FORMATS:
        raise ClearboxWrapperException(
            "Unrecognized serialization format: {serialization_format}. Please"
            " specify one of the following supported formats:"
            " {supported_formats}.".format(
                serialization_format=serialization_format,
                supported_formats=SUPPORTED_SERIALIZATION_FORMATS,
            ))
    with open(serialized_model_path, "rb") as f:
        # Models serialized with cloudpickle cannot necessarily be
        # deserialized using pickle.
        if serialization_format == SERIALIZATION_FORMAT_PICKLE:
            return pickle.load(f)
        elif serialization_format == SERIALIZATION_FORMAT_CLOUDPICKLE:
            import cloudpickle

            return cloudpickle.load(f)

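# Hypothetical round-trip sketch (not original source) pairing
# _serialize_and_save_model with _load_serialized_model; both formats must
# match for the round trip to be valid.
def _example_model_round_trip(sk_model):
    _serialize_and_save_model(sk_model, "model.pkl",
                              SERIALIZATION_FORMAT_CLOUDPICKLE)
    return _load_serialized_model("model.pkl",
                                  SERIALIZATION_FORMAT_CLOUDPICKLE)
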
def predict_proba(self,
                  data: WrapperInput,
                  preprocess: bool = True,
                  prepare_data: bool = True) -> WrapperOutput:
    if not hasattr(self._model_impl, "predict_proba"):
        raise ClearboxWrapperException(
            "This model has no predict_proba method.")
    if prepare_data and self._data_preparation is not None:
        data = self._data_preparation.prepare_data(data)
    elif not prepare_data:
        logger.warning(
            "This model has data preparation and you're bypassing it,"
            " this can lead to unexpected results.")
    if preprocess and self._preprocessing is not None:
        data = self._preprocessing.preprocess(data)
    elif not preprocess:
        logger.warning(
            "This model has preprocessing and you're bypassing it,"
            " this can lead to unexpected results.")
    return self._model_impl.predict_proba(data)

def create_and_save_data_preparation(data_preparation_function: Callable,
                                     path: str) -> None:
    """Create, serialize and save a DataPreparation instance.

    Parameters
    ----------
    data_preparation_function : Callable
        A function to use as data preparation. You can use your own custom
        code for data preparation, but it must be wrapped in a single
        function. NOTE: If the data preparation includes any kind of fitting
        on the training dataset (e.g. Scikit-Learn transformers), it must be
        performed outside the final data preparation function you save. Fit
        the transformer(s) outside the function and put only the transform
        method inside it. Furthermore, if the entire data preparation is
        performed with a single Scikit-Learn transformer, you can directly
        pass it (fitted) to this method.
    path : str
        Local path to save the data preparation to.

    Raises
    ------
    TypeError
        If data_preparation_function is not a function (Callable type).
    ClearboxWrapperException
        If the data preparation path already exists.
    """
    if not isinstance(data_preparation_function, Callable):
        raise TypeError(
            "data_preparation_function should be a Callable, got '{}'".format(
                type(data_preparation_function)))
    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Data preparation path '{}' already exists".format(path))

    data_preparation = DataPreparation(data_preparation_function)
    with open(path, "wb") as data_preparation_serialized_file:
        cloudpickle.dump(data_preparation, data_preparation_serialized_file)

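# Illustrative sketch (not original source) of the pattern the docstring
# above describes: fit a transformer outside the saved function and close
# over only its transform step. The example function name is hypothetical.
def _example_save_data_preparation(x_train):
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler().fit(x_train)  # fitting happens outside

    def prepare(data):
        return scaler.transform(data)  # only the transform goes inside

    create_and_save_data_preparation(prepare, "data_preparation.pkl")
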
def save_sklearn_model(
    sk_model: Any,
    path: str,
    conda_env: Optional[Union[str, Dict]] = None,
    mlmodel: Optional[Model] = None,
    serialization_format: str = SERIALIZATION_FORMAT_CLOUDPICKLE,
    signature: Optional[Signature] = None,
    add_clearbox_flavor: bool = False,
    preprocessing_subpath: str = None,
    data_preparation_subpath: str = None,
):
    """Save a Scikit-Learn model.

    Produces an MLflow Model containing the following flavors:
        * wrapper.sklearn
        * wrapper.pyfunc. NOTE: This flavor is only included for scikit-learn
          models that define at least `predict()`, since `predict()` is
          required for pyfunc model inference.

    Parameters
    ----------
    sk_model : Any
        A Scikit-Learn model to be saved.
    path : str
        Local path to save the model to.
    conda_env : Optional[Union[str, Dict]], optional
        A dictionary representation of a Conda environment or the path to a
        Conda environment YAML file, by default None. This describes the
        environment this model should be run in. If None, the default Conda
        environment will be added to the model. Example of a dictionary
        representation of a Conda environment:
        {
            'name': 'conda-env',
            'channels': ['defaults'],
            'dependencies': [
                'python=3.7.0',
                'scikit-learn=0.19.2'
            ]
        }
    serialization_format : str, optional
        The format in which to serialize the model. This should be one of the
        formats listed in SUPPORTED_SERIALIZATION_FORMATS. Cloudpickle format,
        SERIALIZATION_FORMAT_CLOUDPICKLE, provides better cross-system
        compatibility by identifying and packaging code dependencies with the
        serialized model, by default SERIALIZATION_FORMAT_CLOUDPICKLE.
    signature : Optional[Signature], optional
        A model signature describes the model input schema. It can be
        inferred from datasets with a valid model type (e.g. the training
        dataset with the target column omitted), by default None.

    Raises
    ------
    ClearboxWrapperException
        If the serialization format is unrecognized or the model path already
        exists.
    """
    import sklearn

    if serialization_format not in SUPPORTED_SERIALIZATION_FORMATS:
        raise ClearboxWrapperException(
            "Unrecognized serialization format: {serialization_format}. Please"
            " specify one of the following supported formats:"
            " {supported_formats}.".format(
                serialization_format=serialization_format,
                supported_formats=SUPPORTED_SERIALIZATION_FORMATS,
            ))

    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Model path '{}' already exists".format(path))
    os.makedirs(path)

    if mlmodel is None:
        mlmodel = Model()
    if signature is not None:
        mlmodel.signature = signature

    model_data_subpath = "model.pkl"
    _serialize_and_save_model(
        sk_model=sk_model,
        output_path=os.path.join(path, model_data_subpath),
        serialization_format=serialization_format,
    )

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_sklearn_conda_env(
            include_cloudpickle=serialization_format ==
            SERIALIZATION_FORMAT_CLOUDPICKLE)
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    # `PyFuncModel` only works for sklearn models that define `predict()`.
    if hasattr(sk_model, "predict"):
        pyfunc.add_pyfunc_flavor_to_model(
            mlmodel,
            loader_module="clearbox_wrapper.sklearn",
            model_path=model_data_subpath,
            env=conda_env_subpath,
        )
    if add_clearbox_flavor:
        add_clearbox_flavor_to_model(
            mlmodel,
            loader_module="clearbox_wrapper.sklearn",
            model_path=model_data_subpath,
            env=conda_env_subpath,
            preprocessing=preprocessing_subpath,
            data_preparation=data_preparation_subpath,
        )
    mlmodel.add_flavor(
        FLAVOR_NAME,
        model_path=model_data_subpath,
        sklearn_version=sklearn.__version__,
        serialization_format=serialization_format,
    )
    mlmodel.save(os.path.join(path, MLMODEL_FILE_NAME))

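# Hypothetical usage sketch for save_sklearn_model (not original source);
# assumes infer_signature is this package's signature-inference helper.
def _example_save_sklearn_model(x_train, y_train):
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier().fit(x_train, y_train)
    save_sklearn_model(
        sk_model=model,
        path="my_sklearn_model",
        signature=infer_signature(x_train),
    )
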
def save_keras_model(keras_model: Any,
                     path: str,
                     conda_env: Optional[Union[str, Dict]] = None,
                     mlmodel: Optional[Model] = None,
                     signature: Optional[Signature] = None,
                     add_clearbox_flavor: bool = False,
                     preprocessing_subpath: str = None,
                     data_preparation_subpath: str = None,
                     keras_module: str = None,
                     custom_objects=None,
                     **kwargs):
    if keras_module is None:

        def _is_plain_keras(model):
            try:
                import keras

                if LooseVersion(keras.__version__) < LooseVersion("2.2.0"):
                    import keras.engine

                    return isinstance(model, keras.engine.Model)
                else:
                    # NB: Network is the first parent with a save method.
                    import keras.engine.network

                    return isinstance(model, keras.engine.network.Network)
            except ImportError:
                return False

        def _is_tf_keras(model):
            try:
                # NB: Network is not exposed in tf.keras, we check for Model
                # instead.
                import tensorflow.keras.models

                return isinstance(model, tensorflow.keras.models.Model)
            except ImportError:
                return False

        if _is_plain_keras(keras_model):
            keras_module = importlib.import_module("keras")
        elif _is_tf_keras(keras_model):
            keras_module = importlib.import_module("tensorflow.keras")
        else:
            raise ClearboxWrapperException(
                "Unable to infer keras module from the model, please specify "
                "which keras module ('keras' or 'tensorflow.keras') is to be "
                "used to save and load the model.")
    elif isinstance(keras_module, str):
        keras_module = importlib.import_module(keras_module)

    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Model path '{}' already exists".format(path))

    data_subpath = "data"
    data_path = os.path.join(path, data_subpath)
    os.makedirs(data_path)

    if mlmodel is None:
        mlmodel = Model()
    if signature is not None:
        mlmodel.signature = signature
    if custom_objects is not None:
        _save_custom_objects(data_path, custom_objects)

    # Save the keras module spec to path/data/keras_module.txt.
    with open(os.path.join(data_path, _KERAS_MODULE_SPEC_PATH), "w") as f:
        f.write(keras_module.__name__)

    # Use the SavedModel format if `save_format` is unspecified.
    save_format = kwargs.get("save_format", "tf")

    # Save the keras save_format to path/data/save_format.txt.
    with open(os.path.join(data_path, _KERAS_SAVE_FORMAT_PATH), "w") as f:
        f.write(save_format)

    # Save the keras model. To maintain prior behavior, when the format is
    # HDF5 we save with the .h5 file extension. Otherwise, model_path is a
    # directory where the saved_model.pb will be stored (SavedModel format).
    file_extension = ".h5" if save_format == "h5" else ""
    model_subpath = os.path.join(data_subpath, _MODEL_SAVE_PATH)
    model_path = os.path.join(path, model_subpath) + file_extension
    keras_model.save(model_path, **kwargs)

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_keras_conda_env(
            include_cloudpickle=custom_objects is not None,
            keras_module=keras_module)
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    mlmodel.add_flavor(
        FLAVOR_NAME,
        keras_module=keras_module.__name__,
        keras_version=keras_module.__version__,
        save_format=save_format,
        data=data_subpath,
    )
    pyfunc.add_pyfunc_flavor_to_model(
        mlmodel,
        loader_module="clearbox_wrapper.keras",
        data=data_subpath,
        env=conda_env_subpath,
    )
    if add_clearbox_flavor:
        add_clearbox_flavor_to_model(
            mlmodel,
            loader_module="clearbox_wrapper.keras",
            data=data_subpath,
            env=conda_env_subpath,
            preprocessing=preprocessing_subpath,
            data_preparation=data_preparation_subpath,
        )
    mlmodel.save(os.path.join(path, MLMODEL_FILE_NAME))

def predict_proba(self, dataframe):
    if not hasattr(self.xgb_model, "predict_proba"):
        raise ClearboxWrapperException(
            "This model has no predict_proba method.")
    import xgboost as xgb

    return self.xgb_model.predict_proba(xgb.DMatrix(dataframe))

def _save_model_with_class_artifacts_params(
    path,
    python_model,
    artifacts=None,
    conda_env=None,
    code_paths=None,
    mlflow_model=None,
):
    """
    :param path: The path to which to save the Python model.
    :param python_model: An instance of a subclass of :class:`~PythonModel`.
                         ``python_model`` defines how the model loads
                         artifacts and how it performs inference.
    :param artifacts: A dictionary containing ``<name, artifact_uri>``
                      entries. Remote artifact URIs are resolved to absolute
                      filesystem paths, producing a dictionary of
                      ``<name, absolute_path>`` entries. ``python_model`` can
                      reference these resolved entries as the ``artifacts``
                      property of the ``context`` attribute. If ``None``, no
                      artifacts are added to the model.
    :param conda_env: Either a dictionary representation of a Conda
                      environment or the path to a Conda environment yaml
                      file. If provided, this describes the environment this
                      model should be run in. At minimum, it should specify
                      the dependencies contained in
                      :func:`get_default_conda_env()`. If ``None``, the
                      default :func:`get_default_conda_env()` environment is
                      added to the model.
    :param code_paths: A list of local filesystem paths to Python file
                       dependencies (or directories containing file
                       dependencies). These files are *prepended* to the
                       system path before the model is loaded.
    :param mlflow_model: The model configuration to which to add the
                         ``mlflow.pyfunc`` flavor. If ``None``, a new
                         ``Model`` is created.
    """
    if mlflow_model is None:
        mlflow_model = Model()

    custom_model_config_kwargs = {
        CONFIG_KEY_CLOUDPICKLE_VERSION: cloudpickle.__version__,
    }
    if isinstance(python_model, PythonModel):
        saved_python_model_subpath = "python_model.pkl"
        with open(os.path.join(path, saved_python_model_subpath), "wb") as out:
            cloudpickle.dump(python_model, out)
        custom_model_config_kwargs[
            CONFIG_KEY_PYTHON_MODEL] = saved_python_model_subpath
    else:
        raise ClearboxWrapperException(
            "`python_model` must be a subclass of `PythonModel`. Instead,"
            " found an object of type: {python_model_type}".format(
                python_model_type=type(python_model)))

    if artifacts:
        saved_artifacts_config = {}
        with TempDir() as tmp_artifacts_dir:
            tmp_artifacts_config = {}
            saved_artifacts_dir_subpath = "artifacts"
            for artifact_name, artifact_uri in artifacts.items():
                tmp_artifact_path = _download_artifact_from_uri(
                    artifact_uri=artifact_uri,
                    output_path=tmp_artifacts_dir.path())
                tmp_artifacts_config[artifact_name] = tmp_artifact_path
                saved_artifact_subpath = posixpath.join(
                    saved_artifacts_dir_subpath,
                    os.path.relpath(path=tmp_artifact_path,
                                    start=tmp_artifacts_dir.path()),
                )
                saved_artifacts_config[artifact_name] = {
                    CONFIG_KEY_ARTIFACT_RELATIVE_PATH: saved_artifact_subpath,
                    CONFIG_KEY_ARTIFACT_URI: artifact_uri,
                }
            shutil.move(
                tmp_artifacts_dir.path(),
                os.path.join(path, saved_artifacts_dir_subpath),
            )
        custom_model_config_kwargs[CONFIG_KEY_ARTIFACTS] = saved_artifacts_config

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_conda_env()
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    saved_code_subpath = None
    if code_paths is not None:
        saved_code_subpath = "code"
        for code_path in code_paths:
            _copy_file_or_tree(src=code_path,
                               dst=path,
                               dst_dir=saved_code_subpath)

    add_pyfunc_flavor_to_model(model=mlflow_model,
                               loader_module=__name__,
                               code=saved_code_subpath,
                               env=conda_env_subpath,
                               **custom_model_config_kwargs)
    mlflow_model.save(os.path.join(path, MLMODEL_FILE_NAME))

def preprocess_data(self, data: WrapperInput) -> WrapperOutput:
    if self._preprocessing is None:
        raise ClearboxWrapperException("This model has no preprocessing.")
    return self._preprocessing.preprocess(data)

def prepare_data(self, data: WrapperInput) -> WrapperOutput:
    if self._data_preparation is None:
        raise ClearboxWrapperException("This model has no data preparation.")
    return self._data_preparation.prepare_data(data)

def save(self, path: str) -> None:
    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Preprocessing path '{}' already exists".format(path))
    with open(path, "wb") as preprocessing_serialized_file:
        cloudpickle.dump(self, preprocessing_serialized_file)

def load_model(model_path: str, suppress_warnings: bool = False) -> WrapperModel:
    """Load a model that has the clearbox flavor.

    Parameters
    ----------
    model_path : str
        Filepath of the model directory.
    suppress_warnings : bool, optional
        If False, non-fatal warning messages associated with the model
        loading process will be emitted, by default False.

    Returns
    -------
    WrapperModel
        A wrapped model with optional preprocessing and data preparation.

    Raises
    ------
    ClearboxWrapperException
        If the model does not have the clearbox flavor.
    """
    preprocessing = None
    data_preparation = None

    mlmodel = Model.load(os.path.join(model_path, MLMODEL_FILE_NAME))

    clearbox_flavor_configuration = mlmodel.flavors.get(FLAVOR_NAME)
    if clearbox_flavor_configuration is None:
        raise ClearboxWrapperException(
            'Model does not have the "{flavor_name}" flavor'.format(
                flavor_name=FLAVOR_NAME))

    model_python_version = clearbox_flavor_configuration.get(PY_VERSION)
    if not suppress_warnings:
        _warn_potentially_incompatible_py_version_if_necessary(
            model_py_version=model_python_version)

    data_path = (os.path.join(model_path, clearbox_flavor_configuration[DATA])
                 if DATA in clearbox_flavor_configuration else model_path)

    model_implementation = importlib.import_module(
        clearbox_flavor_configuration[MAIN])._load_clearbox(data_path)

    if PREPROCESSING in clearbox_flavor_configuration:
        preprocessing_path = os.path.join(
            model_path, clearbox_flavor_configuration[PREPROCESSING])
        preprocessing = load_serialized_preprocessing(preprocessing_path)

    if DATA_PREPARATION in clearbox_flavor_configuration:
        data_preparation_path = os.path.join(
            model_path, clearbox_flavor_configuration[DATA_PREPARATION])
        data_preparation = load_serialized_data_preparation(
            data_preparation_path)

    loaded_model = WrapperModel(
        model_meta=mlmodel,
        model_impl=model_implementation,
        preprocessing=preprocessing,
        data_preparation=data_preparation,
    )
    return loaded_model

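# Hypothetical usage sketch (not original source): loading a saved model and
# running the full pipeline. Assumes WrapperModel exposes a predict method
# that, like predict_proba above, applies data preparation and preprocessing
# by default.
def _example_load_and_predict(model_path, x):
    loaded_model = load_model(model_path)
    return loaded_model.predict(x)
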
def save_model(
    path: str,
    model: Any,
    input_data: Optional[WrapperInput] = None,
    preprocessing: Optional[Callable] = None,
    data_preparation: Optional[Callable] = None,
    additional_deps: Optional[List] = None,
    zip: bool = True,
) -> None:
    path_check = path + ".zip" if zip else path
    if os.path.exists(path_check):
        raise ClearboxWrapperException(
            "Model path '{}' already exists".format(path))

    mlmodel = Model()
    saved_preprocessing_subpath = None
    saved_data_preparation_subpath = None

    if data_preparation is not None and preprocessing is None:
        raise ValueError(
            "Attribute 'preprocessing' is None but attribute "
            "'data_preparation' is not None. If you have a single step "
            "preprocessing, pass it as attribute 'preprocessing'")
    if data_preparation and preprocessing:
        preparation = DataPreparation(data_preparation)
        data_preprocessing = Preprocessing(preprocessing)
        saved_data_preparation_subpath = "data_preparation.pkl"
        saved_preprocessing_subpath = "preprocessing.pkl"
        if input_data is not None:
            # Keep at most 50 rows of the input data for signature inference.
            if isinstance(input_data, pd.DataFrame) and input_data.shape[0] > 50:
                input_data = input_data.head(50)
            elif isinstance(input_data, np.ndarray) and input_data.shape[0] > 50:
                input_data = input_data[:50, :]
            data_preparation_output = preparation.prepare_data(input_data)
            preprocessing_output = data_preprocessing.preprocess(
                data_preparation_output)
            data_preparation_signature = infer_signature(
                input_data, data_preparation_output)
            preprocessing_signature = infer_signature(data_preparation_output,
                                                      preprocessing_output)
            model_signature = infer_signature(preprocessing_output)
            mlmodel.preparation_signature = data_preparation_signature
            mlmodel.preprocessing_signature = preprocessing_signature
            mlmodel.model_signature = model_signature
    elif preprocessing:
        data_preprocessing = Preprocessing(preprocessing)
        saved_preprocessing_subpath = "preprocessing.pkl"
        if input_data is not None:
            preprocessing_output = data_preprocessing.preprocess(input_data)
            preprocessing_signature = infer_signature(input_data,
                                                      preprocessing_output)
            model_signature = infer_signature(preprocessing_output)
            mlmodel.preprocessing_signature = preprocessing_signature
            mlmodel.model_signature = model_signature
    elif input_data is not None:
        model_signature = infer_signature(input_data)
        mlmodel.model_signature = model_signature

    conda_env = _check_and_get_conda_env(model, additional_deps)
    model_super_classes = get_super_classes_names(model)

    if any("sklearn" in super_class for super_class in model_super_classes):
        save_sklearn_model(
            model,
            path,
            conda_env=conda_env,
            mlmodel=mlmodel,
            add_clearbox_flavor=True,
            preprocessing_subpath=saved_preprocessing_subpath,
            data_preparation_subpath=saved_data_preparation_subpath,
        )
    elif any("xgboost" in super_class for super_class in model_super_classes):
        save_xgboost_model(
            model,
            path,
            conda_env=conda_env,
            mlmodel=mlmodel,
            add_clearbox_flavor=True,
            preprocessing_subpath=saved_preprocessing_subpath,
            data_preparation_subpath=saved_data_preparation_subpath,
        )
    elif any("keras" in super_class for super_class in model_super_classes):
        save_keras_model(
            model,
            path,
            conda_env=conda_env,
            mlmodel=mlmodel,
            add_clearbox_flavor=True,
            preprocessing_subpath=saved_preprocessing_subpath,
            data_preparation_subpath=saved_data_preparation_subpath,
        )
    elif any("torch" in super_class for super_class in model_super_classes):
        save_pytorch_model(
            model,
            path,
            conda_env=conda_env,
            mlmodel=mlmodel,
            add_clearbox_flavor=True,
            preprocessing_subpath=saved_preprocessing_subpath,
            data_preparation_subpath=saved_data_preparation_subpath,
        )

    if preprocessing:
        data_preprocessing.save(os.path.join(path, saved_preprocessing_subpath))
    if data_preparation:
        preparation.save(os.path.join(path, saved_data_preparation_subpath))
    if zip:
        zip_directory(path)

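# Hypothetical end-to-end sketch for save_model (not original source): a
# two-step pipeline where data preparation runs first and preprocessing
# second, so their signatures can be inferred from input_data. The fitted
# `encoder` and `scaler` transformers are assumptions for illustration.
def _example_save_model(model, x_train, encoder, scaler):
    save_model(
        path="my_wrapped_model",
        model=model,
        input_data=x_train,
        preprocessing=scaler.transform,      # second step
        data_preparation=encoder.transform,  # first step
        zip=True,
    )
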
def metadata(self):
    """Model metadata."""
    if self._model_meta is None:
        raise ClearboxWrapperException("Model is missing metadata.")
    return self._model_meta

def save_pytorch_model(
    pytorch_model: Any,
    path: str,
    conda_env: Optional[Union[str, Dict]] = None,
    mlmodel: Optional[Model] = None,
    signature: Optional[Signature] = None,
    add_clearbox_flavor: bool = False,
    preprocessing_subpath: str = None,
    data_preparation_subpath: str = None,
    code_paths=None,
    pickle_module=None,
    requirements_file=None,
    extra_files=None,
    **kwargs
):
    import torch

    pickle_module = pickle_module or clearbox_pytorch_pickle_module
    if not isinstance(pytorch_model, torch.nn.Module):
        raise TypeError("Argument 'pytorch_model' should be a torch.nn.Module")
    if code_paths is not None:
        if not isinstance(code_paths, list):
            raise TypeError(
                "Argument code_paths should be a list, not {}".format(
                    type(code_paths)))

    if os.path.exists(path):
        raise ClearboxWrapperException(
            "Model path '{}' already exists".format(path))
    os.makedirs(path)

    if mlmodel is None:
        mlmodel = Model()
    if signature is not None:
        mlmodel.signature = signature

    model_data_subpath = "data"
    model_data_path = os.path.join(path, model_data_subpath)
    os.makedirs(model_data_path)

    # Persist the pickle module name as a file in the model's `data`
    # directory. This is necessary because the `data` directory is the only
    # available parameter to `_load_pyfunc`, and it does not contain the
    # MLmodel configuration; therefore, it is not sufficient to place the
    # module name in the MLmodel.
    #
    # TODO: Stop persisting this information to the filesystem once we have a
    # mechanism for supplying the MLmodel configuration to
    # `mlflow.pytorch._load_pyfunc`.
    pickle_module_path = os.path.join(model_data_path,
                                      _PICKLE_MODULE_INFO_FILE_NAME)
    with open(pickle_module_path, "w") as f:
        f.write(pickle_module.__name__)

    # Save the PyTorch model.
    model_path = os.path.join(model_data_path,
                              _SERIALIZED_TORCH_MODEL_FILE_NAME)
    if isinstance(pytorch_model, torch.jit.ScriptModule):
        torch.jit.ScriptModule.save(pytorch_model, model_path)
    else:
        torch.save(pytorch_model, model_path, pickle_module=pickle_module,
                   **kwargs)

    torchserve_artifacts_config = {}

    if requirements_file:
        if not isinstance(requirements_file, str):
            raise TypeError("Path to requirements file should be a string")
        with TempDir() as tmp_requirements_dir:
            rel_path = os.path.basename(requirements_file)
            torchserve_artifacts_config[_REQUIREMENTS_FILE_KEY] = {
                "path": rel_path
            }
            shutil.move(tmp_requirements_dir.path(rel_path), path)

    if extra_files:
        torchserve_artifacts_config[_EXTRA_FILES_KEY] = []
        if not isinstance(extra_files, list):
            raise TypeError("Extra files argument should be a list")
        with TempDir() as tmp_extra_files_dir:
            for extra_file in extra_files:
                rel_path = posixpath.join(
                    _EXTRA_FILES_KEY,
                    os.path.basename(extra_file),
                )
                torchserve_artifacts_config[_EXTRA_FILES_KEY].append(
                    {"path": rel_path})
            shutil.move(
                tmp_extra_files_dir.path(),
                posixpath.join(path, _EXTRA_FILES_KEY),
            )

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_pytorch_conda_env()
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(path, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    if code_paths is not None:
        code_dir_subpath = "code"
        for code_path in code_paths:
            _copy_file_or_tree(src=code_path,
                               dst=path,
                               dst_dir=code_dir_subpath)
    else:
        code_dir_subpath = None

    mlmodel.add_flavor(
        FLAVOR_NAME,
        model_data=model_data_subpath,
        pytorch_version=torch.__version__,
        **torchserve_artifacts_config,
    )
    pyfunc.add_pyfunc_flavor_to_model(
        mlmodel,
        loader_module="clearbox_wrapper.pytorch",
        data=model_data_subpath,
        pickle_module_name=pickle_module.__name__,
        code=code_dir_subpath,
        env=conda_env_subpath,
    )
    if add_clearbox_flavor:
        add_clearbox_flavor_to_model(
            mlmodel,
            loader_module="clearbox_wrapper.pytorch",
            data=model_data_subpath,
            pickle_module_name=pickle_module.__name__,
            code=code_dir_subpath,
            env=conda_env_subpath,
            preprocessing=preprocessing_subpath,
            data_preparation=data_preparation_subpath,
        )
    mlmodel.save(os.path.join(path, MLMODEL_FILE_NAME))

def _infer_numpy_array(col: np.ndarray) -> DataType:
    """Infer the DataType of a numpy array.

    Parameters
    ----------
    col : np.ndarray
        Column representation as a numpy array.

    Returns
    -------
    DataType
        Inferred datatype.

    Raises
    ------
    TypeError
        If `col` is not a numpy array.
    ClearboxWrapperException
        If `col` is not a 1D array.
    """
    if not isinstance(col, np.ndarray):
        raise TypeError("Expected numpy.ndarray, got '{}'.".format(type(col)))
    if len(col.shape) > 1:
        raise ClearboxWrapperException(
            "Expected 1d array, got array with shape {}".format(col.shape))

    class IsInstanceOrNone(object):
        def __init__(self, *args):
            self.classes = args
            self.seen_instances = 0

        def __call__(self, x):
            if x is None:
                return True
            elif any(map(lambda c: isinstance(x, c), self.classes)):
                self.seen_instances += 1
                return True
            else:
                return False

    if col.dtype.kind == "O":
        is_binary_test = IsInstanceOrNone(bytes, bytearray)
        if all(map(is_binary_test, col)) and is_binary_test.seen_instances > 0:
            return DataType.binary
        is_string_test = IsInstanceOrNone(str)
        if all(map(is_string_test, col)) and is_string_test.seen_instances > 0:
            return DataType.string
        # NB: bool is also an instance of int => the boolean test must precede
        # the integer test.
        is_boolean_test = IsInstanceOrNone(bool)
        if all(map(is_boolean_test, col)) and is_boolean_test.seen_instances > 0:
            return DataType.boolean
        is_long_test = IsInstanceOrNone(int)
        if all(map(is_long_test, col)) and is_long_test.seen_instances > 0:
            return DataType.long
        is_double_test = IsInstanceOrNone(float)
        if all(map(is_double_test, col)) and is_double_test.seen_instances > 0:
            return DataType.double
        else:
            raise ClearboxWrapperException(
                "Unable to map 'np.object' type to MLflow DataType. np.object"
                " can be mapped iff all values have identical data type, which"
                " is one of (string, (bytes or bytearray), int, float).")
    else:
        return _infer_numpy_dtype(col.dtype)

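# Worked examples of the object-array rules above (sketch, not original
# source; the example function name is hypothetical). None values are
# tolerated, but at least one non-None instance of the type must be seen.
def _example_infer_numpy_array():
    assert _infer_numpy_array(
        np.array(["a", "b", None], dtype=object)) == DataType.string
    assert _infer_numpy_array(
        np.array([True, False], dtype=object)) == DataType.boolean
    assert _infer_numpy_array(
        np.array([1.0, None], dtype=object)) == DataType.double
    # Non-object arrays fall through to _infer_numpy_dtype.
    assert _infer_numpy_array(
        np.array([1, 2, 3], dtype=np.int64)) == DataType.long
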