def test_input_examples(pandas_df_with_all_types):
    sig = infer_signature(pandas_df_with_all_types)
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()
        # the frame read without schema should match except for the binary values
        assert (
            parsed_df.drop(columns=["binary"])
            == _dataframe_from_json(tmp.path(filename)).drop(columns=["binary"])
        ).all().all()

    # pass the input as a dictionary instead
    with TempDir() as tmp:
        d = {
            name: pandas_df_with_all_types[name].values
            for name in pandas_df_with_all_types.columns
        }
        example = _Example(d)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename), sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()

    # input passed as numpy array
    sig = infer_signature(pandas_df_with_all_types.values)
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types.values)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("data",))
        parsed_ary = _dataframe_from_json(tmp.path(filename), schema=sig.inputs).values
        assert (pandas_df_with_all_types.values == parsed_ary).all().all()

    # pass multidimensional array
    with TempDir() as tmp:
        example = np.array([[[1, 2, 3]]])
        with pytest.raises(TensorsNotSupportedException):
            _Example(example)

    # pass a dict of multidimensional arrays
    with TempDir() as tmp:
        example = np.array([[1, 2, 3]])
        with pytest.raises(TensorsNotSupportedException):
            _Example({"x": example, "y": example})

    # pass dict with scalars
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename))
        assert example == parsed_df.to_dict(orient="records")[0]
def test_sparse_matrix_input_examples(dict_of_sparse_matrix):
    for example_type, input_example in dict_of_sparse_matrix.items():
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_matrix = _read_sparse_matrix_from_json(tmp.path(filename), example_type)
            assert np.array_equal(parsed_matrix.toarray(), input_example.toarray())
def test_input_examples_with_nan(df_with_nan, dict_of_ndarrays_with_nans):
    # test setting example with data frame with NaN values in it
    sig = infer_signature(df_with_nan)
    with TempDir() as tmp:
        example = _Example(df_with_nan)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)

        # by definition of NaN, NaN == NaN is False but NaN != NaN is True
        assert (
            (
                (df_with_nan == parsed_df)
                | ((df_with_nan != df_with_nan) & (parsed_df != parsed_df))
            )
            .all()
            .all()
        )

        # the frame read without schema should match except for the binary values
        no_schema_df = _dataframe_from_json(tmp.path(filename))
        a = parsed_df.drop(columns=["binary"])
        b = no_schema_df.drop(columns=["binary"])
        assert ((a == b) | ((a != a) & (b != b))).all().all()

    # pass multidimensional arrays
    for col in dict_of_ndarrays_with_nans:
        input_example = dict_of_ndarrays_with_nans[col]
        sig = infer_signature(input_example)
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename), schema=sig.inputs)
            assert np.array_equal(parsed_ary, input_example, equal_nan=True)

            # without a schema/dtype specified, the parsed tensor keeps None in place of NaN
            no_schema_df = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(
                no_schema_df, np.where(np.isnan(input_example), None, input_example)
            )
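# The NaN-aware equality used above, shown as a minimal self-contained sketch:
# two cells match when they compare equal, or when both sides are NaN (a value
# is NaN exactly when it is not equal to itself). `frames_equal_with_nans` is
# an illustrative helper, not part of the test suite.
import numpy as np
import pandas as pd


def frames_equal_with_nans(a, b):
    return bool(((a == b) | ((a != a) & (b != b))).all().all())


assert frames_equal_with_nans(
    pd.DataFrame({"x": [1.0, np.nan]}), pd.DataFrame({"x": [1.0, np.nan]})
)
assert not frames_equal_with_nans(
    pd.DataFrame({"x": [1.0, np.nan]}), pd.DataFrame({"x": [2.0, np.nan]})
)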
def _save_model_metadata(
    dst_dir, spark_model, mlflow_model, sample_input, conda_env, signature=None, input_example=None
):
    """
    Saves model metadata into the passed-in directory. The persisted metadata assumes that a
    model can be loaded from a relative path to the metadata file (currently hard-coded to
    "sparkml").
    """
    import pyspark

    if sample_input is not None:
        mleap.add_to_model(
            mlflow_model=mlflow_model,
            path=dst_dir,
            spark_model=spark_model,
            sample_input=sample_input,
        )
    if input_example is not None:
        example = _Example(input_example)
        example.save(dst_dir)
        mlflow_model.example_input = example.info
    if signature is not None:
        mlflow_model.signature = signature

    conda_env_subpath = "conda.yaml"
    if conda_env is None:
        conda_env = get_default_conda_env()
    elif not isinstance(conda_env, dict):
        with open(conda_env, "r") as f:
            conda_env = yaml.safe_load(f)
    with open(os.path.join(dst_dir, conda_env_subpath), "w") as f:
        yaml.safe_dump(conda_env, stream=f, default_flow_style=False)

    mlflow_model.add_flavor(
        FLAVOR_NAME, pyspark_version=pyspark.__version__, model_data=_SPARK_MODEL_PATH_SUB
    )
    pyfunc.add_to_model(
        mlflow_model,
        loader_module="mlflow.spark",
        data=_SPARK_MODEL_PATH_SUB,
        env=conda_env_subpath,
    )
    mlflow_model.save(os.path.join(dst_dir, "MLmodel"))
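# A minimal usage sketch for the helper above, assuming `spark_model` is a fitted
# pyspark.ml PipelineModel and `mlflow_model` is a fresh mlflow.models.Model();
# both names are illustrative. After the call, `dst` contains roughly:
#
#   dst/
#     MLmodel      <- flavor config (spark + pyfunc, model_data="sparkml")
#     conda.yaml   <- the conda environment written by this helper
#
# The serialized Spark model itself (the "sparkml" subdirectory) is written by
# the caller, not by _save_model_metadata.
with TempDir() as tmp:
    dst = tmp.path("model")
    os.makedirs(dst)
    _save_model_metadata(
        dst_dir=dst,
        spark_model=spark_model,
        mlflow_model=mlflow_model,
        sample_input=None,  # None skips the optional MLeap export
        conda_env=None,  # None falls back to get_default_conda_env()
    )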
@classmethod
def log(
    cls,
    artifact_path,
    flavor,
    registered_model_name=None,
    signature: ModelSignature = None,
    input_example: ModelInputExample = None,
    **kwargs
):
    """
    Log model using supplied flavor module. If no run is active, this method will create a new
    active run.

    :param artifact_path: Run relative path identifying the model.
    :param flavor: Flavor module to save the model with. The module must have the
                   ``save_model`` function that will persist the model as a valid MLflow model.
    :param registered_model_name: (Experimental) If given, create a model version under
                                  ``registered_model_name``, also creating a registered model
                                  if one with the given name does not exist.
    :param signature: (Experimental) :py:class:`ModelSignature` describes model input and
                      output :py:class:`Schema <mlflow.types.Schema>`. The model signature can
                      be :py:func:`inferred <infer_signature>` from datasets representing valid
                      model input (e.g. the training dataset) and valid model output (e.g. model
                      predictions generated on the training dataset), for example:

                      .. code-block:: python

                          from mlflow.models.signature import infer_signature

                          train = df.drop(columns=["target_label"])
                          signature = infer_signature(train, model.predict(train))

    :param input_example: (Experimental) Input example provides one or several examples of
                          valid model input. The example can be used as a hint of what data to
                          feed the model. The given example will be converted to a Pandas
                          DataFrame and then serialized to json using the Pandas split-oriented
                          format. Bytes are base64-encoded.
    :param kwargs: Extra args passed to the model flavor.
    """
    with TempDir() as tmp:
        local_path = tmp.path("model")
        run_id = mlflow.tracking.fluent._get_or_start_run().info.run_id
        mlflow_model = cls(artifact_path=artifact_path, run_id=run_id)
        if signature is not None:
            mlflow_model.signature = signature
        if input_example is not None:
            input_example = _Example(input_example)
            mlflow_model.input_example = input_example.info
        flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
        if input_example is not None:
            input_example.save(local_path)
        mlflow.tracking.fluent.log_artifacts(local_path, artifact_path)
        try:
            mlflow.tracking.fluent._record_logged_model(mlflow_model)
        except MlflowException:
            # We need to swallow all mlflow exceptions to maintain backwards compatibility
            # with older tracking servers. Only print out a warning for now.
            _logger.warning(
                "Logging model metadata to the tracking server has failed, possibly due to an "
                "older server version. The model artifacts have been logged successfully under "
                "%s. In addition to exporting model artifacts, MLflow clients 1.7.0 and above "
                "attempt to record model metadata to the tracking store. If logging to an "
                "mlflow server via REST, consider upgrading the server version to MLflow "
                "1.7.0 or above.",
                mlflow.get_artifact_uri(),
            )
        if registered_model_name is not None:
            run_id = mlflow.tracking.fluent.active_run().info.run_id
            mlflow.register_model(
                "runs:/%s/%s" % (run_id, artifact_path), registered_model_name
            )
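# A hedged usage sketch of Model.log: logging a fitted scikit-learn model with
# an inferred signature and an input example. `sk_model` and `train` are
# assumed to exist; the extra keyword argument (`sk_model`) is forwarded to
# mlflow.sklearn.save_model via **kwargs.
import mlflow.sklearn
from mlflow.models import Model
from mlflow.models.signature import infer_signature

signature = infer_signature(train, sk_model.predict(train))
Model.log(
    artifact_path="model",
    flavor=mlflow.sklearn,
    sk_model=sk_model,
    signature=signature,
    input_example=train.head(3),
)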
def test_input_examples(pandas_df_with_all_types, dict_of_ndarrays):
    sig = infer_signature(pandas_df_with_all_types)
    # test setting example with data frame with all supported data types
    with TempDir() as tmp:
        example = _Example(pandas_df_with_all_types)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        with open(tmp.path(filename), "r") as f:
            data = json.load(f)
            assert set(data.keys()) == set(("columns", "data"))
        parsed_df = _dataframe_from_json(tmp.path(filename), schema=sig.inputs)
        assert (pandas_df_with_all_types == parsed_df).all().all()
        # the frame read without schema should match except for the binary values
        assert (
            parsed_df.drop(columns=["binary"])
            == _dataframe_from_json(tmp.path(filename)).drop(columns=["binary"])
        ).all().all()

    # NB: drop columns that cannot be encoded by proto_json_utils.py's NumpyEncoder
    new_df = pandas_df_with_all_types.drop(columns=["boolean_ext", "integer_ext", "string_ext"])

    # pass the input as a dictionary instead
    with TempDir() as tmp:
        d = {name: new_df[name].values for name in new_df.columns}
        example = _Example(d)
        example.save(tmp.path())
        filename = example.info["artifact_path"]
        parsed_dict = _read_tensor_input_from_json(tmp.path(filename))
        assert d.keys() == parsed_dict.keys()
        # Asserting on binary would fail since it is converted to base64-encoded strings.
        # The key check above suffices to show that the binary input is stored.
        del d["binary"]
        for key in d:
            assert np.array_equal(d[key], parsed_dict[key])

    # input passed as numpy array
    new_df = pandas_df_with_all_types.drop(columns=["binary"])
    for col in new_df:
        input_example = new_df[col].to_numpy()
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(parsed_ary, input_example)

    # pass multidimensional arrays
    for col in dict_of_ndarrays:
        input_example = dict_of_ndarrays[col]
        with TempDir() as tmp:
            example = _Example(input_example)
            example.save(tmp.path())
            filename = example.info["artifact_path"]
            parsed_ary = _read_tensor_input_from_json(tmp.path(filename))
            assert np.array_equal(parsed_ary, input_example)

    # pass multidimensional array as a list
    example = np.array([[1, 2, 3]])
    with pytest.raises(TensorsNotSupportedException):
        _Example([example, example])

    # pass dict with scalars
    with TempDir() as tmp:
        example = {"a": 1, "b": "abc"}
        x = _Example(example)
        x.save(tmp.path())
        filename = x.info["artifact_path"]
        parsed_df = _dataframe_from_json(tmp.path(filename))
        assert example == parsed_df.to_dict(orient="records")[0]