def test_spark_udf_autofills_no_arguments(spark):
    """Calling a spark_udf with zero column arguments should auto-fill the
    arguments from the model signature's input names; misuse (too few columns,
    missing columns, nameless signature, no signature) raises specific errors.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Echo the received column names once per input row.
            return [model_input.columns] * len(model_input)

    signature = ModelSignature(
        inputs=Schema([ColSpec("long", "a"), ColSpec("long", "b"), ColSpec("long", "c")]),
        outputs=Schema([ColSpec("integer")]),
    )
    good_data = spark.createDataFrame(
        pd.DataFrame(columns=["a", "b", "c", "d"], data={"a": [1], "b": [2], "c": [3], "d": [4]})
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        # udf() with no args: columns a, b, c are picked up from the signature.
        res = good_data.withColumn("res", udf()).select("res").toPandas()
        assert res["res"][0] == ["a", "b", "c"]
        # Passing fewer columns than the signature requires fails inside the udf.
        with pytest.raises(
            pyspark.sql.utils.PythonException,
            match=r"Model input is missing columns. Expected 3 input columns",
        ):
            res = good_data.withColumn("res", udf("b", "c")).select("res").toPandas()
        # this dataframe won't work because it's missing column a
        bad_data = spark.createDataFrame(
            pd.DataFrame(
                columns=["x", "b", "c", "d"], data={"x": [1], "b": [2], "c": [3], "d": [4]}
            )
        )
        with pytest.raises(AnalysisException, match=r"cannot resolve 'a' given input columns"):
            bad_data.withColumn("res", udf())
    nameless_signature = ModelSignature(
        inputs=Schema([ColSpec("long"), ColSpec("long"), ColSpec("long")]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=nameless_signature)
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        # Auto-fill is impossible when the signature carries no column names.
        with pytest.raises(
            MlflowException,
            match=r"Cannot apply udf because no column names specified",
        ):
            good_data.withColumn("res", udf())
    with mlflow.start_run() as run:
        # model without signature
        mlflow.pyfunc.log_model("model", python_model=TestModel())
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        # With no signature at all there is nothing to auto-fill from.
        with pytest.raises(MlflowException, match="Attempting to apply udf on zero columns"):
            res = good_data.withColumn("res", udf()).select("res").toPandas()
def test_spark_udf_autofills_column_names_with_schema(spark):
    """The udf must apply exactly the signature's columns (a, b, c): extra
    columns passed by the caller are dropped, and too few columns fail."""

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # One copy of the received column-name list per input row.
            return [model_input.columns] * len(model_input)

    signature = ModelSignature(
        inputs=Schema([ColSpec("long", name) for name in ("a", "b", "c")]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        data = spark.createDataFrame(
            pd.DataFrame(
                columns=["a", "b", "c", "d"], data={"a": [1], "b": [2], "c": [3], "d": [4]}
            )
        )
        # Too few columns: the model cannot be applied.
        with pytest.raises(pyspark.sql.utils.PythonException):
            res = data.withColumn("res1", udf("a", "b")).select("res1").toPandas()
        # Exact match with the signature's declared columns.
        res = data.withColumn("res2", udf("a", "b", "c")).select("res2").toPandas()
        assert res["res2"][0] == ["a", "b", "c"]
        # The extra column "d" is trimmed away before reaching the model.
        res = data.withColumn("res4", udf("a", "b", "c", "d")).select("res4").toPandas()
        assert res["res4"][0] == ["a", "b", "c"]
def test_serving_model_with_schema(pandas_df_with_all_types):
    """Scoring-server round trip: the served model must receive columns in the
    schema's declared order and dtypes, for both split- and records-oriented
    JSON payloads (the split payload is column-shuffled on purpose)."""

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Report each received column name together with its pandas dtype.
            return [[k, str(v)] for k, v in model_input.dtypes.items()]

    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    with TempDir(chdr=True):
        with mlflow.start_run() as run:
            mlflow.pyfunc.log_model(
                "model", python_model=TestModel(), signature=ModelSignature(schema)
            )
        # Shuffled columns must be re-ordered back to schema order by the server.
        response = pyfunc_serve_and_score_model(
            model_uri="runs:/{}/model".format(run.info.run_id),
            data=json.dumps(df.to_dict(orient="split"), cls=NumpyEncoder),
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
            extra_args=["--no-conda"],
        )
        response_json = json.loads(response.content)
        assert response_json == [[k, str(v)] for k, v in pandas_df_with_all_types.dtypes.items()]
        # Records orientation must produce the identical dtype report.
        response = pyfunc_serve_and_score_model(
            model_uri="runs:/{}/model".format(run.info.run_id),
            data=json.dumps(pandas_df_with_all_types.to_dict(orient="records"), cls=NumpyEncoder),
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_RECORDS_ORIENTED,
            extra_args=["--no-conda"],
        )
        response_json = json.loads(response.content)
        assert response_json == [[k, str(v)] for k, v in pandas_df_with_all_types.dtypes.items()]
def test_missing_value_hint_is_displayed_when_it_should():
    """The missing-value hint should appear only when None values inside an
    integer-typed column are the plausible cause of the dtype mismatch."""
    hint = "Hint: the type mismatch is likely caused by missing values."
    meta = Model()
    meta.signature = ModelSignature(inputs=Schema([ColSpec("integer", "a")]))
    pyfunc_model = PyFuncModel(model_meta=meta, model_impl=TestModel())

    def failing_message(frame):
        # predict must fail for every frame below; hand back the error text.
        with pytest.raises(MlflowException) as exc_info:
            pyfunc_model.predict(frame)
        return str(exc_info.value.message)

    # Integer column containing None: the hint applies.
    msg = failing_message(pd.DataFrame(data=[[1], [None]], columns=["a"]))
    assert "Incompatible input types" in msg
    assert hint in msg
    # Float column containing None: mismatch, but None is not the culprit.
    msg = failing_message(pd.DataFrame(data=[[1.5], [None]], columns=["a"]))
    assert "Incompatible input types" in msg
    assert hint not in msg
    # Plain float64 values with no missing data: no hint either.
    msg = failing_message(pd.DataFrame(data=[[1], [2]], columns=["a"], dtype=np.float64))
    assert "Incompatible input types" in msg
    assert hint not in msg
def test_schema_enforcement_single_named_tensor_schema():
    """With a single named tensor spec, both a dict input and a bare ndarray
    are accepted (the ndarray is wrapped into a dict keyed by the spec name);
    a plain Python list is rejected."""
    m = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.uint64), (-1, 2), "a")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    inp = {
        "a": np.array([[0, 0], [1, 1]], dtype=np.uint64),
    }
    # sanity test that dictionary with correct input works
    res = pyfunc_model.predict(inp)
    assert res == inp
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
    # test single np.ndarray input works and is converted to dictionary
    res = pyfunc_model.predict(inp["a"])
    assert res == inp
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
    # test list does not work
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[0, 0], [1, 1]])
    assert "Model is missing inputs ['a']" in str(ex)
def test_schema_enforcement_named_tensor_schema_1d():
    """A pandas DataFrame matching 1-d tensor specs is converted to a dict of
    1-d arrays; an equivalent dict of arrays is accepted as-is."""
    m = Model()
    input_schema = Schema(
        [
            TensorSpec(np.dtype(np.uint64), (-1,), "a"),
            TensorSpec(np.dtype(np.float32), (-1,), "b"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    pdf = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"])
    pdf["a"] = pdf["a"].astype(np.uint64)
    # Bug fix: the original cast column "b" from column "a" (copy-paste typo).
    # It only worked because the two columns hold identical values row-wise,
    # so observable behavior is unchanged by casting "b" from itself.
    pdf["b"] = pdf["b"].astype(np.float32)
    d_inp = {
        "a": np.array(pdf["a"], dtype=np.uint64),
        "b": np.array(pdf["b"], dtype=np.float32),
    }
    # test dataframe input works for 1d tensor specs and input is converted to dict
    res = pyfunc_model.predict(pdf)
    assert _compare_exact_tensor_dict_input(res, d_inp)
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
    # test that dictionary works too
    res = pyfunc_model.predict(d_inp)
    assert res == d_inp
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
def on_train_end(self, args, state, control, **kwargs):
    """Trainer callback: when training finishes, log the best checkpoint to
    MLflow as a pyfunc model (text in, float matrix of scores out) and
    register it under ``self.registered_name``."""
    input_schema = Schema([ColSpec(name="text", type="string")])
    # Bug fix: `np.float` is a deprecated alias (removed in NumPy >= 1.24;
    # already warning on the pinned numpy==1.20.1). `np.float64` is the
    # identical dtype.
    output_schema = Schema([TensorSpec(np.dtype(np.float64), (-1, -1))])
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)
    pyfunc.log_model(
        # artifact path is _relative_ to run root in mlflow
        artifact_path="bert_classifier_model",
        # Dir with the module files for dependencies
        code_path=[
            os.path.join(os.path.dirname(os.path.abspath(__file__)), "models.py"),
            os.path.join(os.path.dirname(os.path.abspath(__file__)), "utils.py"),
        ],
        python_model=MLFlowBertClassificationModel(),
        artifacts={
            # Best checkpoint directory selected by the trainer.
            "model": state.best_model_checkpoint,
        },
        conda_env={
            'name': 'classifier-env',
            'channels': ['defaults', 'pytorch', 'pypi'],
            'dependencies': [
                'python=3.8.8',
                'pip',
                'pytorch=1.8.0',
                {
                    'pip': [
                        'transformers==4.4.2',
                        'mlflow==1.15.0',
                        'numpy==1.20.1'
                    ]
                }
            ]
        },
        signature=signature,
        # Short registration wait keeps the callback from blocking training teardown.
        await_registration_for=5,
        registered_model_name=self.registered_name,
    )
def test_tensor_schema_enforcement_no_col_names():
    """An unnamed tensor spec accepts a bare ndarray or DataFrame of matching
    shape/dtype; lists, dicts, wrong shapes, wrong dtypes, and empty arrays
    are all rejected with specific messages."""
    m = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 3))])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    test_data = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
    # Can call with numpy array of correct shape
    assert np.array_equal(pyfunc_model.predict(test_data), test_data)
    # Or can call with a dataframe
    assert np.array_equal(pyfunc_model.predict(pd.DataFrame(test_data)), test_data)
    # Can not call with a list
    with pytest.raises(
        MlflowException,
        match="This model contains a tensor-based model signature with no input names",
    ):
        pyfunc_model.predict([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    # Can not call with a dict
    with pytest.raises(
        MlflowException,
        match="This model contains a tensor-based model signature with no input names",
    ):
        pyfunc_model.predict({"blah": test_data})
    # Can not call with a np.ndarray of a wrong shape
    with pytest.raises(
        MlflowException,
        match=re.escape("Shape of input (2, 2) does not match expected shape (-1, 3)"),
    ):
        pyfunc_model.predict(np.array([[1.0, 2.0], [4.0, 5.0]]))
    # Can not call with a np.ndarray of a wrong type
    with pytest.raises(
        MlflowException,
        match="dtype of input uint32 does not match expected dtype float32",
    ):
        pyfunc_model.predict(test_data.astype(np.uint32))
    # Can call with a np.ndarray with more elements along variable axis
    test_data2 = np.array(
        [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], dtype=np.float32
    )
    assert np.array_equal(pyfunc_model.predict(test_data2), test_data2)
    # Can not call with an empty ndarray
    with pytest.raises(
        MlflowException,
        match=re.escape("Shape of input () does not match expected shape (-1, 3)"),
    ):
        pyfunc_model.predict(np.ndarray([]))
def test_schema_enforcement_no_col_names():
    """With an unnamed column-based schema, inputs are matched positionally:
    lists, DataFrames (named or not), ndarrays, and dicts all work as long as
    arity and dtypes line up; wrong arity/types/containers raise."""

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            # Identity model: echoes the (already schema-enforced) input back.
            return pdf

    m = Model()
    input_schema = Schema([ColSpec("double"), ColSpec("double"), ColSpec("double")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    test_data = [[1.0, 2.0, 3.0]]
    # Can call with just a list
    assert pyfunc_model.predict(test_data).equals(pd.DataFrame(test_data))
    # Or can call with a DataFrame without column names
    assert pyfunc_model.predict(pd.DataFrame(test_data)).equals(pd.DataFrame(test_data))
    # Or can call with a np.ndarray
    assert pyfunc_model.predict(pd.DataFrame(test_data).values).equals(pd.DataFrame(test_data))
    # Or with column names!
    pdf = pd.DataFrame(data=test_data, columns=["a", "b", "c"])
    assert pyfunc_model.predict(pdf).equals(pdf)
    # Must provide the right number of arguments
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1.0, 2.0]])
    assert "the provided input only has 2 columns." in str(ex)
    # Must provide the right types
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1, 2, 3]])
    assert "Can not safely convert int64 to float64" in str(ex)
    # Can only provide data type that can be converted to dataframe...
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(set([1, 2, 3]))
    assert "Expected input to be DataFrame or list. Found: set" in str(ex)
    # 9. dictionaries of str -> list/nparray work
    d = {"a": [1.0], "b": [2.0], "c": [3.0]}
    assert pyfunc_model.predict(d).equals(pd.DataFrame(d))
def test_spark_udf_with_datetime_columns(spark):
    """Timestamp and date columns survive the spark_udf conversion and reach
    the model under their signature-declared names."""

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # One copy of the received column-name list per input row.
            return [model_input.columns] * len(model_input)

    signature = ModelSignature(
        inputs=Schema([ColSpec("datetime", "timestamp"), ColSpec("datetime", "date")]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        udf = mlflow.pyfunc.spark_udf(
            spark, "runs:/{}/model".format(run.info.run_id), result_type=ArrayType(StringType())
        )
        frame = spark.range(10).selectExpr(
            "current_timestamp() as timestamp", "current_date() as date"
        )
        scored = frame.withColumn("res", udf("timestamp", "date")).select("res").toPandas()
        assert scored["res"][0] == ["timestamp", "date"]
def test_schema_enforcement():
    """Column-based schema enforcement: columns are selected and reordered by
    name, extra columns are ignored, and only lossless dtype conversions
    (e.g. int32 -> int64, float32 -> float64) are accepted."""

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            # Identity model: echoes the (already schema-enforced) input back.
            return pdf

    m = Model()
    input_schema = Schema(
        [
            ColSpec("integer", "a"),
            ColSpec("long", "b"),
            ColSpec("float", "c"),
            ColSpec("double", "d"),
            ColSpec("boolean", "e"),
            ColSpec("string", "g"),
            ColSpec("binary", "f"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1])]],
        columns=["b", "d", "a", "c", "e", "g", "f"],
        # `np.object` was removed in NumPy >= 1.24; the builtin `object` is identical.
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)
    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        res = pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f"]])
    assert "Model input is missing columns" in str(ex)
    # test that extra column is ignored
    pdf["x"] = 1
    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    expected_types = dict(zip(input_schema.column_names(), input_schema.pandas_types()))
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types
    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)
    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)
    # 3. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)
    # 4. float -> double works
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # Bug fix: removed a stale `assert "Incompatible input types" in str(ex)`
    # here — this predict succeeds, so that assertion only re-read case 3's
    # old exception info and verified nothing.
    # Bug fix: restore "d" to its schema dtype (float64), not int64, so case 5
    # starts from a clean state and its int64 cast is meaningful.
    pdf["d"] = pdf["d"].astype(np.float64)
    # 5. floats -> ints raises
    pdf["c"] = pdf["c"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["d"] = pdf["d"].astype(np.float64)
    # 6. ints -> floats raises
    pdf["a"] = pdf["a"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    pdf["b"] = pdf["b"].astype(np.int64)
    assert "Incompatible input types" in str(ex)
    # 7. objects work
    pdf["b"] = pdf["b"].astype(object)
    pdf["d"] = pdf["d"].astype(object)
    pdf["e"] = pdf["e"].astype(object)
    pdf["f"] = pdf["f"].astype(object)
    pdf["g"] = pdf["g"].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
def test_tensor_multi_named_schema_enforcement():
    """Multiple named tensor specs: a dict of ndarrays with exact dtypes is
    required; extra keys are dropped, variable (-1) axes may vary, and lists,
    missing keys, castable-but-different dtypes, wrong shapes, empty arrays,
    dict-of-list values, and DataFrames are all rejected."""
    m = Model()
    input_schema = Schema(
        [
            TensorSpec(np.dtype(np.uint64), (-1, 5), "a"),
            TensorSpec(np.dtype(np.short), (-1, 2), "b"),
            TensorSpec(np.dtype(np.float32), (2, -1, 2), "c"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    inp = {
        "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1], [2, 2]], dtype=np.short),
        "c": np.array([[[0, 0], [1, 1]], [[2, 2], [3, 3]]], dtype=np.float32),
    }
    # test that missing column raises
    inp1 = {k: v for k, v in inp.items()}
    with pytest.raises(MlflowException) as ex:
        # NOTE(review): this passes the *popped ndarray* (not the reduced dict
        # `inp1`) to predict; an unnamed ndarray cannot satisfy a multi-input
        # schema, so the error fires either way — confirm whether `inp1` was
        # the intended argument here.
        pyfunc_model.predict(inp1.pop("b"))
    assert "Model is missing inputs" in str(ex)
    # test that extra column is ignored
    inp2 = {k: v for k, v in inp.items()}
    inp2["x"] = 1
    # test that extra column is removed
    res = pyfunc_model.predict(inp2)
    assert res == {k: v for k, v in inp.items() if k in {"a", "b", "c"}}
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
    # test that variable axes are supported
    inp3 = {
        "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [2, 2, 2, 2, 2]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]], [[2, 2]]], dtype=np.float32),
    }
    res = pyfunc_model.predict(inp3)
    assert _compare_exact_tensor_dict_input(res, inp3)
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
    # test that type casting is not supported
    inp4 = {k: v for k, v in inp.items()}
    inp4["a"] = inp4["a"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp4)
    assert "dtype of input int32 does not match expected dtype uint64" in str(ex)
    # test wrong shape
    inp5 = {
        "a": np.array([[0, 0, 0, 0]], dtype=np.uint),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]]], dtype=np.float32),
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp5)
    assert "Shape of input (1, 4) does not match expected shape (-1, 5)" in str(ex)
    # test non-dictionary input
    inp6 = [
        np.array([[0, 0, 0, 0, 0]], dtype=np.uint64),
        np.array([[0, 0], [1, 1]], dtype=np.short),
        np.array([[[0, 0]]], dtype=np.float32),
    ]
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp6)
    assert "Model is missing inputs ['a', 'b', 'c']." in str(ex)
    # test empty ndarray does not work
    inp7 = {k: v for k, v in inp.items()}
    inp7["a"] = np.array([])
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp7)
    assert "Shape of input (0,) does not match expected shape" in str(ex)
    # test dictionary of str -> list does not work
    inp8 = {k: list(v) for k, v in inp.items()}
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp8)
    assert "This model contains a tensor-based model signature with input names" in str(ex)
    assert (
        "suggests a dictionary input mapping input name to a numpy array, but a dict"
        " with value type <class 'list'> was found"
    ) in str(ex)
    # test dataframe input fails at shape enforcement
    pdf = pd.DataFrame(
        data=[[1, 2, 3]],
        columns=["a", "b", "c"],
    )
    pdf["a"] = pdf["a"].astype(np.uint64)
    pdf["b"] = pdf["b"].astype(np.short)
    pdf["c"] = pdf["c"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Shape of input (1,) does not match expected shape (-1, 5)" in str(ex)
def test_column_schema_enforcement():
    """Column-based schema enforcement including datetime columns: name-based
    selection/reordering, only lossless numeric conversions allowed, object
    columns coerced, and dict-of-arrays inputs supported."""
    m = Model()
    input_schema = Schema(
        [
            ColSpec("integer", "a"),
            ColSpec("long", "b"),
            ColSpec("float", "c"),
            ColSpec("double", "d"),
            ColSpec("boolean", "e"),
            ColSpec("string", "g"),
            ColSpec("binary", "f"),
            ColSpec("datetime", "h"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1]), "2021-01-01 00:00:00.1234567"]],
        columns=["b", "d", "a", "c", "e", "g", "f", "h"],
        # `np.object` was removed in NumPy >= 1.24; the builtin `object` is identical.
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)
    # Spell out the unit: unit-less `np.datetime64` astype targets are rejected
    # by pandas >= 2.0; "datetime64[ns]" is what the old spelling resolved to.
    pdf["h"] = pdf["h"].astype("datetime64[ns]")
    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        res = pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f", "h"]])
    assert "Model is missing inputs" in str(ex)
    # test that extra column is ignored
    pdf["x"] = 1
    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    expected_types = dict(zip(input_schema.input_names(), input_schema.pandas_types()))
    # MLflow datetime type in input_schema does not encode precision, so add it for assertions
    expected_types["h"] = np.dtype("datetime64[ns]")
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types
    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)
    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)
    # 3. unsigned int -> long works
    pdf["b"] = pdf["b"].astype(np.uint32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)
    # 4. unsigned int -> int raises
    pdf["a"] = pdf["a"].astype(np.uint32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)
    # 5. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)
    # 6. float -> double works, double -> float does not
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # Bug fix: removed a stale `assert "Incompatible input types" in str(ex)`
    # here — this predict succeeds, so that assertion only re-read case 5's
    # old exception info and verified nothing.
    pdf["d"] = pdf["d"].astype(np.float64)
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)
    # 7. int -> float raises
    pdf["c"] = pdf["c"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["c"] = pdf["c"].astype(np.float32)
    # 8. int -> double works
    pdf["d"] = pdf["d"].astype(np.int32)
    # Bug fix: the original discarded this predict's result and then asserted
    # on the stale `res` from case 6; capture the new result so the two checks
    # below actually exercise the int -> double conversion.
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    # 9. long -> double raises
    pdf["d"] = pdf["d"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["d"] = pdf["d"].astype(np.float64)
    # 10. any float -> any int raises
    pdf["a"] = pdf["a"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex)
    pdf["b"] = pdf["b"].astype(np.int64)
    # (removed an exact duplicate of the preceding float64 "b" check)
    # 11. objects work
    pdf["b"] = pdf["b"].astype(object)
    pdf["d"] = pdf["d"].astype(object)
    pdf["e"] = pdf["e"].astype(object)
    pdf["f"] = pdf["f"].astype(object)
    pdf["g"] = pdf["g"].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # 12. datetime64[D] (date only) -> datetime64[x] works
    # NOTE(review): pandas >= 2.0 no longer allows astype("datetime64[D]") on a
    # Series; revisit this cast if the suite moves to pandas 2.x.
    pdf["h"] = pdf["h"].astype("datetime64[D]")
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["h"] = pdf["h"].astype("datetime64[s]")
    # 13. np.ndarrays can be converted to dataframe but have no columns
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf.values)
    assert "Model is missing inputs" in str(ex)
    # 14. dictionaries of str -> list/nparray work
    arr = np.array([1, 2, 3])
    d = {
        "a": arr.astype("int32"),
        "b": arr.astype("int64"),
        "c": arr.astype("float32"),
        "d": arr.astype("float64"),
        "e": [True, False, True],
        "g": ["a", "b", "c"],
        "f": [bytes(0), bytes(1), bytes(1)],
        "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64),
    }
    res = pyfunc_model.predict(d)
    assert res.dtypes.to_dict() == expected_types
    # 15. dictionaries of str -> list[list] fail
    d = {
        "a": [arr.astype("int32")],
        "b": [arr.astype("int64")],
        "c": [arr.astype("float32")],
        "d": [arr.astype("float64")],
        "e": [[True, False, True]],
        "g": [["a", "b", "c"]],
        "f": [[bytes(0), bytes(1), bytes(1)]],
        "h": [np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64)],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert "Incompatible input types" in str(ex)
    # 16. conversion to dataframe fails
    d = {
        "a": [1],
        "b": [1, 2],
        "c": [1, 2, 3],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert (
        "This model contains a column-based signature, which suggests a DataFrame input."
        in str(ex)
    )
except Exception as e: logger.exception("Could not read CSV file: {}".format(e)) exit(1) data.dropna() data = data.drop(["step", "customer", "zipcodeOri", "merchant", "zipMerchant"], axis="columns") input_schema = Schema([ ColSpec("string", "age"), ColSpec("string", "gender"), ColSpec("string", "category"), ColSpec("double", "amount") ]) output_schema = Schema([ColSpec("integer")]) signature = ModelSignature(inputs=input_schema, outputs=output_schema) # Prepare train and test sets data_x = data.drop(["fraud"], axis="columns") data_y = data[["fraud"]] train_x, test_x, train_y, test_y = train_test_split(data_x, data_y) with mlflow.start_run(): # Define pipeline numeric_features = ['amount'] numeric_transformer = Pipeline( steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]) categorical_features = ['age', 'gender', 'category']
early_stopping=True) s = self.tokenizer.decode(s[0], skip_special_tokens=True) return [s] def predict(self, context, model_input): model_input[['name']] = model_input.apply(self.summarize_article) return model_input # Input and Output formats input = json.dumps([{'name': 'text', 'type': 'string'}]) output = json.dumps([{'name': 'text', 'type': 'string'}]) # Load model from spec signature = ModelSignature.from_dict({'inputs': input, 'outputs': output}) #MLFlow Operations mlflow.set_tracking_uri("") tracking_uri = mlflow.get_tracking_uri() print("Current tracking uri: {}".format(tracking_uri)) # Start tracking with mlflow.start_run(run_name="hf_summarizer") as run: print(run.info.run_id) runner = run.info.run_id print("mlflow models serve -m runs:/" + run.info.run_id + "/model --no-conda") mlflow.pyfunc.log_model('model', loader_module=None, data_path=None,