def test_serving_model_with_schema(pandas_df_with_all_types):
    """Serve a model logged with a column signature and inspect the parsed dtypes.

    The logged model answers with ``[column, dtype]`` pairs for the frame it
    receives, so each response shows how the scoring server typed the payload.
    Both the split-oriented and the records-oriented JSON content types are
    exercised against the same logged model.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [[name, str(dtype)] for name, dtype in model_input.dtypes.items()]

    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    shuffled = _shuffle_pdf(pandas_df_with_all_types)
    expected = [[k, str(v)] for k, v in pandas_df_with_all_types.dtypes.items()]

    # (frame to send, pandas orient, scoring-server content type)
    requests = [
        (shuffled, "split", pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED),
        (
            pandas_df_with_all_types,
            "records",
            pyfunc_scoring_server.CONTENT_TYPE_JSON_RECORDS_ORIENTED,
        ),
    ]

    with TempDir(chdr=True):
        with mlflow.start_run() as run:
            mlflow.pyfunc.log_model(
                "model", python_model=TestModel(), signature=ModelSignature(schema)
            )
        model_uri = "runs:/{}/model".format(run.info.run_id)

        for frame, orient, content_type in requests:
            response = pyfunc_serve_and_score_model(
                model_uri=model_uri,
                data=json.dumps(frame.to_dict(orient=orient), cls=NumpyEncoder),
                content_type=content_type,
                extra_args=["--no-conda"],
            )
            assert json.loads(response.content) == expected
def test_schema_enforcement_named_tensor_schema_1d():
    """Check enforcement of a schema made of two named 1-d tensor specs.

    A DataFrame input is expected to come back as a dict of arrays, and a
    dict input is expected to come back unchanged; in both cases the dtypes
    must match the schema.
    """
    m = Model()
    input_schema = Schema(
        [
            TensorSpec(np.dtype(np.uint64), (-1,), "a"),
            TensorSpec(np.dtype(np.float32), (-1,), "b"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())

    pdf = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"])
    pdf["a"] = pdf["a"].astype(np.uint64)
    # Cast column "b" itself (the original derived it from "a"; both hold 0, 1).
    pdf["b"] = pdf["b"].astype(np.float32)
    d_inp = {
        "a": np.array(pdf["a"], dtype=np.uint64),
        "b": np.array(pdf["b"], dtype=np.float32),
    }
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))

    # test dataframe input works for 1d tensor specs and input is converted to dict
    res = pyfunc_model.predict(pdf)
    assert _compare_exact_tensor_dict_input(res, d_inp)
    assert {k: v.dtype for k, v in res.items()} == expected_types

    # test that dictionary works too. Plain ``res == d_inp`` on dicts of
    # multi-element arrays only succeeds when the values are the very same
    # objects, so compare the contents explicitly instead.
    res = pyfunc_model.predict(d_inp)
    assert _compare_exact_tensor_dict_input(res, d_inp)
    assert {k: v.dtype for k, v in res.items()} == expected_types
def test_missing_value_hint_is_displayed_when_it_should():
    """Check when the "missing values" hint is part of the type-mismatch error.

    All three frames are expected to be rejected with "Incompatible input
    types" for an ``integer`` column; only the first one (whole numbers plus
    a ``None``) is expected to carry the hint.
    """
    m = Model()
    input_schema = Schema([ColSpec("integer", "a")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    hint = "Hint: the type mismatch is likely caused by missing values."

    # Whole numbers with a missing entry: hint expected.
    pdf = pd.DataFrame(data=[[1], [None]], columns=["a"])
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value.message)
    assert hint in str(ex.value.message)

    # Fractional value with a missing entry: no hint.
    pdf = pd.DataFrame(data=[[1.5], [None]], columns=["a"])
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    # Inspect the exception message itself; str() of the pytest ExceptionInfo
    # wrapper is not a reliable carrier of the message.
    assert "Incompatible input types" in str(ex.value.message)
    assert hint not in str(ex.value.message)

    # Explicit float64 column without missing entries: no hint.
    pdf = pd.DataFrame(data=[[1], [2]], columns=["a"], dtype=np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value.message)
    assert hint not in str(ex.value.message)
def test_schema_enforcement_single_named_tensor_schema():
    """Check enforcement of a schema holding exactly one named tensor spec.

    A dict input and a bare ``np.ndarray`` input are both expected to yield
    a dict keyed by the tensor name; a plain list is expected to be rejected.
    """
    m = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.uint64), (-1, 2), "a")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    inp = {"a": np.array([[0, 0], [1, 1]], dtype=np.uint64)}
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))

    def assert_same_as_input(res):
        # ``res == inp`` on dicts of multi-element arrays only succeeds when
        # the values are the very same objects; compare contents explicitly.
        assert res.keys() == inp.keys()
        assert all(np.array_equal(res[k], inp[k]) for k in inp)
        assert {k: v.dtype for k, v in res.items()} == expected_types

    # sanity test that dictionary with correct input works
    assert_same_as_input(pyfunc_model.predict(inp))

    # test single np.ndarray input works and is converted to dictionary
    assert_same_as_input(pyfunc_model.predict(inp["a"]))

    # test list does not work
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[0, 0], [1, 1]])
    # Look at the exception itself rather than the ExceptionInfo wrapper.
    assert "Model is missing inputs ['a']" in str(ex.value)
def test_parse_with_schema(pandas_df_with_all_types):
    """Check ``parse_json_input`` when a column schema is supplied.

    First round-trips the fixture frame through the "split" and "records"
    orients and verifies the inferred signature equals the schema. Then
    pins down what the parser currently does with values that do not fit
    the declared column types.
    """
    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    json_str = json.dumps(df.to_dict(orient="split"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="split", schema=schema)
    json_str = json.dumps(df.to_dict(orient="records"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="records", schema=schema)
    assert schema == infer_signature(df[schema.column_names()]).inputs

    # The current behavior with pandas json parse with type hints is weird. In some cases, the
    # types are forced ignoring overflow and loss of precision:
    bad_df = (
        '{ "columns":["bad_integer", "bad_float", "bad_string", "bad_boolean"], '
        '"data":[ [9007199254740991.0, 1.1, 1, 1.5], '
        "[9007199254740992.0, 9007199254740992.0, 2, 0], "
        '[9007199254740994.0, 3.3, 3, "some arbitrary string"] ] }'
    )
    schema = Schema(
        [
            ColSpec("integer", "bad_integer"),
            ColSpec("float", "bad_float"),
            ColSpec("float", "good_float"),
            ColSpec("string", "bad_string"),
            ColSpec("boolean", "bad_boolean"),
        ]
    )
    df = pyfunc_scoring_server.parse_json_input(bad_df, orient="split", schema=schema)

    # Unfortunately, the current behavior of pandas parse is to force numbers to int32 even if
    # they don't fit:
    assert df["bad_integer"].dtype == np.int32
    assert all(df["bad_integer"] == [-2147483648, -2147483648, -2147483648])

    # The same goes for floats:
    assert df["bad_float"].dtype == np.float32
    assert all(df["bad_float"] == np.array([1.1, 9007199254740992, 3.3], dtype=np.float32))

    # However bad string is recognized as int64:
    # (builtin ``object`` replaces the removed ``np.object`` alias)
    assert all(df["bad_string"] == np.array([1, 2, 3], dtype=object))

    # Boolean is forced - zero and empty string is false, everything else is true:
    # (builtin ``bool`` replaces the removed ``np.bool`` alias)
    assert df["bad_boolean"].dtype == bool
    assert all(df["bad_boolean"] == [True, False, True])
def test_tensor_schema_enforcement_no_col_names():
    """Check enforcement of a single unnamed float32 tensor spec of shape (-1, 3).

    ndarray and DataFrame inputs of matching shape and dtype are expected to
    pass through; lists, dicts, wrong shapes, wrong dtypes and a
    zero-dimensional array are expected to raise ``MlflowException``.
    """
    model_meta = Model()
    model_meta.signature = ModelSignature(
        inputs=Schema([TensorSpec(np.dtype(np.float32), (-1, 3))])
    )
    pyfunc_model = PyFuncModel(model_meta=model_meta, model_impl=TestModel())

    two_rows = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)
    no_names_error = "This model contains a tensor-based model signature with no input names"

    # Can call with numpy array of correct shape
    out = pyfunc_model.predict(two_rows)
    assert np.array_equal(out, two_rows)

    # Or can call with a dataframe
    out = pyfunc_model.predict(pd.DataFrame(two_rows))
    assert np.array_equal(out, two_rows)

    # Can not call with a list
    with pytest.raises(MlflowException, match=no_names_error):
        pyfunc_model.predict([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])

    # Can not call with a dict
    with pytest.raises(MlflowException, match=no_names_error):
        pyfunc_model.predict({"blah": two_rows})

    # Can not call with a np.ndarray of a wrong shape
    wrong_shape_error = re.escape("Shape of input (2, 2) does not match expected shape (-1, 3)")
    with pytest.raises(MlflowException, match=wrong_shape_error):
        pyfunc_model.predict(np.array([[1.0, 2.0], [4.0, 5.0]]))

    # Can not call with a np.ndarray of a wrong type
    wrong_dtype_error = "dtype of input uint32 does not match expected dtype float32"
    with pytest.raises(MlflowException, match=wrong_dtype_error):
        pyfunc_model.predict(two_rows.astype(np.uint32))

    # Can call with a np.ndarray with more elements along variable axis
    three_rows = np.array(
        [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]], dtype=np.float32
    )
    assert np.array_equal(pyfunc_model.predict(three_rows), three_rows)

    # Can not call with an empty ndarray
    empty_error = re.escape("Shape of input () does not match expected shape (-1, 3)")
    with pytest.raises(MlflowException, match=empty_error):
        pyfunc_model.predict(np.ndarray([]))
def test_parse_tf_serving_dictionary():
    """Check ``parse_tf_serving_input`` for payloads carrying named tensors.

    The same three named tensors are sent once in "instances" (row) form and
    once in "inputs" (columnar) form, each parsed with and without a schema.
    """

    def assert_result(result, expected_result):
        assert result.keys() == expected_result.keys()
        for name, tensor in result.items():
            assert (tensor == expected_result[name]).all()

    strings = ["s1", "s2", "s3"]
    floats = [1.1, 2.2, 3.3]
    int_rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]

    expected_result_no_schema = {
        "a": np.array(strings),
        "b": np.array(floats, dtype="float64"),
        "c": np.array(int_rows, dtype="int64"),
    }
    schema = Schema(
        [
            TensorSpec(np.dtype("object"), [-1], "a"),
            TensorSpec(np.dtype("float32"), [-1], "b"),
            TensorSpec(np.dtype("int32"), [-1], "c"),
        ]
    )
    expected_result_schema = {
        "a": np.array(strings),
        "b": np.array(floats, dtype="float32"),
        "c": np.array(int_rows, dtype="int32"),
    }

    # instances are correctly aggregated to dict of input name -> tensor
    row_payload = {
        "instances": [
            {"a": "s1", "b": 1.1, "c": [1, 2, 3]},
            {"a": "s2", "b": 2.2, "c": [4, 5, 6]},
            {"a": "s3", "b": 3.3, "c": [7, 8, 9]},
        ]
    }
    # input provided as a dict
    columnar_payload = {"inputs": {"a": strings, "b": floats, "c": int_rows}}

    for payload in (row_payload, columnar_payload):
        # Without Schema
        assert_result(parse_tf_serving_input(payload), expected_result_no_schema)
        # With Schema
        assert_result(parse_tf_serving_input(payload, schema), expected_result_schema)
def test_spark_udf_with_datetime_columns(spark):
    """Apply a pyfunc Spark UDF to timestamp and date columns.

    The logged model returns the column names of the frame it receives, so
    the first result row shows which columns reached the model.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [model_input.columns] * len(model_input)

    input_schema = Schema([ColSpec("datetime", "timestamp"), ColSpec("datetime", "date")])
    output_schema = Schema([ColSpec("integer")])
    signature = ModelSignature(inputs=input_schema, outputs=output_schema)

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=ArrayType(StringType()))

        data = spark.range(10).selectExpr(
            "current_timestamp() as timestamp", "current_date() as date"
        )
        scored = data.withColumn("res", udf("timestamp", "date")).select("res").toPandas()
        assert scored["res"][0] == ["timestamp", "date"]
def test_parse_tf_serving_single_array():
    """Check ``parse_tf_serving_input`` for payloads carrying one bare array.

    The same nested list is sent under "instances" and under "inputs" and
    parsed without a schema, with an unnamed tensor schema, and with a named
    tensor schema (where a one-entry dict is expected back).
    """

    def assert_result(result, expected_result):
        assert (result == expected_result).all()

    # values for each column are properly converted to a tensor
    arr = [
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        [[3, 2, 1], [6, 5, 4], [9, 8, 7]],
    ]
    payloads = ({"instances": arr}, {"inputs": arr})
    as_int64 = np.array(arr, dtype="int64")
    as_float32 = np.array(arr, dtype="float32")

    # Without schema
    for payload in payloads:
        parsed = parse_tf_serving_input(payload)
        assert parsed.shape == (2, 3, 3)
        assert_result(parsed, as_int64)

    # Unnamed schema
    schema = Schema([TensorSpec(np.dtype("float32"), [-1])])
    for payload in payloads:
        assert_result(parse_tf_serving_input(payload, schema), as_float32)

    # named schema
    schema = Schema([TensorSpec(np.dtype("float32"), [-1], "a")])
    for payload in payloads:
        parsed = parse_tf_serving_input(payload, schema)
        assert isinstance(parsed, dict)
        assert len(parsed.keys()) == 1 and "a" in parsed
        assert_result(parsed["a"], as_float32)
def test_spark_udf_autofills_column_names_with_schema(spark):
    """Call a pyfunc Spark UDF with too few, exact, and extra positional columns.

    The model echoes the column names it receives. With a three-column
    signature, two arguments are expected to fail, three to succeed, and a
    fourth argument to be absent from the names the model sees.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [model_input.columns] * len(model_input)

    signature = ModelSignature(
        inputs=Schema([ColSpec("long", name) for name in ("a", "b", "c")]),
        outputs=Schema([ColSpec("integer")]),
    )

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=ArrayType(StringType()))

        one_row = pd.DataFrame(
            columns=["a", "b", "c", "d"],
            data={"a": [1], "b": [2], "c": [3], "d": [4]},
        )
        data = spark.createDataFrame(one_row)

        # Too few columns for the signature.
        with pytest.raises(Py4JJavaError):
            data.withColumn("res1", udf("a", "b")).select("res1").toPandas()

        # Exactly the signature's columns.
        exact = data.withColumn("res2", udf("a", "b", "c")).select("res2").toPandas()
        assert exact["res2"][0] == ["a", "b", "c"]

        # One extra column.
        extra = data.withColumn("res4", udf("a", "b", "c", "d")).select("res4").toPandas()
        assert extra["res4"][0] == ["a", "b", "c"]
def test_schema_enforcement_no_col_names():
    """Check enforcement of a column schema whose columns carry no names.

    Lists, DataFrames (with or without column names), ndarrays and dicts with
    three double values are expected to be accepted; a wrong column count,
    integer data, or a non-tabular input are expected to raise.
    """

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            return pdf

    m = Model()
    input_schema = Schema([ColSpec("double"), ColSpec("double"), ColSpec("double")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    test_data = [[1.0, 2.0, 3.0]]

    # Can call with just a list
    assert pyfunc_model.predict(test_data).equals(pd.DataFrame(test_data))

    # Or can call with a DataFrame without column names
    assert pyfunc_model.predict(pd.DataFrame(test_data)).equals(pd.DataFrame(test_data))

    # Or can call with a np.ndarray
    assert pyfunc_model.predict(pd.DataFrame(test_data).values).equals(pd.DataFrame(test_data))

    # Or with column names!
    pdf = pd.DataFrame(data=test_data, columns=["a", "b", "c"])
    assert pyfunc_model.predict(pdf).equals(pdf)

    # NOTE: the messages below are looked up on the raised exception
    # (``ex.value``), not on the pytest ExceptionInfo wrapper.

    # Must provide the right number of arguments
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1.0, 2.0]])
    assert "the provided input only has 2 columns." in str(ex.value)

    # Must provide the right types
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1, 2, 3]])
    assert "Can not safely convert int64 to float64" in str(ex.value)

    # Can only provide data type that can be converted to dataframe...
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict({1, 2, 3})
    assert "Expected input to be DataFrame or list. Found: set" in str(ex.value)

    # Dictionaries of str -> list/nparray work
    d = {"a": [1.0], "b": [2.0], "c": [3.0]}
    assert pyfunc_model.predict(d).equals(pd.DataFrame(d))
def test_schema_enforcement():
    """Check column-schema enforcement for a seven-column named schema.

    Covers a missing column, column reordering with an extra column, and a
    series of dtype changes on single columns. After each dtype change the
    column is restored to its schema dtype so that every check isolates
    exactly one column.
    """

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            return pdf

    m = Model()
    input_schema = Schema(
        [
            ColSpec("integer", "a"),
            ColSpec("long", "b"),
            ColSpec("float", "c"),
            ColSpec("double", "d"),
            ColSpec("boolean", "e"),
            ColSpec("string", "g"),
            ColSpec("binary", "f"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1])]],
        columns=["b", "d", "a", "c", "e", "g", "f"],
        # builtin ``object`` replaces the removed ``np.object`` alias
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)

    def assert_rejected():
        # ``pdf`` is only mutated in place, so the closure always sees the
        # current frame. The message is read from the exception itself.
        with pytest.raises(MlflowException) as ex:
            pyfunc_model.predict(pdf)
        assert "Incompatible input types" in str(ex.value)

    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f"]])
    assert "Model input is missing columns" in str(ex.value)

    # test that columns are reordered, extra column is ignored
    pdf["x"] = 1
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    expected_types = dict(zip(input_schema.column_names(), input_schema.pandas_types()))
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    assert_rejected()
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    assert_rejected()
    pdf["c"] = pdf["c"].astype(np.float32)

    # 4. float -> double works
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # Restore the schema dtype (float64) so the next check isolates column "c".
    pdf["d"] = pdf["d"].astype(np.float64)

    # 5. integer data for float / double columns raises
    pdf["c"] = pdf["c"].astype(np.int32)
    assert_rejected()
    pdf["c"] = pdf["c"].astype(np.float32)

    pdf["d"] = pdf["d"].astype(np.int64)
    assert_rejected()
    pdf["d"] = pdf["d"].astype(np.float64)

    # 6. float data for integer / long columns raises
    pdf["a"] = pdf["a"].astype(np.float32)
    assert_rejected()
    pdf["a"] = pdf["a"].astype(np.int32)

    pdf["b"] = pdf["b"].astype(np.float64)
    assert_rejected()
    pdf["b"] = pdf["b"].astype(np.int64)

    # 7. objects work
    for column in ("b", "d", "e", "f", "g"):
        pdf[column] = pdf[column].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
def test_tensor_multi_named_schema_enforcement():
    """Check enforcement of a schema with three named tensor specs.

    Covers a missing input, an extra input, variable-length axes, dtype and
    shape mismatches, and input containers other than a dict of ndarrays
    (list, dict of lists, DataFrame).
    """
    m = Model()
    input_schema = Schema(
        [
            TensorSpec(np.dtype(np.uint64), (-1, 5), "a"),
            TensorSpec(np.dtype(np.short), (-1, 2), "b"),
            TensorSpec(np.dtype(np.float32), (2, -1, 2), "c"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    expected_types = dict(zip(input_schema.input_names(), input_schema.input_types()))

    inp = {
        "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1], [2, 2]], dtype=np.short),
        "c": np.array([[[0, 0], [1, 1]], [[2, 2], [3, 3]]], dtype=np.float32),
    }

    # test that missing column raises: pass the dict *without* "b"
    # (``inp1.pop("b")`` returns the removed array, not the remaining dict).
    inp1 = dict(inp)
    inp1.pop("b")
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp1)
    assert "Model is missing inputs" in str(ex.value)

    # test that extra column is removed
    inp2 = dict(inp)
    inp2["x"] = 1
    res = pyfunc_model.predict(inp2)
    # Compare contents; ``==`` on dicts of multi-element arrays only succeeds
    # when the values are the very same objects.
    assert _compare_exact_tensor_dict_input(res, inp)
    assert {k: v.dtype for k, v in res.items()} == expected_types

    # test that variable axes are supported
    inp3 = {
        "a": np.array(
            [[0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [2, 2, 2, 2, 2]], dtype=np.uint64
        ),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]], [[2, 2]]], dtype=np.float32),
    }
    res = pyfunc_model.predict(inp3)
    assert _compare_exact_tensor_dict_input(res, inp3)
    assert {k: v.dtype for k, v in res.items()} == expected_types

    # test that type casting is not supported
    inp4 = dict(inp)
    inp4["a"] = inp4["a"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp4)
    assert "dtype of input int32 does not match expected dtype uint64" in str(ex.value)

    # test wrong shape; "a" uses the schema dtype explicitly (``np.uint`` is
    # platform-sized) so that only its shape is wrong.
    inp5 = {
        "a": np.array([[0, 0, 0, 0]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]]], dtype=np.float32),
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp5)
    assert "Shape of input (1, 4) does not match expected shape (-1, 5)" in str(ex.value)

    # test non-dictionary input
    inp6 = [
        np.array([[0, 0, 0, 0, 0]], dtype=np.uint64),
        np.array([[0, 0], [1, 1]], dtype=np.short),
        np.array([[[0, 0]]], dtype=np.float32),
    ]
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp6)
    assert "Model is missing inputs ['a', 'b', 'c']." in str(ex.value)

    # test empty ndarray does not work
    inp7 = dict(inp)
    inp7["a"] = np.array([])
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp7)
    assert "Shape of input (0,) does not match expected shape" in str(ex.value)

    # test dictionary of str -> list does not work
    inp8 = {k: list(v) for k, v in inp.items()}
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp8)
    message = str(ex.value)
    assert "This model contains a tensor-based model signature with input names" in message
    assert (
        "suggests a dictionary input mapping input name to a numpy array, but a dict"
        " with value type <class 'list'> was found"
    ) in message

    # test dataframe input fails at shape enforcement
    pdf = pd.DataFrame(data=[[1, 2, 3]], columns=["a", "b", "c"])
    pdf["a"] = pdf["a"].astype(np.uint64)
    pdf["b"] = pdf["b"].astype(np.short)
    pdf["c"] = pdf["c"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Shape of input (1,) does not match expected shape (-1, 5)" in str(ex.value)
def test_column_schema_enforcement():
    """Check column-schema enforcement for an eight-column named schema.

    Covers a missing column, column reordering with an extra column, a
    series of single-column dtype changes (each restored before the next
    check), object-typed columns, datetime precision, and non-DataFrame
    inputs (ndarray and dicts).
    """
    m = Model()
    input_schema = Schema(
        [
            ColSpec("integer", "a"),
            ColSpec("long", "b"),
            ColSpec("float", "c"),
            ColSpec("double", "d"),
            ColSpec("boolean", "e"),
            ColSpec("string", "g"),
            ColSpec("binary", "f"),
            ColSpec("datetime", "h"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1]), "2021-01-01 00:00:00.1234567"]],
        columns=["b", "d", "a", "c", "e", "g", "f", "h"],
        # builtin ``object`` replaces the removed ``np.object`` alias
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)
    # Name the unit explicitly; a unit-less datetime64 target is not accepted
    # by every pandas version.
    pdf["h"] = pdf["h"].astype("datetime64[ns]")

    def assert_rejected():
        # ``pdf`` is only mutated in place, so the closure always sees the
        # current frame. The message is read from the exception itself.
        with pytest.raises(MlflowException) as ex:
            pyfunc_model.predict(pdf)
        assert "Incompatible input types" in str(ex.value)

    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f", "h"]])
    assert "Model is missing inputs" in str(ex.value)

    # test that columns are reordered, extra column is ignored
    pdf["x"] = 1
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    expected_types = dict(zip(input_schema.input_names(), input_schema.pandas_types()))
    # MLflow datetime type in input_schema does not encode precision, so add it for assertions
    expected_types["h"] = np.dtype("datetime64[ns]")
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    assert_rejected()
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. unsigned int -> long works
    pdf["b"] = pdf["b"].astype(np.uint32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 4. unsigned int -> int raises
    pdf["a"] = pdf["a"].astype(np.uint32)
    assert_rejected()
    pdf["a"] = pdf["a"].astype(np.int32)

    # 5. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    assert_rejected()
    pdf["c"] = pdf["c"].astype(np.float32)

    # 6. float -> double works, double -> float does not
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["d"] = pdf["d"].astype(np.float64)

    pdf["c"] = pdf["c"].astype(np.float64)
    assert_rejected()
    pdf["c"] = pdf["c"].astype(np.float32)

    # 7. int -> float raises
    pdf["c"] = pdf["c"].astype(np.int32)
    assert_rejected()
    pdf["c"] = pdf["c"].astype(np.float32)

    # 8. int -> double works (capture the fresh result before asserting on it)
    pdf["d"] = pdf["d"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types

    # 9. long -> double raises
    pdf["d"] = pdf["d"].astype(np.int64)
    assert_rejected()
    pdf["d"] = pdf["d"].astype(np.float64)

    # 10. any float -> any int raises
    pdf["a"] = pdf["a"].astype(np.float32)
    assert_rejected()
    pdf["a"] = pdf["a"].astype(np.float64)
    assert_rejected()
    pdf["a"] = pdf["a"].astype(np.int32)

    pdf["b"] = pdf["b"].astype(np.float64)
    assert_rejected()
    pdf["b"] = pdf["b"].astype(np.int64)

    # 11. objects work
    for column in ("b", "d", "e", "f", "g"):
        pdf[column] = pdf[column].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types

    # 12. datetime64[D] (date only) -> datetime64[x] works
    pdf["h"] = pdf["h"].astype("datetime64[D]")
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["h"] = pdf["h"].astype("datetime64[s]")

    # 13. np.ndarrays can be converted to dataframe but have no columns
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf.values)
    assert "Model is missing inputs" in str(ex.value)

    # 14. dictionaries of str -> list/nparray work
    arr = np.array([1, 2, 3])
    d = {
        "a": arr.astype("int32"),
        "b": arr.astype("int64"),
        "c": arr.astype("float32"),
        "d": arr.astype("float64"),
        "e": [True, False, True],
        "g": ["a", "b", "c"],
        "f": [bytes(0), bytes(1), bytes(1)],
        "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64),
    }
    res = pyfunc_model.predict(d)
    assert res.dtypes.to_dict() == expected_types

    # 15. dictionaries of str -> list[list] fail
    d = {
        "a": [arr.astype("int32")],
        "b": [arr.astype("int64")],
        "c": [arr.astype("float32")],
        "d": [arr.astype("float64")],
        "e": [[True, False, True]],
        "g": [["a", "b", "c"]],
        "f": [[bytes(0), bytes(1), bytes(1)]],
        "h": [np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64)],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert "Incompatible input types" in str(ex.value)

    # 16. conversion to dataframe fails
    d = {
        "a": [1],
        "b": [1, 2],
        "c": [1, 2, 3],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert (
        "This model contains a column-based signature, which suggests a DataFrame input."
        in str(ex.value)
    )
# Prepare dataset try: repo_url = "https://raw.githubusercontent.com/prinz-nussknacker" csv_url = f"{repo_url}/banksim1/master/bs140513_032310.csv" data = pd.read_csv(csv_url, sep=",", quotechar="'", header=0) except Exception as e: logger.exception("Could not read CSV file: {}".format(e)) exit(1) data.dropna() data = data.drop(["step", "customer", "zipcodeOri", "merchant", "zipMerchant"], axis="columns") input_schema = Schema([ ColSpec("string", "age"), ColSpec("string", "gender"), ColSpec("string", "category"), ColSpec("double", "amount") ]) output_schema = Schema([ColSpec("integer")]) signature = ModelSignature(inputs=input_schema, outputs=output_schema) # Prepare train and test sets data_x = data.drop(["fraud"], axis="columns") data_y = data[["fraud"]] train_x, test_x, train_y, test_y = train_test_split(data_x, data_y) with mlflow.start_run(): # Define pipeline numeric_features = ['amount'] numeric_transformer = Pipeline( steps=[('imputer',
def test_spark_udf_autofills_no_arguments(spark):
    """Call a pyfunc Spark UDF with no arguments under different signatures.

    With a named signature, the signature's columns are expected to reach
    the model. The remaining cases (explicit partial arguments, a frame
    lacking column "a", a signature without names, no signature at all) are
    each expected to raise.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            return [model_input.columns] * len(model_input)

    def load_udf(run):
        # Build the UDF for the model logged under ``run``.
        model_uri = "runs:/{}/model".format(run.info.run_id)
        return mlflow.pyfunc.spark_udf(spark, model_uri, result_type=ArrayType(StringType()))

    def one_row_frame(first_column):
        # One-row Spark frame with columns <first_column>, b, c, d.
        names = [first_column, "b", "c", "d"]
        values = {first_column: [1], "b": [2], "c": [3], "d": [4]}
        return spark.createDataFrame(pd.DataFrame(columns=names, data=values))

    signature = ModelSignature(
        inputs=Schema([ColSpec("long", "a"), ColSpec("long", "b"), ColSpec("long", "c")]),
        outputs=Schema([ColSpec("integer")]),
    )
    good_data = one_row_frame("a")

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        udf = load_udf(run)

        scored = good_data.withColumn("res", udf()).select("res").toPandas()
        assert scored["res"][0] == ["a", "b", "c"]

        with pytest.raises(
            pyspark.sql.utils.PythonException,
            match=r"Model input is missing columns. Expected 3 input columns",
        ):
            good_data.withColumn("res", udf("b", "c")).select("res").toPandas()

        # this dataframe won't work because it's missing column a
        bad_data = one_row_frame("x")
        with pytest.raises(AnalysisException, match=r"cannot resolve 'a' given input columns"):
            bad_data.withColumn("res", udf())

    nameless_signature = ModelSignature(
        inputs=Schema([ColSpec("long"), ColSpec("long"), ColSpec("long")]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=nameless_signature)
        udf = load_udf(run)
        with pytest.raises(
            MlflowException,
            match=r"Cannot apply udf because no column names specified",
        ):
            good_data.withColumn("res", udf())

    with mlflow.start_run() as run:
        # model without signature
        mlflow.pyfunc.log_model("model", python_model=TestModel())
        udf = load_udf(run)
        with pytest.raises(pyspark.sql.utils.PythonException):
            good_data.withColumn("res", udf()).select("res").toPandas()
def test_dataframe_from_json():
    """Round-trip DataFrames through ``_dataframe_from_json`` with various schemas.

    Each schema is exercised with both the "split" and the "records" orient:
    a column schema (expected to reproduce the source frame, including the
    binary column that was base64-encoded for JSON), a named tensor schema
    (expected to leave the base64 text as is), and a single unnamed tensor
    spec.
    """
    orients = ("split", "records")
    column_order = [
        "boolean",
        "string",
        "float",
        "double",
        "integer",
        "long",
        "binary",
        "date_string",
    ]
    source = pd.DataFrame(
        {
            "boolean": [True, False, True],
            "string": ["a", "b", "c"],
            "float": np.array([1.2, 2.3, 3.4], dtype=np.float32),
            "double": np.array([1.2, 2.3, 3.4], dtype=np.float64),
            "integer": np.array([3, 4, 5], dtype=np.int32),
            "long": np.array([3, 4, 5], dtype=np.int64),
            "binary": [bytes([1, 2, 3]), bytes([4, 5]), bytes([6])],
            "date_string": ["2018-02-03", "1996-03-02", "2021-03-05"],
        },
        columns=column_order,
    )

    # Raw bytes are base64-encoded in a copy before serializing to JSON.
    jsonable_df = pd.DataFrame(source, copy=True)
    jsonable_df["binary"] = jsonable_df["binary"].map(base64.b64encode)

    schema = Schema(
        [
            ColSpec("boolean", "boolean"),
            ColSpec("string", "string"),
            ColSpec("float", "float"),
            ColSpec("double", "double"),
            ColSpec("integer", "integer"),
            ColSpec("long", "long"),
            ColSpec("binary", "binary"),
            ColSpec("string", "date_string"),
        ]
    )
    for orient in orients:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=schema
        )
        assert parsed.equals(source)

    # try parsing with tensor schema
    tensor_schema = Schema(
        [
            TensorSpec(np.dtype("bool"), [-1], "boolean"),
            TensorSpec(np.dtype("str"), [-1], "string"),
            TensorSpec(np.dtype("float32"), [-1], "float"),
            TensorSpec(np.dtype("float64"), [-1], "double"),
            TensorSpec(np.dtype("int32"), [-1], "integer"),
            TensorSpec(np.dtype("int64"), [-1], "long"),
            TensorSpec(np.dtype(bytes), [-1], "binary"),
        ]
    )
    for orient in orients:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        # NB: tensor schema does not automatically decode base64 encoded bytes.
        assert parsed.equals(jsonable_df)

    # Test parse with TensorSchema with a single tensor
    tensor_schema = Schema([TensorSpec(np.dtype("float32"), [-1, 3])])
    source = pd.DataFrame(
        {
            "a": np.array([1, 2, 3], dtype=np.float32),
            "b": np.array([4.1, 5.2, 6.3], dtype=np.float32),
            "c": np.array([7, 8, 9], dtype=np.float32),
        },
        columns=["a", "b", "c"],
    )
    for orient in orients:
        reparsed = _dataframe_from_json(
            source.to_json(orient=orient), pandas_orient=orient, schema=tensor_schema
        )
        assert source.equals(reparsed)