def parse_csv_input(csv_input, schema: Schema = None):
    """
    Read CSV-formatted input into a Pandas DataFrame via ``pd.read_csv``.

    :param csv_input: A CSV-formatted string representation of a Pandas DataFrame, or a stream
                      containing such a string representation.
    :param schema: Optional schema specification to be used during parsing. When provided, its
                   input names are paired with its pandas types and handed to ``pd.read_csv``
                   as the ``dtype`` mapping.
    :return: The DataFrame produced by ``pd.read_csv``. Any exception raised while building the
             dtype mapping or reading the CSV is reported through ``_handle_serving_error``
             with the ``BAD_REQUEST`` error code.
    """
    try:
        read_options = {}
        if schema is not None:
            # Pin each declared input to the pandas dtype the schema specifies.
            read_options["dtype"] = dict(zip(schema.input_names(), schema.pandas_types()))
        return pd.read_csv(csv_input, **read_options)
    except Exception:
        _handle_serving_error(
            error_message=(
                "Failed to parse input as a Pandas DataFrame. Ensure that the input is"
                " a valid CSV-formatted Pandas DataFrame produced using the"
                " `pandas.DataFrame.to_csv()` method."
            ),
            error_code=BAD_REQUEST,
        )
def test_schema_enforcement():
    """Exercise input-schema enforcement performed by ``PyFuncModel.predict``.

    The wrapped model simply returns its input, so every assertion on ``res`` inspects the
    frame as it reached the model. The test covers missing columns, extra columns, column
    reordering, and which dtype conversions are accepted or rejected.
    """

    class TestModel:
        @staticmethod
        def predict(pdf):
            # Identity model: hand back whatever it is given.
            return pdf

    m = Model()
    input_schema = Schema(
        [
            ColSpec("integer", "a"),
            ColSpec("long", "b"),
            ColSpec("float", "c"),
            ColSpec("double", "d"),
            ColSpec("boolean", "e"),
            ColSpec("string", "g"),
            ColSpec("binary", "f"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())

    def assert_predict_raises(data, message="Incompatible input types"):
        # Check the message on the raised exception itself (``ex.value``) rather than on
        # the pytest ExceptionInfo wrapper, whose string form is not the bare message.
        with pytest.raises(MlflowException) as ex:
            pyfunc_model.predict(data)
        assert message in str(ex.value)

    # ``object`` replaces the ``np.object`` alias, which newer NumPy releases no longer ship.
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1])]],
        columns=["b", "d", "a", "c", "e", "g", "f"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)

    # test that missing column raises (column "c" is left out)
    assert_predict_raises(
        pdf[["b", "d", "a", "e", "g", "f"]], message="Model input is missing columns"
    )

    # test that columns are reordered, extra column is ignored
    pdf["x"] = 1
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    expected_types = dict(zip(input_schema.column_names(), input_schema.pandas_types()))
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    assert_predict_raises(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    assert_predict_raises(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 4. float -> double works
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # Restore the schema dtype so the next section fails only because of the column it
    # deliberately breaks.
    pdf["d"] = pdf["d"].astype(np.float64)

    # 5. integer data supplied for float / double columns raises
    pdf["c"] = pdf["c"].astype(np.int32)
    assert_predict_raises(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    pdf["d"] = pdf["d"].astype(np.int64)
    assert_predict_raises(pdf)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 6. floating point data supplied for integer / long columns raises
    pdf["a"] = pdf["a"].astype(np.float32)
    assert_predict_raises(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)

    pdf["b"] = pdf["b"].astype(np.float64)
    assert_predict_raises(pdf)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 7. objects work
    for column in ("b", "d", "e", "f", "g"):
        pdf[column] = pdf[column].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
def test_column_schema_enforcement():
    """Exercise column-based schema enforcement performed by ``PyFuncModel.predict``.

    Covers missing and extra columns, column reordering, accepted and rejected dtype
    conversions (signed/unsigned integers, floats, objects, datetimes), and non-DataFrame
    inputs (``np.ndarray`` and dictionaries).

    NOTE(review): ``TestModel`` is not defined in this function; it is presumably a
    module-level identity model - confirm against the rest of the file.
    """
    m = Model()
    input_schema = Schema(
        [
            ColSpec("integer", "a"),
            ColSpec("long", "b"),
            ColSpec("float", "c"),
            ColSpec("double", "d"),
            ColSpec("boolean", "e"),
            ColSpec("string", "g"),
            ColSpec("binary", "f"),
            ColSpec("datetime", "h"),
        ]
    )
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())

    def assert_predict_raises(data, message="Incompatible input types"):
        # Check the message on the raised exception itself (``ex.value``) rather than on
        # the pytest ExceptionInfo wrapper, whose string form is not the bare message.
        with pytest.raises(MlflowException) as ex:
            pyfunc_model.predict(data)
        assert message in str(ex.value)

    # ``object`` replaces the ``np.object`` alias, which newer NumPy releases no longer ship.
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1]), "2021-01-01 00:00:00.1234567"]],
        columns=["b", "d", "a", "c", "e", "g", "f", "h"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)
    # Spell out the unit: newer pandas releases reject the unit-less ``np.datetime64``.
    pdf["h"] = pdf["h"].astype("datetime64[ns]")

    # test that missing column raises (column "c" is left out)
    assert_predict_raises(
        pdf[["b", "d", "a", "e", "g", "f", "h"]], message="Model is missing inputs"
    )

    # test that columns are reordered, extra column is ignored
    pdf["x"] = 1
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())

    expected_types = dict(zip(input_schema.input_names(), input_schema.pandas_types()))
    # MLflow datetime type in input_schema does not encode precision, so add it for assertions
    expected_types["h"] = np.dtype("datetime64[ns]")
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    assert_predict_raises(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. unsigned int -> long works
    pdf["b"] = pdf["b"].astype(np.uint32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 4. unsigned int -> int raises
    pdf["a"] = pdf["a"].astype(np.uint32)
    assert_predict_raises(pdf)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 5. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    assert_predict_raises(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 6. float -> double works, double -> float does not
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["d"] = pdf["d"].astype(np.float64)

    pdf["c"] = pdf["c"].astype(np.float64)
    assert_predict_raises(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 7. int -> float raises
    pdf["c"] = pdf["c"].astype(np.int32)
    assert_predict_raises(pdf)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 8. int -> double works
    pdf["d"] = pdf["d"].astype(np.int32)
    # Capture the fresh result so the assertions below check this prediction rather than
    # the one left over from an earlier section.
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types

    # 9. long -> double raises
    pdf["d"] = pdf["d"].astype(np.int64)
    assert_predict_raises(pdf)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 10. any float -> any int raises
    for column, schema_dtype in (("a", np.int32), ("b", np.int64)):
        for float_dtype in (np.float32, np.float64):
            pdf[column] = pdf[column].astype(float_dtype)
            assert_predict_raises(pdf)
            # Put the column back so each combination is tested in isolation.
            pdf[column] = pdf[column].astype(schema_dtype)

    # 11. objects work
    for column in ("b", "d", "e", "f", "g"):
        pdf[column] = pdf[column].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types

    # 12. datetime64[D] (date only) -> datetime64[x] works
    pdf["h"] = pdf["h"].astype("datetime64[D]")
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["h"] = pdf["h"].astype("datetime64[s]")

    # 13. np.ndarrays can be converted to dataframe but have no columns
    assert_predict_raises(pdf.values, message="Model is missing inputs")

    # 14. dictionaries of str -> list/nparray work
    arr = np.array([1, 2, 3])
    d = {
        "a": arr.astype("int32"),
        "b": arr.astype("int64"),
        "c": arr.astype("float32"),
        "d": arr.astype("float64"),
        "e": [True, False, True],
        "g": ["a", "b", "c"],
        "f": [bytes(0), bytes(1), bytes(1)],
        "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64),
    }
    res = pyfunc_model.predict(d)
    assert res.dtypes.to_dict() == expected_types

    # 15. dictionaries of str -> list[list] fail
    d = {
        "a": [arr.astype("int32")],
        "b": [arr.astype("int64")],
        "c": [arr.astype("float32")],
        "d": [arr.astype("float64")],
        "e": [[True, False, True]],
        "g": [["a", "b", "c"]],
        "f": [[bytes(0), bytes(1), bytes(1)]],
        "h": [np.array(["2020-01-01", "2020-02-02", "2020-03-03"], dtype=np.datetime64)],
    }
    assert_predict_raises(d)

    # 16. conversion to dataframe fails (columns have different lengths)
    d = {
        "a": [1],
        "b": [1, 2],
        "c": [1, 2, 3],
    }
    assert_predict_raises(
        d,
        message=(
            "This model contains a column-based signature, which suggests a DataFrame input."
        ),
    )