Esempio n. 1
0
def test_serving_model_with_schema(pandas_df_with_all_types):
    """Serve a pyfunc model logged with a signature and score it twice.

    The model returns ``[column, dtype]`` pairs for the frame it receives.
    It is scored once with a split-oriented payload built from a shuffled
    copy of the fixture frame and once with a records-oriented payload built
    from the fixture frame itself; both responses must equal the dtypes of
    the fixture frame.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Echo the column names together with the dtype names.
            return [[name, str(dtype)] for name, dtype in model_input.dtypes.items()]

    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    shuffled_df = _shuffle_pdf(pandas_df_with_all_types)
    with TempDir(chdr=True):
        with mlflow.start_run() as run:
            mlflow.pyfunc.log_model(
                "model",
                python_model=TestModel(),
                signature=ModelSignature(schema),
            )
        model_uri = "runs:/{}/model".format(run.info.run_id)

        # Split-oriented JSON built from the shuffled frame.
        split_payload = json.dumps(shuffled_df.to_dict(orient="split"), cls=NumpyEncoder)
        response = pyfunc_serve_and_score_model(
            model_uri=model_uri,
            data=split_payload,
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
            extra_args=["--no-conda"],
        )
        split_result = json.loads(response.content)
        assert split_result == [
            [name, str(dtype)] for name, dtype in pandas_df_with_all_types.dtypes.items()
        ]

        # Records-oriented JSON built from the original (unshuffled) frame.
        records_payload = json.dumps(
            pandas_df_with_all_types.to_dict(orient="records"), cls=NumpyEncoder
        )
        response = pyfunc_serve_and_score_model(
            model_uri=model_uri,
            data=records_payload,
            content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_RECORDS_ORIENTED,
            extra_args=["--no-conda"],
        )
        records_result = json.loads(response.content)
        assert records_result == [
            [name, str(dtype)] for name, dtype in pandas_df_with_all_types.dtypes.items()
        ]
def test_schema_enforcement_named_tensor_schema_1d():
    """Check enforcement of a schema made of two named 1-D tensor specs.

    A DataFrame and a dict of arrays are each passed to ``predict``. In both
    cases the result must be a dict of arrays equal to ``d_inp`` whose dtypes
    match the input types declared by the schema.
    """
    m = Model()
    input_schema = Schema([
        TensorSpec(np.dtype(np.uint64), (-1, ), "a"),
        TensorSpec(np.dtype(np.float32), (-1, ), "b"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    # TestModel is defined outside this block (presumably a pass-through model).
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    pdf = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"])
    pdf["a"] = pdf["a"].astype(np.uint64)
    # Cast column "b" from its own data (the values are identical to "a" here).
    pdf["b"] = pdf["b"].astype(np.float32)
    d_inp = {
        "a": np.array(pdf["a"], dtype=np.uint64),
        "b": np.array(pdf["b"], dtype=np.float32),
    }
    expected_types = dict(
        zip(input_schema.input_names(), input_schema.input_types()))

    # test dataframe input works for 1d tensor specs and input is converted to dict
    res = pyfunc_model.predict(pdf)
    assert _compare_exact_tensor_dict_input(res, d_inp)
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test that dictionary works too
    res = pyfunc_model.predict(d_inp)
    # Plain ``==`` on dicts of ndarrays only succeeds through the identity
    # shortcut and raises ValueError otherwise, so compare element-wise.
    assert _compare_exact_tensor_dict_input(res, d_inp)
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types
def test_missing_value_hint_is_displayed_when_it_should():
    """Check when the missing-values hint is part of the type-mismatch error.

    The schema has one integer column "a". Three frames are rejected with
    "Incompatible input types"; only the first one (integers plus ``None``)
    is expected to carry the hint about missing values.
    """
    m = Model()
    input_schema = Schema([ColSpec("integer", "a")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    hint = "Hint: the type mismatch is likely caused by missing values."

    # Integer values with a missing entry: hint expected.
    pdf = pd.DataFrame(
        data=[[1], [None]],
        columns=["a"],
    )
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value.message)
    assert hint in str(ex.value.message)

    # Non-integral float values with a missing entry: no hint.
    pdf = pd.DataFrame(
        data=[[1.5], [None]],
        columns=["a"],
    )
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    # Inspect the exception message itself, not the ExceptionInfo wrapper.
    assert "Incompatible input types" in str(ex.value.message)
    assert hint not in str(ex.value.message)

    # Float column without missing values: no hint.
    pdf = pd.DataFrame(data=[[1], [2]], columns=["a"], dtype=np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value.message)
    assert hint not in str(ex.value.message)
def test_schema_enforcement_single_named_tensor_schema():
    """Check enforcement of a schema with a single named tensor spec.

    A dict and a bare ndarray are both accepted and yield a dict keyed by the
    input name; a plain list is rejected with a "missing inputs" error.
    """
    m = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.uint64), (-1, 2), "a")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    inp = {
        "a": np.array([[0, 0], [1, 1]], dtype=np.uint64),
    }
    expected_types = dict(
        zip(input_schema.input_names(), input_schema.input_types()))

    # sanity test that dictionary with correct input works
    res = pyfunc_model.predict(inp)
    # ``==`` on dicts of ndarrays only works through the identity shortcut;
    # compare keys and array contents explicitly instead.
    assert res.keys() == inp.keys()
    assert all(np.array_equal(res[k], inp[k]) for k in inp)
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test single np.ndarray input works and is converted to dictionary
    res = pyfunc_model.predict(inp["a"])
    assert res.keys() == inp.keys()
    assert all(np.array_equal(res[k], inp[k]) for k in inp)
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test list does not work
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[0, 0], [1, 1]])
    # str(ex) is the ExceptionInfo repr; check the exception text itself.
    assert "Model is missing inputs ['a']" in str(ex.value)
def test_parse_with_schema(pandas_df_with_all_types):
    """Check ``parse_json_input`` when a schema is supplied.

    First a shuffled frame is round-tripped through split- and
    records-oriented JSON and the inferred signature must equal the schema.
    Then a deliberately ill-typed payload documents how values are coerced.
    """
    schema = Schema([ColSpec(c, c) for c in pandas_df_with_all_types.columns])
    df = _shuffle_pdf(pandas_df_with_all_types)
    json_str = json.dumps(df.to_dict(orient="split"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="split", schema=schema)
    json_str = json.dumps(df.to_dict(orient="records"), cls=NumpyEncoder)
    df = pyfunc_scoring_server.parse_json_input(json_str, orient="records", schema=schema)
    assert schema == infer_signature(df[schema.column_names()]).inputs

    # The current behavior with pandas json parse with type hints is weird. In some cases, the
    # types are forced ignoring overflow and loss of precision:

    bad_df = """{
      "columns":["bad_integer", "bad_float", "bad_string", "bad_boolean"],
      "data":[
        [9007199254740991.0, 1.1,                1, 1.5],
        [9007199254740992.0, 9007199254740992.0, 2, 0],
        [9007199254740994.0, 3.3,                3, "some arbitrary string"]
      ]
    }"""
    schema = Schema(
        [
            ColSpec("integer", "bad_integer"),
            ColSpec("float", "bad_float"),
            ColSpec("float", "good_float"),
            ColSpec("string", "bad_string"),
            ColSpec("boolean", "bad_boolean"),
        ]
    )
    df = pyfunc_scoring_server.parse_json_input(bad_df, orient="split", schema=schema)
    # Unfortunately, the current behavior of pandas parse is to force numbers to int32 even if
    # they don't fit:
    assert df["bad_integer"].dtype == np.int32
    assert all(df["bad_integer"] == [-2147483648, -2147483648, -2147483648])

    # The same goes for floats:
    assert df["bad_float"].dtype == np.float32
    assert all(df["bad_float"] == np.array([1.1, 9007199254740992, 3.3], dtype=np.float32))
    # However bad string is recognized as int64:
    # (``np.object`` was removed from NumPy; the builtin ``object`` is the same dtype.)
    assert all(df["bad_string"] == np.array([1, 2, 3], dtype=object))

    # Boolean is forced - zero and empty string is false, everything else is true:
    # (``np.bool`` was removed from NumPy; the builtin ``bool`` is the same dtype.)
    assert df["bad_boolean"].dtype == bool
    assert all(df["bad_boolean"] == [True, False, True])
def test_tensor_schema_enforcement_no_col_names():
    """Check enforcement of one unnamed float32 tensor spec of shape (-1, 3).

    ndarrays and DataFrames of matching shape and dtype are accepted; lists,
    dicts, wrong shapes and wrong dtypes are rejected with specific messages.
    """
    model_meta = Model()
    input_schema = Schema([TensorSpec(np.dtype(np.float32), (-1, 3))])
    model_meta.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=model_meta, model_impl=TestModel())
    rows = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
    test_data = np.array(rows, dtype=np.float32)
    no_names_msg = "This model contains a tensor-based model signature with no input names"

    # A numpy array of the correct shape is accepted
    from_array = pyfunc_model.predict(test_data)
    assert np.array_equal(from_array, test_data)

    # ... and so is a dataframe
    from_frame = pyfunc_model.predict(pd.DataFrame(test_data))
    assert np.array_equal(from_frame, test_data)

    # Neither a list nor a dict is accepted
    for rejected in (rows, {"blah": test_data}):
        with pytest.raises(MlflowException, match=no_names_msg):
            pyfunc_model.predict(rejected)

    # An ndarray of the wrong shape is rejected
    wrong_shape_msg = re.escape(
        "Shape of input (2, 2) does not match expected shape (-1, 3)")
    with pytest.raises(MlflowException, match=wrong_shape_msg):
        pyfunc_model.predict(np.array([[1.0, 2.0], [4.0, 5.0]]))

    # An ndarray of the wrong dtype is rejected
    wrong_dtype_msg = "dtype of input uint32 does not match expected dtype float32"
    with pytest.raises(MlflowException, match=wrong_dtype_msg):
        pyfunc_model.predict(test_data.astype(np.uint32))

    # More elements along the variable axis are fine
    test_data2 = np.array(
        [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]],
        dtype=np.float32,
    )
    assert np.array_equal(pyfunc_model.predict(test_data2), test_data2)

    # An ndarray with shape () is rejected
    empty_shape_msg = re.escape(
        "Shape of input () does not match expected shape (-1, 3)")
    with pytest.raises(MlflowException, match=empty_shape_msg):
        pyfunc_model.predict(np.ndarray([]))
Esempio n. 7
0
def test_parse_tf_serving_dictionary():
    """Check ``parse_tf_serving_input`` for payloads with named inputs.

    Both the row-wise ``instances`` form and the column-wise ``inputs`` form
    are parsed, with and without a schema, and compared element-wise against
    dicts of arrays.
    """

    def assert_result(result, expected_result):
        # Same key set, and every array equal element-wise.
        assert result.keys() == expected_result.keys()
        for name, values in result.items():
            assert (values == expected_result[name]).all()

    named_schema = Schema(
        [
            TensorSpec(np.dtype("object"), [-1], "a"),
            TensorSpec(np.dtype("float32"), [-1], "b"),
            TensorSpec(np.dtype("int32"), [-1], "c"),
        ]
    )
    expected_result_no_schema = {
        "a": np.array(["s1", "s2", "s3"]),
        "b": np.array([1.1, 2.2, 3.3], dtype="float64"),
        "c": np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype="int64"),
    }
    expected_result_schema = {
        "a": np.array(["s1", "s2", "s3"]),
        "b": np.array([1.1, 2.2, 3.3], dtype="float32"),
        "c": np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype="int32"),
    }

    # instances are correctly aggregated to dict of input name -> tensor
    row_wise = {
        "instances": [
            {"a": "s1", "b": 1.1, "c": [1, 2, 3]},
            {"a": "s2", "b": 2.2, "c": [4, 5, 6]},
            {"a": "s3", "b": 3.3, "c": [7, 8, 9]},
        ]
    }
    # Without schema
    assert_result(parse_tf_serving_input(row_wise), expected_result_no_schema)
    # With schema
    assert_result(parse_tf_serving_input(row_wise, named_schema), expected_result_schema)

    # input provided as a dict
    column_wise = {
        "inputs": {
            "a": ["s1", "s2", "s3"],
            "b": [1.1, 2.2, 3.3],
            "c": [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        }
    }
    # Without schema
    assert_result(parse_tf_serving_input(column_wise), expected_result_no_schema)
    # With schema
    assert_result(parse_tf_serving_input(column_wise, named_schema), expected_result_schema)
Esempio n. 8
0
def test_spark_udf_with_datetime_columns(spark):
    """Apply a Spark UDF for a model whose signature has two datetime columns.

    The model returns the column names it received once per input row; the
    first result row must be ``["timestamp", "date"]``.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # One copy of the received column index per row.
            return [model_input.columns] * len(model_input)

    input_cols = Schema([ColSpec("datetime", "timestamp"), ColSpec("datetime", "date")])
    output_cols = Schema([ColSpec("integer")])
    signature = ModelSignature(inputs=input_cols, outputs=output_cols)

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model("model", python_model=TestModel(), signature=signature)
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(spark, model_uri, result_type=ArrayType(StringType()))

        data = spark.range(10).selectExpr(
            "current_timestamp() as timestamp",
            "current_date() as date",
        )

        scored = data.withColumn("res", udf("timestamp", "date"))
        res = scored.select("res").toPandas()
        assert res["res"][0] == ["timestamp", "date"]
Esempio n. 9
0
def test_parse_tf_serving_single_array():
    """Check ``parse_tf_serving_input`` for a payload holding one bare array.

    The same nested list is sent under ``instances`` and under ``inputs`` and
    parsed without a schema, with an unnamed tensor schema, and with a named
    tensor schema (in which case a one-entry dict is expected).
    """

    def assert_result(result, expected_result):
        assert (result == expected_result).all()

    # values for each column are properly converted to a tensor
    arr = [
        [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
        [[3, 2, 1], [6, 5, 4], [9, 8, 7]],
    ]
    # "instances" payload first, then "inputs", in every section below.
    payloads = ({"instances": arr}, {"inputs": arr})

    # Without schema
    for payload in payloads:
        parsed = parse_tf_serving_input(payload)
        assert parsed.shape == (2, 3, 3)
        assert_result(parsed, np.array(arr, dtype="int64"))

    # Unnamed schema
    schema = Schema([TensorSpec(np.dtype("float32"), [-1])])
    for payload in payloads:
        parsed = parse_tf_serving_input(payload, schema)
        assert_result(parsed, np.array(arr, dtype="float32"))

    # named schema
    schema = Schema([TensorSpec(np.dtype("float32"), [-1], "a")])
    for payload in payloads:
        parsed = parse_tf_serving_input(payload, schema)
        assert isinstance(parsed, dict)
        assert len(parsed.keys()) == 1 and "a" in parsed
        assert_result(parsed["a"], np.array(arr, dtype="float32"))
Esempio n. 10
0
def test_spark_udf_autofills_column_names_with_schema(spark):
    """Call a Spark UDF with two, three and four positional columns.

    The signature names three long columns "a", "b", "c". Two arguments are
    expected to fail with ``Py4JJavaError``; with three or four arguments the
    model must see exactly the columns ``["a", "b", "c"]``.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # One copy of the received column index per row.
            return [model_input.columns] * len(model_input)

    input_cols = Schema([ColSpec("long", name) for name in ("a", "b", "c")])
    signature = ModelSignature(
        inputs=input_cols,
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model(
            "model", python_model=TestModel(), signature=signature
        )
        model_uri = "runs:/{}/model".format(run.info.run_id)
        udf = mlflow.pyfunc.spark_udf(
            spark, model_uri, result_type=ArrayType(StringType())
        )
        single_row = pd.DataFrame(
            columns=["a", "b", "c", "d"],
            data={"a": [1], "b": [2], "c": [3], "d": [4]},
        )
        data = spark.createDataFrame(single_row)

        # Too few columns for the signature.
        with pytest.raises(Py4JJavaError):
            res = data.withColumn("res1", udf("a", "b")).select("res1").toPandas()

        # Exactly the columns of the signature.
        res = data.withColumn("res2", udf("a", "b", "c")).select("res2").toPandas()
        assert res["res2"][0] == ["a", "b", "c"]

        # One extra column: the model still sees only a, b, c.
        res = data.withColumn("res4", udf("a", "b", "c", "d")).select("res4").toPandas()
        assert res["res4"][0] == ["a", "b", "c"]
Esempio n. 11
0
def test_schema_enforcement_no_col_names():
    """Check enforcement of a column schema whose three double columns are unnamed.

    Lists, DataFrames (with or without column names), ndarrays and dicts are
    accepted; a wrong column count, wrong types and a ``set`` are rejected.
    """

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            # Pass-through: return whatever the enforcement step produced.
            return pdf

    m = Model()
    input_schema = Schema(
        [ColSpec("double"),
         ColSpec("double"),
         ColSpec("double")])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    test_data = [[1.0, 2.0, 3.0]]

    # Can call with just a list
    assert pyfunc_model.predict(test_data).equals(pd.DataFrame(test_data))

    # Or can call with a DataFrame without column names
    assert pyfunc_model.predict(pd.DataFrame(test_data)).equals(
        pd.DataFrame(test_data))

    # Or can call with a np.ndarray
    assert pyfunc_model.predict(pd.DataFrame(test_data).values).equals(
        pd.DataFrame(test_data))

    # Or with column names!
    pdf = pd.DataFrame(data=test_data, columns=["a", "b", "c"])
    assert pyfunc_model.predict(pdf).equals(pdf)

    # NOTE: the assertions below inspect ``ex.value`` (the exception) rather
    # than ``ex`` (pytest's ExceptionInfo, whose str() is a repr).

    # Must provide the right number of arguments
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1.0, 2.0]])
    assert "the provided input only has 2 columns." in str(ex.value)

    # Must provide the right types
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict([[1, 2, 3]])
    assert "Can not safely convert int64 to float64" in str(ex.value)

    # Can only provide data type that can be converted to dataframe...
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict({1, 2, 3})
    assert "Expected input to be DataFrame or list. Found: set" in str(ex.value)

    # Dictionaries of str -> list/nparray work
    d = {"a": [1.0], "b": [2.0], "c": [3.0]}
    assert pyfunc_model.predict(d).equals(pd.DataFrame(d))
Esempio n. 12
0
def test_schema_enforcement():
    """Exercise column-schema enforcement on a pass-through pyfunc model.

    Covers missing and extra columns, column reordering, and which dtype
    conversions are accepted (integer -> long, float -> double, object
    columns) or rejected with "Incompatible input types". After every
    rejected case the frame is restored to schema-conforming dtypes so that
    each case isolates one offending column.
    """

    class TestModel(object):
        @staticmethod
        def predict(pdf):
            # Pass-through: return whatever the enforcement step produced.
            return pdf

    m = Model()
    input_schema = Schema([
        ColSpec("integer", "a"),
        ColSpec("long", "b"),
        ColSpec("float", "c"),
        ColSpec("double", "d"),
        ColSpec("boolean", "e"),
        ColSpec("string", "g"),
        ColSpec("binary", "f"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    # ``np.object`` was removed from NumPy; the builtin ``object`` is equivalent.
    pdf = pd.DataFrame(
        data=[[1, 2, 3, 4, True, "x", bytes([1])]],
        columns=["b", "d", "a", "c", "e", "g", "f"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)

    # NOTE: assertions inspect ``ex.value`` (the exception) rather than ``ex``
    # (pytest's ExceptionInfo, whose str() is a repr of the wrapper).

    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f"]])
    assert "Model input is missing columns" in str(ex.value)

    # test that extra column is ignored
    pdf["x"] = 1

    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())

    expected_types = dict(
        zip(input_schema.column_names(), input_schema.pandas_types()))
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.column_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 4. float -> double works
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    # Restore the schema dtype so the next case only has one offending column.
    pdf["d"] = pdf["d"].astype(np.float64)

    # 5. ints supplied for float/double columns raise
    pdf["c"] = pdf["c"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["c"] = pdf["c"].astype(np.float32)

    pdf["d"] = pdf["d"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 6. floats supplied for integer/long columns raise
    pdf["a"] = pdf["a"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["a"] = pdf["a"].astype(np.int32)

    pdf["b"] = pdf["b"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 7. objects work
    for col in ("b", "d", "e", "f", "g"):
        pdf[col] = pdf[col].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
Esempio n. 13
0
def test_tensor_multi_named_schema_enforcement():
    """Exercise enforcement of a schema with three named tensor specs.

    Covers a missing input, an extra input, variable-length axes, dtype and
    shape mismatches, non-dict input, an empty array, list values inside the
    dict, and a DataFrame whose columns do not have the required shape.
    """
    m = Model()
    input_schema = Schema([
        TensorSpec(np.dtype(np.uint64), (-1, 5), "a"),
        TensorSpec(np.dtype(np.short), (-1, 2), "b"),
        TensorSpec(np.dtype(np.float32), (2, -1, 2), "c"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    inp = {
        "a": np.array([[0, 0, 0, 0, 0], [1, 1, 1, 1, 1]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1], [2, 2]], dtype=np.short),
        "c": np.array([[[0, 0], [1, 1]], [[2, 2], [3, 3]]], dtype=np.float32),
    }
    expected_types = dict(
        zip(input_schema.input_names(), input_schema.input_types()))

    # NOTE: assertions inspect ``ex.value`` (the exception) rather than ``ex``
    # (pytest's ExceptionInfo, whose str() is a repr of the wrapper).

    # test that missing column raises
    # Drop "b" and pass the remaining dict; passing the popped array itself
    # would test a bare-ndarray input instead of a missing input.
    inp1 = dict(inp)
    inp1.pop("b")
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp1)
    assert "Model is missing inputs" in str(ex.value)

    # test that extra column is removed
    inp2 = dict(inp)
    inp2["x"] = 1
    res = pyfunc_model.predict(inp2)
    # Element-wise comparison; ``==`` on dicts of ndarrays only succeeds
    # through the identity shortcut and raises ValueError otherwise.
    assert _compare_exact_tensor_dict_input(res, inp)
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test that variable axes are supported
    inp3 = {
        "a": np.array(
            [[0, 0, 0, 0, 0], [1, 1, 1, 1, 1], [2, 2, 2, 2, 2]], dtype=np.uint64
        ),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]], [[2, 2]]], dtype=np.float32),
    }
    res = pyfunc_model.predict(inp3)
    assert _compare_exact_tensor_dict_input(res, inp3)
    actual_types = {k: v.dtype for k, v in res.items()}
    assert expected_types == actual_types

    # test that type casting is not supported
    inp4 = dict(inp)
    inp4["a"] = inp4["a"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp4)
    assert "dtype of input int32 does not match expected dtype uint64" in str(
        ex.value)

    # test wrong shape
    # uint64 (not the platform-dependent np.uint) so only the shape is wrong.
    inp5 = {
        "a": np.array([[0, 0, 0, 0]], dtype=np.uint64),
        "b": np.array([[0, 0], [1, 1]], dtype=np.short),
        "c": np.array([[[0, 0]]], dtype=np.float32),
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp5)
    assert "Shape of input (1, 4) does not match expected shape (-1, 5)" in str(
        ex.value)

    # test non-dictionary input
    inp6 = [
        np.array([[0, 0, 0, 0, 0]], dtype=np.uint64),
        np.array([[0, 0], [1, 1]], dtype=np.short),
        np.array([[[0, 0]]], dtype=np.float32),
    ]
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp6)
    assert "Model is missing inputs ['a', 'b', 'c']." in str(ex.value)

    # test empty ndarray does not work
    inp7 = dict(inp)
    inp7["a"] = np.array([])
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp7)
    assert "Shape of input (0,) does not match expected shape" in str(ex.value)

    # test dictionary of str -> list does not work
    inp8 = {k: list(v) for k, v in inp.items()}
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(inp8)
    assert "This model contains a tensor-based model signature with input names" in str(
        ex.value)
    assert (
        "suggests a dictionary input mapping input name to a numpy array, but a dict"
        " with value type <class 'list'> was found") in str(ex.value)

    # test dataframe input fails at shape enforcement
    pdf = pd.DataFrame(
        data=[[1, 2, 3]],
        columns=["a", "b", "c"],
    )
    pdf["a"] = pdf["a"].astype(np.uint64)
    pdf["b"] = pdf["b"].astype(np.short)
    pdf["c"] = pdf["c"].astype(np.float32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Shape of input (1,) does not match expected shape (-1, 5)" in str(
        ex.value)
Esempio n. 14
0
def test_column_schema_enforcement():
    """Exercise column-schema enforcement, including a datetime column.

    Covers missing and extra columns, column reordering, accepted and
    rejected dtype conversions, object columns, datetime precision, ndarray
    input without column names, and dict inputs (valid, nested, and ragged).
    After every rejected case the frame is restored to schema-conforming
    dtypes so that each case isolates one offending column.
    """
    m = Model()
    input_schema = Schema([
        ColSpec("integer", "a"),
        ColSpec("long", "b"),
        ColSpec("float", "c"),
        ColSpec("double", "d"),
        ColSpec("boolean", "e"),
        ColSpec("string", "g"),
        ColSpec("binary", "f"),
        ColSpec("datetime", "h"),
    ])
    m.signature = ModelSignature(inputs=input_schema)
    pyfunc_model = PyFuncModel(model_meta=m, model_impl=TestModel())
    # ``np.object`` was removed from NumPy; the builtin ``object`` is equivalent.
    pdf = pd.DataFrame(
        data=[[
            1, 2, 3, 4, True, "x",
            bytes([1]), "2021-01-01 00:00:00.1234567"
        ]],
        columns=["b", "d", "a", "c", "e", "g", "f", "h"],
        dtype=object,
    )
    pdf["a"] = pdf["a"].astype(np.int32)
    pdf["b"] = pdf["b"].astype(np.int64)
    pdf["c"] = pdf["c"].astype(np.float32)
    pdf["d"] = pdf["d"].astype(np.float64)
    # Explicit unit: newer pandas rejects casting to unit-less datetime64.
    pdf["h"] = pdf["h"].astype("datetime64[ns]")

    # NOTE: assertions inspect ``ex.value`` (the exception) rather than ``ex``
    # (pytest's ExceptionInfo, whose str() is a repr of the wrapper).

    # test that missing column raises
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf[["b", "d", "a", "e", "g", "f", "h"]])
    assert "Model is missing inputs" in str(ex.value)

    # test that extra column is ignored
    pdf["x"] = 1

    # test that columns are reordered, extra column is ignored
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())

    expected_types = dict(
        zip(input_schema.input_names(), input_schema.pandas_types()))
    # MLflow datetime type in input_schema does not encode precision, so add it for assertions
    expected_types["h"] = np.dtype("datetime64[ns]")
    actual_types = res.dtypes.to_dict()
    assert expected_types == actual_types

    # Test conversions
    # 1. long -> integer raises
    pdf["a"] = pdf["a"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 2. integer -> long works
    pdf["b"] = pdf["b"].astype(np.int32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 3. unsigned int -> long works
    pdf["b"] = pdf["b"].astype(np.uint32)
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types
    pdf["b"] = pdf["b"].astype(np.int64)

    # 4. unsigned int -> int raises
    pdf["a"] = pdf["a"].astype(np.uint32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["a"] = pdf["a"].astype(np.int32)

    # 5. double -> float raises
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 6. float -> double works, double -> float does not
    pdf["d"] = pdf["d"].astype(np.float32)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["d"] = pdf["d"].astype(np.float64)
    pdf["c"] = pdf["c"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 7. int -> float raises
    pdf["c"] = pdf["c"].astype(np.int32)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["c"] = pdf["c"].astype(np.float32)

    # 8. int -> double works
    pdf["d"] = pdf["d"].astype(np.int32)
    # Capture the result so the assertions check this call, not an earlier one.
    res = pyfunc_model.predict(pdf)
    assert all((res == pdf[input_schema.input_names()]).all())
    assert res.dtypes.to_dict() == expected_types

    # 9. long -> double raises
    pdf["d"] = pdf["d"].astype(np.int64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["d"] = pdf["d"].astype(np.float64)

    # 10. any float -> any int raises
    for float_type in (np.float32, np.float64):
        pdf["a"] = pdf["a"].astype(float_type)
        with pytest.raises(MlflowException) as ex:
            pyfunc_model.predict(pdf)
        assert "Incompatible input types" in str(ex.value)
    pdf["a"] = pdf["a"].astype(np.int32)

    pdf["b"] = pdf["b"].astype(np.float64)
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf)
    assert "Incompatible input types" in str(ex.value)
    pdf["b"] = pdf["b"].astype(np.int64)

    # 11. objects work
    for col in ("b", "d", "e", "f", "g"):
        pdf[col] = pdf[col].astype(object)
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types

    # 12. datetime64[D] (date only) -> datetime64[x] works
    pdf["h"] = pdf["h"].astype("datetime64[D]")
    res = pyfunc_model.predict(pdf)
    assert res.dtypes.to_dict() == expected_types
    pdf["h"] = pdf["h"].astype("datetime64[s]")

    # 13. np.ndarrays can be converted to dataframe but have no columns
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(pdf.values)
    assert "Model is missing inputs" in str(ex.value)

    # 14. dictionaries of str -> list/nparray work
    arr = np.array([1, 2, 3])
    d = {
        "a": arr.astype("int32"),
        "b": arr.astype("int64"),
        "c": arr.astype("float32"),
        "d": arr.astype("float64"),
        "e": [True, False, True],
        "g": ["a", "b", "c"],
        "f": [bytes(0), bytes(1), bytes(1)],
        "h": np.array(["2020-01-01", "2020-02-02", "2020-03-03"],
                      dtype=np.datetime64),
    }
    res = pyfunc_model.predict(d)
    assert res.dtypes.to_dict() == expected_types

    # 15. dictionaries of str -> list[list] fail
    d = {
        "a": [arr.astype("int32")],
        "b": [arr.astype("int64")],
        "c": [arr.astype("float32")],
        "d": [arr.astype("float64")],
        "e": [[True, False, True]],
        "g": [["a", "b", "c"]],
        "f": [[bytes(0), bytes(1), bytes(1)]],
        "h": [
            np.array(["2020-01-01", "2020-02-02", "2020-03-03"],
                     dtype=np.datetime64)
        ],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert "Incompatible input types" in str(ex.value)

    # 16. conversion to dataframe fails
    d = {
        "a": [1],
        "b": [1, 2],
        "c": [1, 2, 3],
    }
    with pytest.raises(MlflowException) as ex:
        pyfunc_model.predict(d)
    assert "This model contains a column-based signature, which suggests a DataFrame input." in str(
        ex.value)
Esempio n. 15
0
# Prepare dataset: download the CSV, drop incomplete rows and unused columns,
# declare the model signature, and split into train and test sets.
try:
    repo_url = "https://raw.githubusercontent.com/prinz-nussknacker"
    csv_url = f"{repo_url}/banksim1/master/bs140513_032310.csv"
    data = pd.read_csv(csv_url, sep=",", quotechar="'", header=0)
except Exception as e:
    # Nothing below can run without the dataset: log with traceback and stop.
    # SystemExit is raised directly because the ``exit`` helper is injected by
    # the ``site`` module and is not guaranteed to exist.
    logger.exception("Could not read CSV file: %s", e)
    raise SystemExit(1)

# dropna() returns a new frame; assign it, otherwise the call has no effect.
data = data.dropna()
data = data.drop(["step", "customer", "zipcodeOri", "merchant", "zipMerchant"],
                 axis="columns")

# Signature: three string columns and one double column in, one integer out.
input_schema = Schema([
    ColSpec("string", "age"),
    ColSpec("string", "gender"),
    ColSpec("string", "category"),
    ColSpec("double", "amount")
])
output_schema = Schema([ColSpec("integer")])
signature = ModelSignature(inputs=input_schema, outputs=output_schema)

# Prepare train and test sets ("fraud" is the target column)
data_x = data.drop(["fraud"], axis="columns")
data_y = data[["fraud"]]
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y)

with mlflow.start_run():
    # Define pipeline
    numeric_features = ['amount']
    numeric_transformer = Pipeline(
        steps=[('imputer',
Esempio n. 16
0
def test_spark_udf_autofills_no_arguments(spark):
    """Exercise ``mlflow.pyfunc.spark_udf`` when the UDF is invoked without arguments.

    Scenarios covered, in order:

    * signature with named columns: ``udf()`` yields columns ``a``, ``b``, ``c``;
      passing only two columns explicitly is expected to raise a
      ``PythonException`` about missing columns; a frame lacking column ``a``
      is expected to raise ``AnalysisException``.
    * signature without column names: ``udf()`` is expected to raise
      ``MlflowException``.
    * model logged without a signature: evaluating ``udf()`` is expected to
      raise a ``PythonException``.
    """

    class TestModel(PythonModel):
        def predict(self, context, model_input):
            # Echo the received column labels, once per input row.
            return [model_input.columns] * len(model_input)

    def _string_array_udf(run_id):
        # UDF for the model logged under the given run, returning array<string>.
        return mlflow.pyfunc.spark_udf(
            spark,
            "runs:/{}/model".format(run_id),
            result_type=ArrayType(StringType()),
        )

    named_inputs = Schema([
        ColSpec("long", "a"),
        ColSpec("long", "b"),
        ColSpec("long", "c"),
    ])
    signature = ModelSignature(
        inputs=named_inputs,
        outputs=Schema([ColSpec("integer")]),
    )

    good_pdf = pd.DataFrame(
        {"a": [1], "b": [2], "c": [3], "d": [4]},
        columns=["a", "b", "c", "d"],
    )
    good_data = spark.createDataFrame(good_pdf)

    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model(
            "model",
            python_model=TestModel(),
            signature=signature,
        )
        udf = _string_array_udf(run.info.run_id)

        # With no arguments the UDF is fed the signature's columns only.
        scored = good_data.withColumn("res", udf()).select("res").toPandas()
        assert scored["res"][0] == ["a", "b", "c"]

        # Explicitly passing too few columns must be rejected.
        missing_columns = r"Model input is missing columns. Expected 3 input columns"
        with pytest.raises(pyspark.sql.utils.PythonException, match=missing_columns):
            good_data.withColumn("res", udf("b", "c")).select("res").toPandas()

        # this dataframe won't work because it's missing column a
        bad_pdf = pd.DataFrame(
            {"x": [1], "b": [2], "c": [3], "d": [4]},
            columns=["x", "b", "c", "d"],
        )
        bad_data = spark.createDataFrame(bad_pdf)
        unresolved_column = r"cannot resolve 'a' given input columns"
        with pytest.raises(AnalysisException, match=unresolved_column):
            bad_data.withColumn("res", udf())

    nameless_signature = ModelSignature(
        inputs=Schema([ColSpec("long") for _ in range(3)]),
        outputs=Schema([ColSpec("integer")]),
    )
    with mlflow.start_run() as run:
        mlflow.pyfunc.log_model(
            "model",
            python_model=TestModel(),
            signature=nameless_signature,
        )
        udf = _string_array_udf(run.info.run_id)
        no_names = r"Cannot apply udf because no column names specified"
        with pytest.raises(MlflowException, match=no_names):
            good_data.withColumn("res", udf())

    with mlflow.start_run() as run:
        # model without signature
        mlflow.pyfunc.log_model("model", python_model=TestModel())
        udf = _string_array_udf(run.info.run_id)
        with pytest.raises(pyspark.sql.utils.PythonException):
            good_data.withColumn("res", udf()).select("res").toPandas()
Esempio n. 17
0
def test_dataframe_from_json():
    """Round-trip DataFrames through JSON and ``_dataframe_from_json``.

    Both the ``split`` and ``records`` pandas orients are checked against
    three schemas: a column-based schema, a named tensor schema, and a schema
    holding a single unnamed tensor.
    """
    orients = ("split", "records")
    column_order = [
        "boolean",
        "string",
        "float",
        "double",
        "integer",
        "long",
        "binary",
        "date_string",
    ]
    source = pd.DataFrame(
        {
            "boolean": [True, False, True],
            "string": ["a", "b", "c"],
            "float": np.array([1.2, 2.3, 3.4], dtype=np.float32),
            "double": np.array([1.2, 2.3, 3.4], dtype=np.float64),
            "integer": np.array([3, 4, 5], dtype=np.int32),
            "long": np.array([3, 4, 5], dtype=np.int64),
            "binary": [bytes([1, 2, 3]), bytes([4, 5]), bytes([6])],
            "date_string": ["2018-02-03", "1996-03-02", "2021-03-05"],
        },
        columns=column_order,
    )

    # Raw bytes are base64-encoded in the copy that gets serialized to JSON.
    jsonable_df = pd.DataFrame(source, copy=True)
    jsonable_df["binary"] = jsonable_df["binary"].map(base64.b64encode)

    # (type, name) pairs of the column-based schema.
    typed_columns = [
        ("boolean", "boolean"),
        ("string", "string"),
        ("float", "float"),
        ("double", "double"),
        ("integer", "integer"),
        ("long", "long"),
        ("binary", "binary"),
        ("string", "date_string"),
    ]
    schema = Schema([ColSpec(col_type, col_name) for col_type, col_name in typed_columns])
    for orient in orients:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient),
            pandas_orient=orient,
            schema=schema,
        )
        # Parsing with the column schema is expected to reproduce the original frame.
        assert parsed.equals(source)

    # try parsing with tensor schema
    tensor_schema = Schema([
        TensorSpec(np.dtype("bool"), [-1], "boolean"),
        TensorSpec(np.dtype("str"), [-1], "string"),
        TensorSpec(np.dtype("float32"), [-1], "float"),
        TensorSpec(np.dtype("float64"), [-1], "double"),
        TensorSpec(np.dtype("int32"), [-1], "integer"),
        TensorSpec(np.dtype("int64"), [-1], "long"),
        TensorSpec(np.dtype(bytes), [-1], "binary"),
    ])
    for orient in orients:
        parsed = _dataframe_from_json(
            jsonable_df.to_json(orient=orient),
            pandas_orient=orient,
            schema=tensor_schema,
        )
        # NB: tensor schema does not automatically decode base64 encoded bytes,
        # so the result is compared with the encoded frame.
        assert parsed.equals(jsonable_df)

    # Test parse with TensorSchema with a single tensor
    tensor_schema = Schema([TensorSpec(np.dtype("float32"), [-1, 3])])
    source = pd.DataFrame(
        {
            "a": np.array([1, 2, 3], dtype=np.float32),
            "b": np.array([4.1, 5.2, 6.3], dtype=np.float32),
            "c": np.array([7, 8, 9], dtype=np.float32),
        },
        columns=["a", "b", "c"],
    )
    for orient in orients:
        round_tripped = _dataframe_from_json(
            source.to_json(orient=orient),
            pandas_orient=orient,
            schema=tensor_schema,
        )
        assert source.equals(round_tripped)