Example #1
def test_spark_schema_inference(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import _parse_datatype_string, StructField, StructType
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    spark_schema = StructType(
        [StructField(t.name, _parse_datatype_string(t.name), True)
         for t in schema.column_types()])
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types, schema=spark_schema)
    schema = _infer_schema(sparkdf)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
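Both assertions above compare against ColSpec(x, x), which implies that the pandas_df_with_all_types fixture names every column after the DataType it exercises. The fixture itself is not shown on this page; a minimal sketch of what it might look like (a hypothetical reconstruction, not the repository's actual fixture):

import numpy as np
import pandas as pd
import pytest


@pytest.fixture
def pandas_df_with_all_types():
    # Hypothetical fixture: one column per DataType, named after that type,
    # so that _infer_schema yields ColSpec(name, name) for each column.
    return pd.DataFrame({
        "boolean": [True, False, True],
        "integer": np.array([1, 2, 3], dtype=np.int32),
        "long": np.array([1, 2, 3], dtype=np.int64),
        "float": np.array([1.1, 2.2, 3.3], dtype=np.float32),
        "double": np.array([1.1, 2.2, 3.3], dtype=np.float64),
        "binary": [bytes([1]), bytes([2]), bytes([3])],
        "string": ["a", "b", "c"],
    })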
Example #2
def test_spark_type_mapping(pandas_df_with_all_types):
    import pyspark
    from pyspark.sql.types import (BooleanType, IntegerType, LongType, FloatType,
                                   DoubleType, StringType, BinaryType,
                                   StructField, StructType)

    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)
    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType(
        [StructField(t.name, t.to_spark(), True)
         for t in schema.column_types()])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    spark_session = pyspark.sql.SparkSession(pyspark.SparkContext.getOrCreate())
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types,
                                            schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # test unnamed columns
    schema = Schema([ColSpec(col.type) for col in schema.columns])
    expected_spark_schema = StructType(
        [StructField(str(i), t.to_spark(), True)
         for i, t in enumerate(schema.column_types())])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # test single unnamed column is mapped to just a single spark type
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
Example #3
def test_model_log():
    with TempDir(chdr=True) as tmp:
        experiment_id = kiwi.create_experiment("test")
        sig = ModelSignature(inputs=Schema([ColSpec("integer", "x"), ColSpec("integer", "y")]),
                             outputs=Schema([ColSpec(name=None, type="double")]))
        input_example = {"x": 1, "y": 2}
        with kiwi.start_run(experiment_id=experiment_id) as r:
            Model.log("some/path", TestFlavor,
                      signature=sig,
                      input_example=input_example)

        local_path = _download_artifact_from_uri("runs:/{}/some/path".format(r.info.run_id),
                                                 output_path=tmp.path(""))
        loaded_model = Model.load(os.path.join(local_path, "MLmodel"))
        assert loaded_model.run_id == r.info.run_id
        assert loaded_model.artifact_path == "some/path"
        assert loaded_model.flavors == {
            "flavor1": {"a": 1, "b": 2},
            "flavor2": {"x": 1, "y": 2},
        }
        assert loaded_model.signature == sig
        path = os.path.join(local_path, loaded_model.saved_input_example_info["artifact_path"])
        x = _dataframe_from_json(path)
        assert x.to_dict(orient="records")[0] == input_example
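TestFlavor is defined elsewhere in this test module. Judging from the flavors assertion above, a minimal sketch might look like this (a hypothetical stand-in; the real helper may differ):

import os


class TestFlavor(object):
    # Dummy flavor: Model.log hands it a local path plus the Model instance
    # being logged, and it records two fake flavor entries and the MLmodel file.
    @classmethod
    def save_model(cls, path, mlflow_model, signature=None, input_example=None):
        mlflow_model.flavors["flavor1"] = {"a": 1, "b": 2}
        mlflow_model.flavors["flavor2"] = {"x": 1, "y": 2}
        os.makedirs(path, exist_ok=True)
        mlflow_model.save(os.path.join(path, "MLmodel"))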
Example #4
def test_model_save_load():
    m = Model(artifact_path="some/path",
              run_id="123",
              flavors={
                  "flavor1": {"a": 1, "b": 2},
                  "flavor2": {"x": 1, "y": 2},
              },
              signature=ModelSignature(
                  inputs=Schema([ColSpec("integer", "x"), ColSpec("integer", "y")]),
                  outputs=Schema([ColSpec(name=None, type="double")])),
              saved_input_example_info={"x": 1, "y": 2})
    assert m.get_input_schema() == m.signature.inputs
    assert m.get_output_schema() == m.signature.outputs
    x = Model(artifact_path="some/other/path", run_id="1234")
    assert x.get_input_schema() is None
    assert x.get_output_schema() is None

    n = Model(artifact_path="some/path",
              run_id="123",
              flavors={
                  "flavor1": {"a": 1, "b": 2},
                  "flavor2": {"x": 1, "y": 2},
              },
              signature=ModelSignature(
                  inputs=Schema([ColSpec("integer", "x"), ColSpec("integer", "y")]),
                  outputs=Schema([ColSpec(name=None, type="double")])),
              saved_input_example_info={"x": 1, "y": 2})
    n.utc_time_created = m.utc_time_created
    assert m == n
    n.signature = None
    assert m != n
    with TempDir() as tmp:
        m.save(tmp.path("model"))
        o = Model.load(tmp.path("model"))
    assert m == o
    assert m.to_json() == o.to_json()
    assert m.to_yaml() == o.to_yaml()
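For reference, the YAML that m.save() and m.to_yaml() produce is the familiar MLmodel file. A hedged sketch of its shape for the model above (field order and the timestamp are illustrative, not verbatim output):

print(m.to_yaml())
# artifact_path: some/path
# flavors:
#   flavor1: {a: 1, b: 2}
#   flavor2: {x: 1, y: 2}
# run_id: '123'
# saved_input_example_info: {x: 1, y: 2}
# signature:
#   inputs: '[{"name": "x", "type": "integer"}, {"name": "y", "type": "integer"}]'
#   outputs: '[{"type": "double"}]'
# utc_time_created: '...'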
Example #5
def _infer_schema(data: Any) -> Schema:
    """
    Infer an MLflow schema from a dataset.

    This method captures the column names and data types from the user data. The signature
    represents model input and output as data frames with (optionally) named columns and data
    types specified as one of the types defined in :py:class:`DataType`. This method raises
    an exception if the user data contains incompatible types or is not passed in one of the
    supported formats (containers).

    The input should be one of these:
      - pandas.DataFrame or pandas.Series
      - dictionary of {name -> numpy.ndarray}
      - numpy.ndarray
      - pyspark.sql.DataFrame

    The element types should be mappable to one of :py:class:`mlflow.models.signature.DataType`.

    NOTE: Multidimensional (>2d) arrays (aka tensors) are not supported at this time.

    :param data: Dataset to infer from.

    :return: Schema
    """

    if isinstance(data, dict):
        res = []
        for col in data.keys():
            ary = data[col]
            if not isinstance(ary, np.ndarray):
                raise TypeError(
                    "Data in the dictionary must be of type numpy.ndarray")
            dims = len(ary.shape)
            if dims == 1:
                res.append(ColSpec(type=_infer_numpy_array(ary), name=col))
            else:
                raise TensorsNotSupportedException(
                    "Data in the dictionary must be 1-dimensional, "
                    "got shape {}".format(ary.shape))
        return Schema(res)
    elif isinstance(data, pd.Series):
        return Schema([ColSpec(type=_infer_numpy_array(data.values))])
    elif isinstance(data, pd.DataFrame):
        return Schema([
            ColSpec(type=_infer_numpy_array(data[col].values), name=col)
            for col in data.columns
        ])
    elif isinstance(data, np.ndarray):
        if len(data.shape) > 2:
            raise TensorsNotSupportedException(
                "Attempting to infer schema from numpy array with "
                "shape {}".format(data.shape))
        if data.dtype == object:
            data = pd.DataFrame(data).infer_objects()
            return Schema([
                ColSpec(type=_infer_numpy_array(data[col].values))
                for col in data.columns
            ])
        if len(data.shape) == 1:
            return Schema([ColSpec(type=_infer_numpy_dtype(data.dtype))])
        elif len(data.shape) == 2:
            return Schema([ColSpec(type=_infer_numpy_dtype(data.dtype))] *
                          data.shape[1])
    elif _is_spark_df(data):
        return Schema([
            ColSpec(type=_infer_spark_type(field.dataType), name=field.name)
            for field in data.schema.fields
        ])
    raise TypeError(
        "Expected one of (pandas.DataFrame, numpy array, "
        "dictionary of (name -> numpy.ndarray), pyspark.sql.DataFrame) "
        "but got '{}'".format(type(data)))
Example #6
def test_schema_inference_on_numpy_array(pandas_df_with_all_types):
    # drop the int and float columns: type size information is lost when the values
    # are stored as objects, and the inferred defaults are 64-bit.
    pandas_df_with_all_types = pandas_df_with_all_types.drop(columns=["integer", "float"])
    schema = _infer_schema(pandas_df_with_all_types.values)
    assert schema == Schema([ColSpec(x) for x in pandas_df_with_all_types.columns])

    # test objects
    schema = _infer_schema(np.array(["a"], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.string)])
    schema = _infer_schema(np.array([bytes([1])], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1]), None], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([True, None], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.boolean)])
    schema = _infer_schema(np.array([1.1, None], dtype=np.object))
    assert schema == Schema([ColSpec(DataType.double)])

    # test bytes
    schema = _infer_schema(np.array([bytes([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])
    schema = _infer_schema(np.array([bytearray([1])], dtype=np.bytes_))
    assert schema == Schema([ColSpec(DataType.binary)])

    # test string
    schema = _infer_schema(np.array(["a"], dtype=np.str))
    assert schema == Schema([ColSpec(DataType.string)])

    # test boolean
    schema = _infer_schema(np.array([True], dtype=bool))
    assert schema == Schema([ColSpec(DataType.boolean)])

    # test ints
    for t in [np.uint8, np.uint16, np.int8, np.int16, np.int32]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("integer")])

    # test longs
    for t in [np.uint32, np.int64]:
        schema = _infer_schema(np.array([1, 2, 3], dtype=t))
        assert schema == Schema([ColSpec("long")])

    # unsigned long is unsupported
    with pytest.raises(MlflowException):
        _infer_schema(np.array([1, 2, 3], dtype=np.uint64))

    # test floats
    for t in [np.float16, np.float32]:
        schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=t))
        assert schema == Schema([ColSpec("float")])

    # test doubles
    schema = _infer_schema(np.array([1.1, 2.2, 3.3], dtype=np.float64))
    assert schema == Schema([ColSpec("double")])

    # unsupported
    if hasattr(np, "float128"):
        with pytest.raises(MlflowException):
            _infer_schema(np.array([1, 2, 3], dtype=np.float128))
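Condensing the assertions above, the numpy-to-MLflow numeric mapping this test exercises is roughly the following (the dict and its name are just an illustrative summary, not part of the implementation):

NUMPY_TO_MLFLOW = {
    # Summary of the mappings asserted in this test.
    np.int8: "integer", np.int16: "integer", np.int32: "integer",
    np.uint8: "integer", np.uint16: "integer",
    np.int64: "long", np.uint32: "long",   # uint32 is widened to long
    np.float16: "float", np.float32: "float",
    np.float64: "double",
    # np.uint64 and np.float128 raise MlflowException
}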
Example #7
def test_schema_inference_on_dataframe(pandas_df_with_all_types):
    schema = _infer_schema(pandas_df_with_all_types)
    assert schema == Schema([ColSpec(x, x) for x in pandas_df_with_all_types.columns])
Example #8
def test_col_spec():
    a1 = ColSpec("string", "a")
    a2 = ColSpec(DataType.string, "a")
    a3 = ColSpec(DataType.integer, "a")
    assert a1 != a3
    b1 = ColSpec(DataType.string, "b")
    assert b1 != a1
    assert a1 == a2
    with pytest.raises(MlflowException) as ex:
        ColSpec("unsupported")
    assert "Unsupported type 'unsupported'" in ex.value.message
    a4 = ColSpec(**a1.to_dict())
    assert a4 == a1
    assert ColSpec(**json.loads(json.dumps(a1.to_dict()))) == a1
    a5 = ColSpec("string")
    a6 = ColSpec("string", None)
    assert a5 == a6
    assert ColSpec(**json.loads(json.dumps(a5.to_dict()))) == a5
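For reference, the dictionaries round-tripped through json here are small and flat; a sketch of what to_dict() plausibly returns, given the equality checks above (assumed from the test, not verified against the implementation):

a1.to_dict()  # -> {"name": "a", "type": "string"}
a5.to_dict()  # -> {"type": "string"}  (an unnamed column omits "name",
              #    so ColSpec(**a5.to_dict()) reconstructs name=None)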