Example #1
def test_spark_type_mapping(pandas_df_with_all_types):
    # DataType, Schema, ColSpec, and _infer_schema come from MLflow's types
    # module in the original test file.
    from mlflow.types import ColSpec, DataType, Schema
    from mlflow.types.utils import _infer_schema

    import pyspark
    from pyspark.sql.types import (
        BooleanType,
        IntegerType,
        LongType,
        FloatType,
        DoubleType,
        StringType,
        BinaryType,
        TimestampType,
    )
    from pyspark.sql.types import StructField, StructType

    # Each MLflow DataType should map to the corresponding Spark SQL type.
    assert isinstance(DataType.boolean.to_spark(), BooleanType)
    assert isinstance(DataType.integer.to_spark(), IntegerType)
    assert isinstance(DataType.long.to_spark(), LongType)
    assert isinstance(DataType.float.to_spark(), FloatType)
    assert isinstance(DataType.double.to_spark(), DoubleType)
    assert isinstance(DataType.string.to_spark(), StringType)
    assert isinstance(DataType.binary.to_spark(), BinaryType)
    assert isinstance(DataType.datetime.to_spark(), TimestampType)
    # Drop the pandas extension-typed columns, which have no direct Spark mapping.
    pandas_df_with_all_types = pandas_df_with_all_types.drop(
        columns=["boolean_ext", "integer_ext", "string_ext"]
    )
    schema = _infer_schema(pandas_df_with_all_types)
    expected_spark_schema = StructType([
        StructField(t.name, t.to_spark(), True) for t in schema.input_types()
    ])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()
    spark_session = pyspark.sql.SparkSession.builder.getOrCreate()
    sparkdf = spark_session.createDataFrame(pandas_df_with_all_types,
                                            schema=actual_spark_schema)
    schema2 = _infer_schema(sparkdf)
    assert schema == schema2

    # Unnamed columns are keyed by their positional index in the Spark schema.
    schema = Schema([ColSpec(col.type) for col in schema.inputs])
    expected_spark_schema = StructType([
        StructField(str(i), t.to_spark(), True)
        for i, t in enumerate(schema.input_types())
    ])
    actual_spark_schema = schema.as_spark_schema()
    assert expected_spark_schema.jsonValue() == actual_spark_schema.jsonValue()

    # A single unnamed column maps to the bare Spark type rather than a StructType.
    schema = Schema([ColSpec(DataType.integer)])
    spark_type = schema.as_spark_schema()
    assert isinstance(spark_type, IntegerType)
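
For reference, here is a minimal standalone sketch of the conversion the test above exercises, assuming only the public mlflow.types exports (DataType, Schema, ColSpec) and an installed pyspark; the column names are illustrative:

from mlflow.types import ColSpec, DataType, Schema

# A schema with named columns converts to a StructType with one
# nullable StructField per column.
schema = Schema([
    ColSpec(DataType.long, name="id"),
    ColSpec(DataType.string, name="label"),
])
print(schema.as_spark_schema())  # StructType with LongType and StringType fields

# A single unnamed column collapses to the bare Spark type, not a StructType.
print(Schema([ColSpec(DataType.double)]).as_spark_schema())  # a DoubleType instance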
Example #2
def _infer_schema(data: Any) -> Schema:
    """
    Infer an MLflow schema from a dataset.

    Data passed in as a numpy array or a dictionary of numpy arrays is represented by
    :py:class:`TensorSpec`. All other supported input types are represented by
    :py:class:`ColSpec`.

    A `TensorSpec` captures the data shape (default variable axis is 0), the data type (numpy.dtype)
    and an optional name for each individual tensor of the dataset.
    A `ColSpec` captures the data type (defined in :py:class:`DataType`) and an optional name for
    each individual column of the dataset.

    This method will raise an exception if the user data contains incompatible types or is not
    passed in one of the supported formats (containers).

    The input should be one of these:
      - pandas.DataFrame or pandas.Series
      - dictionary of { name -> numpy.ndarray}
      - numpy.ndarray
      - pyspark.sql.DataFrame
      - scipy.sparse.csc_matrix or scipy.sparse.csr_matrix

    The element types should be mappable to one of :py:class:`mlflow.models.signature.DataType`
    for dataframes and to one of the numpy dtypes for tensors.

    :param data: Dataset to infer from.

    :return: Schema
    """
    from scipy.sparse import csr_matrix, csc_matrix

    if isinstance(data, dict):
        res = []
        for name, ndarray in data.items():
            if not isinstance(ndarray, np.ndarray):
                raise TypeError("Data in the dictionary must be of type numpy.ndarray")
            res.append(
                TensorSpec(
                    type=clean_tensor_type(ndarray.dtype),
                    shape=_get_tensor_shape(ndarray),
                    name=name,
                )
            )
        schema = Schema(res)
    elif isinstance(data, pd.Series):
        schema = Schema([ColSpec(type=_infer_pandas_column(data))])
    elif isinstance(data, pd.DataFrame):
        schema = Schema(
            [ColSpec(type=_infer_pandas_column(data[col]), name=col) for col in data.columns]
        )
    elif isinstance(data, np.ndarray):
        schema = Schema(
            [TensorSpec(type=clean_tensor_type(data.dtype), shape=_get_tensor_shape(data))]
        )
    elif isinstance(data, (csc_matrix, csr_matrix)):
        schema = Schema(
            [TensorSpec(type=clean_tensor_type(data.data.dtype), shape=_get_tensor_shape(data))]
        )
    elif _is_spark_df(data):
        schema = Schema(
            [
                ColSpec(type=_infer_spark_type(field.dataType), name=field.name)
                for field in data.schema.fields
            ]
        )
    else:
        raise TypeError(
            "Expected one of (pandas.DataFrame, pandas.Series, numpy array, "
            "dictionary of (name -> numpy.ndarray), pyspark.sql.DataFrame, "
            "scipy.sparse.csc_matrix, scipy.sparse.csr_matrix) "
            "but got '{}'".format(type(data))
        )
    if not schema.is_tensor_spec() and any(
        t in (DataType.integer, DataType.long) for t in schema.input_types()
    ):
        warnings.warn(
            "Hint: Inferred schema contains integer column(s). Integer columns in "
            "Python cannot represent missing values. If your input data contains "
            "missing values at inference time, it will be encoded as floats and will "
            "cause a schema enforcement error. The best way to avoid this problem is "
            "to infer the model schema based on a realistic data sample (training "
            "dataset) that includes missing values. Alternatively, you can declare "
            "integer columns as doubles (float64) whenever these columns may have "
            "missing values. See `Handling Integers With Missing Values "
            "<https://www.mlflow.org/docs/latest/models.html#"
            "handling-integers-with-missing-values>`_ for more details.",
            stacklevel=2,
        )
    return schema
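
A short usage sketch of _infer_schema, assuming it is imported from mlflow.types.utils as in the test above; the data values are illustrative:

import numpy as np
import pandas as pd

from mlflow.types.utils import _infer_schema

# Column-based input yields a Schema of ColSpecs, one per column.
# Note: the integer column also triggers the warning described above.
df = pd.DataFrame({"age": [25, 32], "name": ["ann", "bob"]})
print(_infer_schema(df))  # e.g. ['age': long, 'name': string]

# Tensor-based input yields a Schema of TensorSpecs; dict keys become tensor
# names, and axis 0 is reported as variable (-1).
tensors = {"image": np.zeros((4, 28, 28), dtype=np.float32)}
print(_infer_schema(tensors))  # e.g. ['image': Tensor('float32', (-1, 28, 28))]

# The pitfall behind the warning: pandas silently promotes an integer column
# containing a missing value to float64, so the inferred type becomes double.
print(pd.DataFrame({"age": [25, None]}).dtypes["age"])  # float64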