Esempio n. 1
0
def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.Series[int]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> ps.DataFrame[np.float]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> 'int':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.Series[int]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder

    spec = getfullargspec(f)
    tpe = spec.annotations.get("return", None)
    if isinstance(tpe, str):
        # This type hint can happen when given hints are string to avoid forward reference.
        tpe = resolve_string_type_hint(tpe)

    if hasattr(tpe, "__origin__") and (
        tpe.__origin__ == ps.DataFrame or tpe.__origin__ == ps.Series
    ):
        # When Python version is lower then 3.7. Unwrap it to a Tuple/SeriesType type hints.
        tpe = tpe.__args__[0]

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
    # Check if the name is Tuple.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    if name == "Tuple":
        tuple_type = tpe
        if hasattr(tuple_type, "__tuple_params__"):
            # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead.
            # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py
            parameters = getattr(tuple_type, "__tuple_params__")
        else:
            parameters = getattr(tuple_type, "__args__")
        dtypes, spark_types = zip(
            *(
                pandas_on_spark_type(p.tpe)
                if isclass(p) and issubclass(p, NameTypeHolder)
                else pandas_on_spark_type(p)
                for p in parameters
            )
        )
        names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None for p in parameters
        ]
        return DataFrameType(list(dtypes), list(spark_types), names)

    types = pandas_on_spark_type(tpe)
    if types is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*types)
Esempio n. 2
0
def infer_return_type(
        f: Callable
) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.Series[int]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> ps.DataFrame[np.float]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> 'int':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.Series[int]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category,struct_field=StructField(index,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...         (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    spec = getfullargspec(f)
    tpe = spec.annotations.get("return", None)
    if isinstance(tpe, str):
        # This type hint can happen when given hints are string to avoid forward reference.
        tpe = resolve_string_type_hint(tpe)

    if hasattr(tpe, "__origin__") and (tpe.__origin__ == ps.DataFrame
                                       or tpe.__origin__ == ps.Series):
        # When Python version is lower then 3.7. Unwrap it to a Tuple/SeriesType type hints.
        tpe = tpe.__args__[0]

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
    # Check if the name is Tuple.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    if name == "Tuple":
        tuple_type = tpe
        if hasattr(tuple_type, "__tuple_params__"):
            # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead.
            # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py
            parameters = getattr(tuple_type, "__tuple_params__")
        else:
            parameters = getattr(tuple_type, "__args__")

        index_parameters = [
            p for p in parameters
            if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(
            data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(
                    index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name if index_name is not None else
                            SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    ))
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        data_dtypes, data_spark_types = zip(
            *(pandas_on_spark_type(p.tpe) if isclass(p)
              and issubclass(p, NameTypeHolder) else pandas_on_spark_type(p)
              for p in data_parameters))
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
                zip(data_names, data_dtypes, data_spark_types)):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name)
                        if data_name is not None else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                ))

        return DataFrameType(index_fields=index_fields,
                             data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)