Example #1
    def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return _as_bool_type(index_ops, dtype)
        elif isinstance(spark_type, StringType):
            if isinstance(dtype, extension_dtypes):
                # seems like a pandas bug?
                scol = F.when(index_ops.spark.column.isNull(),
                              str(pd.NaT)).otherwise(
                                  index_ops.spark.column.cast(spark_type))
            else:
                null_str = str(pd.NaT)
                casted = index_ops.spark.column.cast(spark_type)
                scol = F.when(index_ops.spark.column.isNull(),
                              null_str).otherwise(casted)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        else:
            return _as_other_type(index_ops, dtype, spark_type)
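For comparison, the null handling above mirrors plain pandas, which renders a missing datetime as the string "NaT" when casting to str. A minimal pandas sketch (not part of the example):

import pandas as pd

# pandas stringifies missing datetimes as "NaT", which is what the
# `str(pd.NaT)` branches above reproduce on the Spark side.
s = pd.Series([pd.Timestamp("2022-01-01"), pd.NaT])
print(s.astype(str).tolist())  # ['2022-01-01 00:00:00', 'NaT']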
Example #2
    def astype(self, index_ops: IndexOpsLike,
               dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            return _as_bool_type(index_ops, dtype)
        elif isinstance(spark_type, StringType):
            if isinstance(dtype, extension_dtypes):
                scol = F.when(
                    index_ops.spark.column.isNotNull(),
                    F.when(index_ops.spark.column, "True").otherwise("False"),
                )
            else:
                null_str = str(None)
                casted = F.when(index_ops.spark.column,
                                "True").otherwise("False")
                scol = F.when(index_ops.spark.column.isNull(),
                              null_str).otherwise(casted)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        else:
            return _as_other_type(index_ops, dtype, spark_type)
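The "True"/"False" literals above match how pandas stringifies booleans, and `null_str = str(None)` matches how a missing value in an object Series is rendered. A quick pandas check (a sketch, not part of the example):

import pandas as pd

# Booleans stringify as "True"/"False"; a None in an object Series as "None".
print(pd.Series([True, False]).astype(str).tolist())  # ['True', 'False']
print(pd.Series([True, None]).astype(str).tolist())   # ['True', 'None']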
Example #3
    def codes(self) -> "ps.Series":
        """
        Return Series of codes as well as the index.

        Examples
        --------
        >>> s = ps.Series(list("abbccc"), dtype="category")
        >>> s  # doctest: +SKIP
        0    a
        1    b
        2    b
        3    c
        4    c
        5    c
        dtype: category
        Categories (3, object): ['a', 'b', 'c']

        >>> s.cat.codes
        0    0
        1    1
        2    1
        3    2
        4    2
        5    2
        dtype: int8
        """
        return self._data._with_new_scol(
            self._data.spark.column,
            field=InternalField.from_struct_field(
                StructField(
                    name=self._data._internal.data_spark_column_names[0],
                    dataType=self._data.spark.data_type,
                    nullable=self._data.spark.nullable,
                )),
        ).rename()
Example #4
    def astype(self, index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                if isinstance(index_ops.spark.data_type,
                              (FloatType, DoubleType)):
                    scol = F.when(
                        index_ops.spark.column.isNull()
                        | F.isnan(index_ops.spark.column),
                        F.lit(True),
                    ).otherwise(index_ops.spark.column.cast(spark_type))
                else:  # DecimalType
                    scol = F.when(index_ops.spark.column.isNull(),
                                  F.lit(False)).otherwise(
                                      index_ops.spark.column.cast(spark_type))
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype, null_str=str(np.nan))
        else:
            return _as_other_type(index_ops, dtype, spark_type)
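The NaN-to-True branch above follows Python/NumPy truthiness, where float("nan") is truthy; plain pandas behaves the same way on a bool cast. A small check (not part of the example):

import numpy as np
import pandas as pd

# float("nan") is truthy, so pandas maps NaN to True when casting to bool;
# the F.isnan(...) branch above reproduces this for FloatType/DoubleType.
print(bool(float("nan")))                                   # True
print(pd.Series([1.0, 0.0, np.nan]).astype(bool).tolist())  # [True, False, True]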
Example #5
    def _transform_batch(
            self, func: Callable[..., pd.Series],
            return_type: Optional[Union[SeriesType, ScalarType]]) -> "Series":
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.series import Series, first_series
        from pyspark import pandas as ps

        if not isinstance(func, FunctionType):
            f = func
            func = lambda *args, **kwargs: f(*args, **kwargs)

        if return_type is None:
            # TODO: In this case, it avoids the shortcut for now (but only infers schema)
            #  because it returns a series from a different DataFrame and it has a different
            #  anchor. We should fix this to allow the shortcut or only allow to infer
            #  schema.
            limit = ps.get_option("compute.shortcut_limit")
            pser = self._psser.head(limit + 1)._to_internal_pandas()
            transformed = pser.transform(func)
            psser = Series(transformed)  # type: Series

            field = psser._internal.data_fields[0].normalize_spark_type()
        else:
            spark_return_type = return_type.spark_type
            dtype = return_type.dtype
            field = InternalField(
                dtype=dtype,
                struct_field=StructField(
                    name=self._psser._internal.data_spark_column_names[0],
                    dataType=spark_return_type,
                ),
            )

        psdf = self._psser.to_frame()
        columns = psdf._internal.spark_column_names

        def pandas_concat(*series: pd.Series) -> pd.DataFrame:
            # From Spark 3.0, the input can only be a DataFrame when the return
            # type is a struct. This works around that by making the input a
            # frame. See SPARK-27240.
            pdf = pd.concat(series, axis=1)
            pdf.columns = columns
            return pdf

        def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
            return func(first_series(pdf)).to_frame()

        return_schema = StructType(
            [StructField(SPARK_DEFAULT_SERIES_NAME, field.spark_type)])
        output_func = GroupBy._make_pandas_df_builder_func(psdf,
                                                           apply_func,
                                                           return_schema,
                                                           retain_index=False)

        @pandas_udf(returnType=field.spark_type)  # type: ignore
        def pudf(*series: pd.Series) -> pd.Series:
            return first_series(output_func(pandas_concat(*series)))

        return self._psser._with_new_scol(
            scol=pudf(*psdf._internal.spark_columns).alias(field.name),
            field=field)
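A hedged usage sketch of the inference path above: without a return-type annotation, the function is first run on a head() sample of `compute.shortcut_limit` + 1 rows (a real pandas-on-Spark option) to derive the output field. The public Series accessor shown here presumably routes through this helper:

import pyspark.pandas as ps

print(ps.get_option("compute.shortcut_limit"))  # 1000 by default
psser = ps.Series([1, 2, 3])
# No type hint: the return field is inferred from a sampled pandas transform.
print(psser.pandas_on_spark.transform_batch(lambda s: s + 1))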
Example #6
    def codes(self) -> Index:
        """
        The category codes of this categorical.

        Codes are an Index of integers which are the positions of the actual
        values in the categories Index.

        There is no setter; use the other categorical methods and the normal item
        setter to change values in the categorical.

        Returns
        -------
        Index
            A non-writable view of the `codes` Index.

        Examples
        --------
        >>> idx = ps.CategoricalIndex(list("abbccc"))
        >>> idx  # doctest: +NORMALIZE_WHITESPACE
        CategoricalIndex(['a', 'b', 'b', 'c', 'c', 'c'],
                         categories=['a', 'b', 'c'], ordered=False, dtype='category')

        >>> idx.codes
        Int64Index([0, 1, 1, 2, 2, 2], dtype='int64')
        """
        return self._with_new_scol(
            self.spark.column,
            field=InternalField.from_struct_field(
                StructField(
                    name=self._internal.index_spark_column_names[0],
                    dataType=self.spark.data_type,
                    nullable=self.spark.nullable,
                )),
        ).rename(None)
Example #7
def _as_bool_type(index_ops: T_IndexOps, dtype: Union[str, type, Dtype]) -> T_IndexOps:
    """Cast `index_ops` to BooleanType Spark type, given `dtype`."""
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(BooleanType())
    else:
        scol = F.when(index_ops.spark.column.isNull(), F.lit(False)).otherwise(
            index_ops.spark.column.cast(BooleanType()))
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
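The null-to-False pattern above can be reproduced with plain PySpark column expressions; a standalone sketch assuming a local SparkSession:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import BooleanType

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([(1,), (0,), (None,)], "v int")
# Same shape as the non-extension branch of _as_bool_type: nulls become False.
scol = F.when(F.col("v").isNull(), F.lit(False)).otherwise(F.col("v").cast(BooleanType()))
sdf.select(scol.alias("v")).show()  # true, false, false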
Example #8
    def astype(self, index_ops: IndexOpsLike, dtype: Union[str, type, Dtype]) -> IndexOpsLike:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)
        elif isinstance(spark_type, NumericType):
            from pyspark.pandas.internal import InternalField

            scol = self._cast_spark_column_timestamp_to_long(index_ops.spark.column).cast(
                spark_type
            )
            return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
        else:
            return super(DatetimeNTZOps, self).astype(index_ops, dtype)
Example #9
def _as_other_type(index_ops: IndexOpsLike, dtype: Union[str, type, Dtype],
                   spark_type: DataType) -> IndexOpsLike:
    """Cast `index_ops` to a `dtype` (`spark_type`) that needs no pre-processing.

    Destination types that need pre-processing: CategoricalDtype, BooleanType, and StringType.
    """
    from pyspark.pandas.internal import InternalField

    need_pre_process = (isinstance(dtype, CategoricalDtype)
                        or isinstance(spark_type, BooleanType)
                        or isinstance(spark_type, StringType))
    assert not need_pre_process, "Pre-processing is needed before the type casting."

    scol = index_ops.spark.column.cast(spark_type)
    return index_ops._with_new_scol(scol, field=InternalField(dtype=dtype))
Example #10
    def __init__(
        self, dtypes: List[Dtype], spark_types: List[types.DataType], names: List[Optional[str]]
    ):
        from pyspark.pandas.internal import InternalField
        from pyspark.pandas.utils import name_like_string

        self.fields = [
            InternalField(
                dtype=dtype,
                struct_field=types.StructField(
                    name=(name_like_string(name) if name is not None else ("c%s" % i)),
                    dataType=spark_type,
                ),
            )
            for i, (name, dtype, spark_type) in enumerate(zip(names, dtypes, spark_types))
        ]
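The only subtle part above is the naming fallback: unnamed columns get positional names "c0", "c1", and so on. A one-line sketch of that rule:

# Unnamed columns fall back to positional names "c0", "c1", ...
names = ["a", None, "b"]
print([name if name is not None else "c%s" % i for i, name in enumerate(names)])
# ['a', 'c1', 'b']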
Example #11
def _as_string_type(index_ops: T_IndexOps,
                    dtype: Union[str, type, Dtype],
                    *,
                    null_str: str = str(None)) -> T_IndexOps:
    """Cast `index_ops` to StringType Spark type, given `dtype` and `null_str`,
    representing null Spark column.
    """
    from pyspark.pandas.internal import InternalField

    if isinstance(dtype, extension_dtypes):
        scol = index_ops.spark.column.cast(StringType())
    else:
        casted = index_ops.spark.column.cast(StringType())
        scol = F.when(index_ops.spark.column.isNull(),
                      null_str).otherwise(casted)
    return index_ops._with_new_scol(
        scol.alias(index_ops._internal.data_spark_column_names[0]),
        field=InternalField(dtype=dtype),
    )
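For contrast with the `null_str` branch, the extension-dtype branch above keeps nulls as missing values instead of rendering them as text. The same distinction exists in plain pandas with its StringDtype (a sketch, assuming "string" is one of the extension dtypes):

import pandas as pd

# A plain str cast renders None as the text "None"; the "string" extension
# dtype keeps it as a missing value (<NA>) -- mirroring the two branches above.
print(pd.Series(["a", None]).astype(str).tolist())       # ['a', 'None']
print(pd.Series(["a", None]).astype("string").tolist())  # ['a', <NA>]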
Example #12
    def astype(self, index_ops: Union["Index", "Series"],
               dtype: Union[str, type, Dtype]) -> Union["Index", "Series"]:
        dtype, spark_type = pandas_on_spark_type(dtype)

        if isinstance(dtype, CategoricalDtype):
            return _as_categorical_type(index_ops, dtype, spark_type)

        if isinstance(spark_type, BooleanType):
            if isinstance(dtype, extension_dtypes):
                scol = index_ops.spark.column.cast(spark_type)
            else:
                scol = F.when(index_ops.spark.column.isNull(),
                              F.lit(False)).otherwise(
                                  F.length(index_ops.spark.column) > 0)
            return index_ops._with_new_scol(
                scol.alias(index_ops._internal.data_spark_column_names[0]),
                field=InternalField(dtype=dtype),
            )
        elif isinstance(spark_type, StringType):
            return _as_string_type(index_ops, dtype)
        else:
            return _as_other_type(index_ops, dtype, spark_type)
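Casting strings to bool via `F.length(...) > 0` mirrors Python's truthiness of strings, which pandas also follows; a quick check (not part of the example):

import pandas as pd

# Non-empty strings are truthy, the empty string is falsy -- the rule the
# F.length(...) > 0 expression above encodes, with nulls mapped to False.
print(pd.Series(["spark", ""]).astype(bool).tolist())  # [True, False]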
Example #13
    def _downsample(self, f: str) -> DataFrame:
        """
        Downsample the defined function.

        Parameters
        ----------
        how : string / mapped function
        **kwargs : kw args passed to how function
        """

        # a simple example to illustrate the computation:
        #   dates = [
        #         datetime.datetime(2012, 1, 2),
        #         datetime.datetime(2012, 5, 3),
        #         datetime.datetime(2022, 5, 3),
        #   ]
        #   index = pd.DatetimeIndex(dates)
        #   pdf = pd.DataFrame(np.array([1,2,3]), index=index, columns=['A'])
        #   pdf.resample('3Y').max()
        #                 A
        #   2012-12-31  2.0
        #   2015-12-31  NaN
        #   2018-12-31  NaN
        #   2021-12-31  NaN
        #   2024-12-31  3.0
        #
        # in this case:
        # 1. obtain an origin point to bin all timestamps; from the minimum
        #    timestamp (2012-01-02) we get 2009-12-31;
        # 2. the default intervals for 'Y' are right-closed, so the intervals are:
        #    (2009-12-31, 2012-12-31], (2012-12-31, 2015-12-31], (2015-12-31, 2018-12-31], ...
        # 3. bin all timestamps; for example, 2022-05-03 belongs to the interval
        #    (2021-12-31, 2024-12-31], and since the default label is 'right', it is
        #    labeled with the right edge 2024-12-31;
        # 4. some intervals may be too large for this downsampling, so we need to pad
        #    the dataframe to avoid missing results, e.g. 2015-12-31, 2018-12-31 and
        #    2021-12-31;
        # 5. union the binned dataframe and the padded dataframe, and apply the
        #    aggregation 'max' to get the final results.

        # one Spark action to obtain the range; in the future we may cache it in the index.
        ts_min, ts_max = (self._psdf._internal.spark_frame.select(
            F.min(self._resamplekey_scol),
            F.max(self._resamplekey_scol)).toPandas().iloc[0])

        # the logic pandas uses to pick an origin point for the binning is too complex
        # to reimplement here, so just apply pandas' resample on a length-1 series to get it.
        ts_origin = (
            pd.Series([0], index=[ts_min])
            .resample(rule=self._offset.freqstr, closed=self._closed, label="left")
            .sum()
            .index[0]
        )
        assert ts_origin <= ts_min

        bin_col_name = "__tmp_resample_bin_col__"
        bin_col_label = verify_temp_column_name(self._psdf, bin_col_name)
        bin_col_field = InternalField(
            dtype=np.dtype("datetime64[ns]"),
            struct_field=StructField(bin_col_name, TimestampType(), True),
        )
        bin_scol = self._bin_time_stamp(
            ts_origin,
            self._resamplekey_scol,
        )

        agg_columns = [
            psser for psser in self._agg_columns
            if (isinstance(psser.spark.data_type, NumericType))
        ]
        assert len(agg_columns) > 0

        # on the binning side, label the timestamps according to the origin and the freq (rule)
        bin_sdf = self._psdf._internal.spark_frame.select(
            F.col(SPARK_DEFAULT_INDEX_NAME),
            bin_scol.alias(bin_col_name),
            *[psser.spark.column for psser in agg_columns],
        )

        # on the padding side, insert the necessary points;
        # again, directly apply pandas' resample on a length-2 series to obtain the indices
        pad_sdf = (
            ps.from_pandas(
                pd.Series([0, 0], index=[ts_min, ts_max])
                .resample(rule=self._offset.freqstr, closed=self._closed, label=self._label)
                .sum()
                .index
            )
            ._internal.spark_frame.select(F.col(SPARK_DEFAULT_INDEX_NAME).alias(bin_col_name))
            .where((ts_min <= F.col(bin_col_name)) & (F.col(bin_col_name) <= ts_max))
        )

        # union the above two spark dataframes.
        sdf = bin_sdf.unionByName(
            pad_sdf,
            allowMissingColumns=True).where(~F.isnull(F.col(bin_col_name)))

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)],
            data_spark_columns=[F.col(bin_col_name)] + [
                scol_for(sdf, psser._internal.data_spark_column_names[0])
                for psser in agg_columns
            ],
            column_labels=[bin_col_label] +
            [psser._column_label for psser in agg_columns],
            data_fields=[bin_col_field] + [
                psser._internal.data_fields[0].copy(nullable=True)
                for psser in agg_columns
            ],
            column_label_names=self._psdf._internal.column_label_names,
        )
        psdf: DataFrame = DataFrame(internal)

        groupby = psdf.groupby(psdf._psser_for(bin_col_label), dropna=False)
        downsampled = getattr(groupby, f)()
        downsampled.index.name = None

        return downsampled
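The origin trick in step 1 can be checked in isolation with plain pandas, using the same values as the comment above (a sketch, not part of the example):

import pandas as pd

# Reproduce the origin computation: resample a length-1 series and take the
# left label of its (right-closed) first interval.
ts_min = pd.Timestamp("2012-01-02")
origin = (
    pd.Series([0], index=[ts_min])
    .resample(rule="3Y", closed="right", label="left")
    .sum()
    .index[0]
)
print(origin)  # 2009-12-31, matching step 1 above
assert origin <= ts_min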
Example #14
def infer_return_type(
        f: Callable
) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.Series[int]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> ps.DataFrame[np.float]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> 'int':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.Series[int]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,DoubleType,true)))

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category,struct_field=StructField(index,LongType,true))]

    >>> def func() -> ps.DataFrame[
    ...         (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    spec = getfullargspec(f)
    tpe = spec.annotations.get("return", None)
    if isinstance(tpe, str):
        # This type hint can happen when given hints are string to avoid forward reference.
        tpe = resolve_string_type_hint(tpe)

    if hasattr(tpe, "__origin__") and (tpe.__origin__ == ps.DataFrame
                                       or tpe.__origin__ == ps.Series):
        # When the Python version is lower than 3.7, unwrap it to Tuple/SeriesType type hints.
        tpe = tpe.__args__[0]

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
    # Check if the name is Tuple.
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    if name == "Tuple":
        tuple_type = tpe
        if hasattr(tuple_type, "__tuple_params__"):
            # Python 3.5.0 to 3.5.2 has '__tuple_params__' instead.
            # See https://github.com/python/cpython/blob/v3.5.2/Lib/typing.py
            parameters = getattr(tuple_type, "__tuple_params__")
        else:
            parameters = getattr(tuple_type, "__args__")

        index_parameters = [
            p for p in parameters
            if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(
                    index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name if index_name is not None else
                            SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    ))
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        data_dtypes, data_spark_types = zip(
            *(pandas_on_spark_type(p.tpe) if isclass(p)
              and issubclass(p, NameTypeHolder) else pandas_on_spark_type(p)
              for p in data_parameters))
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
                zip(data_names, data_dtypes, data_spark_types)):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name)
                        if data_name is not None else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                ))

        return DataFrameType(index_fields=index_fields,
                             data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)
Example #15
def combine_frames(
    this: "DataFrame",
    *args: DataFrameOrSeries,
    how: str = "full",
    preserve_order_column: bool = False
) -> "DataFrame":
    """
    This method combines `this` DataFrame with a different `that` DataFrame or
    Series from a different DataFrame.

    It returns a DataFrame that has the prefixes `this_` and `that_` to distinguish
    the column names from the two DataFrames.

    It internally performs a join operation, which can be expensive in general.
    So, if the `compute.ops_on_diff_frames` option is False,
    this method throws an exception.
    """
    from pyspark.pandas.config import get_option
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.internal import (
        InternalField,
        InternalFrame,
        HIDDEN_COLUMNS,
        NATURAL_ORDER_COLUMN_NAME,
        SPARK_INDEX_NAME_FORMAT,
    )
    from pyspark.pandas.series import Series

    if all(isinstance(arg, Series) for arg in args):
        assert all(
            same_anchor(arg, args[0]) for arg in args
        ), "Currently only one different DataFrame (from given Series) is supported"
        assert not same_anchor(this, args[0]), "We don't need to combine. All Series are in this."
        that = args[0]._psdf[list(args)]
    elif len(args) == 1 and isinstance(args[0], DataFrame):
        assert isinstance(args[0], DataFrame)
        assert not same_anchor(
            this, args[0]
        ), "We don't need to combine. `this` and `that` are same."
        that = args[0]
    else:
        raise AssertionError("args should be single DataFrame or " "single/multiple Series")

    if get_option("compute.ops_on_diff_frames"):

        def resolve(internal: InternalFrame, side: str) -> InternalFrame:
            rename = lambda col: "__{}_{}".format(side, col)
            internal = internal.resolved_copy
            sdf = internal.spark_frame
            sdf = internal.spark_frame.select(
                *[
                    scol_for(sdf, col).alias(rename(col))
                    for col in sdf.columns
                    if col not in HIDDEN_COLUMNS
                ],
                *HIDDEN_COLUMNS
            )
            return internal.copy(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.index_spark_column_names
                ],
                index_fields=[
                    field.copy(name=rename(field.name)) for field in internal.index_fields
                ],
                data_spark_columns=[
                    scol_for(sdf, rename(col)) for col in internal.data_spark_column_names
                ],
                data_fields=[field.copy(name=rename(field.name)) for field in internal.data_fields],
            )

        this_internal = resolve(this._internal, "this")
        that_internal = resolve(that._internal, "that")

        this_index_map = list(
            zip(
                this_internal.index_spark_column_names,
                this_internal.index_names,
                this_internal.index_fields,
            )
        )
        that_index_map = list(
            zip(
                that_internal.index_spark_column_names,
                that_internal.index_names,
                that_internal.index_fields,
            )
        )
        assert len(this_index_map) == len(that_index_map)

        join_scols = []
        merged_index_scols = []

        # Note that the order of each element in index_map is guaranteed according to the index
        # level.
        this_and_that_index_map = list(zip(this_index_map, that_index_map))

        this_sdf = this_internal.spark_frame.alias("this")
        that_sdf = that_internal.spark_frame.alias("that")

        # If the same named index is found, that's used.
        index_column_names = []
        index_use_extension_dtypes = []
        for (
            i,
            ((this_column, this_name, this_field), (that_column, that_name, that_field)),
        ) in enumerate(this_and_that_index_map):
            if this_name == that_name:
                # We should merge the Spark columns into one
                # to mimic pandas' behavior.
                this_scol = scol_for(this_sdf, this_column)
                that_scol = scol_for(that_sdf, that_column)
                join_scol = this_scol == that_scol
                join_scols.append(join_scol)

                column_name = SPARK_INDEX_NAME_FORMAT(i)
                index_column_names.append(column_name)
                index_use_extension_dtypes.append(
                    any(field.is_extension_dtype for field in [this_field, that_field])
                )
                merged_index_scols.append(
                    F.when(this_scol.isNotNull(), this_scol).otherwise(that_scol).alias(column_name)
                )
            else:
                raise ValueError("Index names must be exactly matched currently.")

        assert len(join_scols) > 0, "cannot join with no overlapping index names"

        joined_df = this_sdf.join(that_sdf, on=join_scols, how=how)

        if preserve_order_column:
            order_column = [scol_for(this_sdf, NATURAL_ORDER_COLUMN_NAME)]
        else:
            order_column = []

        joined_df = joined_df.select(
            *merged_index_scols,
            *(
                scol_for(this_sdf, this_internal.spark_column_name_for(label))
                for label in this_internal.column_labels
            ),
            *(
                scol_for(that_sdf, that_internal.spark_column_name_for(label))
                for label in that_internal.column_labels
            ),
            *order_column
        )

        index_spark_columns = [scol_for(joined_df, col) for col in index_column_names]

        index_columns = set(index_column_names)
        new_data_columns = [
            col
            for col in joined_df.columns
            if col not in index_columns and col != NATURAL_ORDER_COLUMN_NAME
        ]

        schema = joined_df.select(*index_spark_columns, *new_data_columns).schema

        index_fields = [
            InternalField.from_struct_field(struct_field, use_extension_dtypes=use_extension_dtypes)
            for struct_field, use_extension_dtypes in zip(
                schema.fields[: len(index_spark_columns)], index_use_extension_dtypes
            )
        ]
        data_fields = [
            InternalField.from_struct_field(
                struct_field, use_extension_dtypes=field.is_extension_dtype
            )
            for struct_field, field in zip(
                schema.fields[len(index_spark_columns) :],
                this_internal.data_fields + that_internal.data_fields,
            )
        ]

        level = max(this_internal.column_labels_level, that_internal.column_labels_level)

        def fill_label(label: Optional[Tuple]) -> List:
            if label is None:
                return ([""] * (level - 1)) + [None]
            else:
                return ([""] * (level - len(label))) + list(label)

        column_labels = [
            tuple(["this"] + fill_label(label)) for label in this_internal.column_labels
        ] + [tuple(["that"] + fill_label(label)) for label in that_internal.column_labels]
        column_label_names = (
            cast(List[Optional[Tuple]], [None]) * (1 + level - this_internal.column_labels_level)
        ) + this_internal.column_label_names
        return DataFrame(
            InternalFrame(
                spark_frame=joined_df,
                index_spark_columns=index_spark_columns,
                index_names=this_internal.index_names,
                index_fields=index_fields,
                column_labels=column_labels,
                data_spark_columns=[scol_for(joined_df, col) for col in new_data_columns],
                data_fields=data_fields,
                column_label_names=column_label_names,
            )
        )
    else:
        raise ValueError(ERROR_MESSAGE_CANNOT_COMBINE)
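combine_frames is internal, but the join it performs is what binary operations across two different DataFrames trigger once the option is enabled. A hedged end-to-end sketch (presumably routed through combine_frames):

import pyspark.pandas as ps

# Operations across different DataFrames require an explicit opt-in, since
# they are backed by a potentially expensive join.
ps.set_option("compute.ops_on_diff_frames", True)
s1 = ps.Series([1, 2, 3])
s2 = ps.Series([10, 20, 30])
print((s1 + s2).to_list())  # [11, 22, 33]
ps.reset_option("compute.ops_on_diff_frames")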
Example #16
def infer_return_type(
        f: Callable
) -> Union[SeriesType, DataFrameType, ScalarType, UnknownType]:
    """
    Infer the return type from the return type annotation of the given function.

    The returned type class indicates both dtypes (a pandas only dtype object
    or a numpy dtype object) and its corresponding Spark DataType.

    >>> def func() -> int:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.Series[int]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[np.float, str]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> ps.DataFrame[np.float]:
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> 'int':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.Series[int]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    dtype('int64')
    >>> inferred.spark_type
    LongType()

    >>> def func() -> 'ps.DataFrame[np.float, str]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('<U')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

    >>> def func() -> 'ps.DataFrame[np.float]':
    ...    pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64')]
    >>> inferred.spark_type
    StructType([StructField('c0', DoubleType(), True)])

    >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('float64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64')]
    >>> inferred.spark_type
    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])

    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
    >>> def func() -> ps.DataFrame[pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

    >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type
    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

    >>> def func() -> ps.Series[pdf.b.dtype]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtype
    CategoricalDtype(categories=[3, 4, 5], ordered=False)
    >>> inferred.spark_type
    LongType()

    >>> def func() -> ps.DataFrame[int, [int, int]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
    ...     [("id", int), ("A", int)]]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('int64'), dtype('int64')]
    >>> inferred.spark_type.simpleString()
    'struct<index:bigint,id:bigint,A:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=category, struct_field=StructField('index', LongType(), True))]

    >>> def func() -> ps.DataFrame[
    ...         (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
    ...     pass
    >>> inferred = infer_return_type(func)
    >>> inferred.dtypes
    [dtype('int64'), dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
    >>> inferred.spark_type.simpleString()
    'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
    >>> inferred.index_fields
    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]
    """
    # We should re-import to make sure the class 'SeriesType' is not treated as a class
    # within this module locally. See Series.__class_getitem__ which imports this class
    # canonically.
    from pyspark.pandas.internal import InternalField, SPARK_INDEX_NAME_FORMAT
    from pyspark.pandas.typedef import SeriesType, NameTypeHolder, IndexNameTypeHolder
    from pyspark.pandas.utils import name_like_string

    tpe = get_type_hints(f).get("return", None)

    if tpe is None:
        raise ValueError("A return value is required for the input function")

    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
        tpe = tpe.__args__[0]
        if issubclass(tpe, NameTypeHolder):
            tpe = tpe.tpe
        dtype, spark_type = pandas_on_spark_type(tpe)
        return SeriesType(dtype, spark_type)

    # Note that, DataFrame type hints will create a Tuple.
    # Tuple has _name but other types have __name__
    name = getattr(tpe, "_name", getattr(tpe, "__name__", None))
    # Check if the name is Tuple.
    if name == "Tuple":
        tuple_type = tpe
        parameters = getattr(tuple_type, "__args__")

        index_parameters = [
            p for p in parameters
            if isclass(p) and issubclass(p, IndexNameTypeHolder)
        ]
        data_parameters = [p for p in parameters if p not in index_parameters]
        assert len(data_parameters) > 0, "Type hints for data must not be empty."

        index_fields = []
        if len(index_parameters) >= 1:
            for level, index_parameter in enumerate(index_parameters):
                index_name = index_parameter.name
                index_dtype, index_spark_type = pandas_on_spark_type(
                    index_parameter.tpe)
                index_fields.append(
                    InternalField(
                        dtype=index_dtype,
                        struct_field=types.StructField(
                            name=index_name if index_name is not None else
                            SPARK_INDEX_NAME_FORMAT(level),
                            dataType=index_spark_type,
                        ),
                    ))
        else:
            # No type hint for index.
            assert len(index_parameters) == 0

        data_dtypes, data_spark_types = zip(
            *(pandas_on_spark_type(p.tpe) if isclass(p)
              and issubclass(p, NameTypeHolder) else pandas_on_spark_type(p)
              for p in data_parameters))
        data_names = [
            p.name if isclass(p) and issubclass(p, NameTypeHolder) else None
            for p in data_parameters
        ]
        data_fields = []
        for i, (data_name, data_dtype, data_spark_type) in enumerate(
                zip(data_names, data_dtypes, data_spark_types)):
            data_fields.append(
                InternalField(
                    dtype=data_dtype,
                    struct_field=types.StructField(
                        name=name_like_string(data_name)
                        if data_name is not None else ("c%s" % i),
                        dataType=data_spark_type,
                    ),
                ))

        return DataFrameType(index_fields=index_fields,
                             data_fields=data_fields)

    tpes = pandas_on_spark_type(tpe)
    if tpes is None:
        return UnknownType(tpe)
    else:
        return ScalarType(*tpes)
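Note the difference from Example #14: `get_type_hints` resolves string ("forward reference") annotations itself, so the separate resolve_string_type_hint step is no longer needed. A tiny standalone check:

from typing import get_type_hints

def func() -> "int":
    pass

# Raw __annotations__ keeps the string; get_type_hints evaluates it.
print(func.__annotations__)  # {'return': 'int'}
print(get_type_hints(func))  # {'return': <class 'int'>}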
Example #17
    def attach_id_column(self, id_type: str, column: Name) -> "DataFrame":
        """
        Attach a column to be used as an identifier of rows, similar to the default index.

        See also `Default Index type
        <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

        Parameters
        ----------
        id_type : string
            The id type.

            - 'sequence' : a sequence that increases one by one.

              .. note:: this uses Spark's Window without specifying a partition specification.
                  This moves all the data into a single partition on a single machine and
                  could cause serious performance degradation.
                  Avoid this method with very large datasets.

            - 'distributed-sequence' : a sequence that increases one by one,
              by a group-by and group-map approach in a distributed manner.
            - 'distributed' : a monotonically increasing sequence simply by using PySpark's
              monotonically_increasing_id function in a fully distributed manner.

        column : string or tuple of string
            The column name.

        Returns
        -------
        DataFrame
            The DataFrame with the attached column.

        Examples
        --------
        >>> df = ps.DataFrame({"x": ['a', 'b', 'c']})
        >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id")
           x  id
        0  a   0
        1  b   1
        2  c   2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0)
           x  0
        0  a  0
        1  b  1
        2  c  2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0)
        ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
           x  0.0
        0  a  ...
        1  b  ...
        2  c  ...

        For multi-index columns:

        >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']})
        >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
           x id-x
           y id-y
        0  a    0
        1  b    1
        2  c    2

        >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
           x   0
           y 1.0
        0  a   0
        1  b   1
        2  c   2
        """
        from pyspark.pandas.frame import DataFrame

        if id_type == "sequence":
            attach_func = InternalFrame.attach_sequence_column
        elif id_type == "distributed-sequence":
            attach_func = InternalFrame.attach_distributed_sequence_column
        elif id_type == "distributed":
            attach_func = InternalFrame.attach_distributed_column
        else:
            raise ValueError(
                "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
            )

        assert is_name_like_value(column, allow_none=False), column
        if not is_name_like_tuple(column):
            column = (column,)

        internal = self._psdf._internal

        if len(column) != internal.column_labels_level:
            raise ValueError(
                "The given column `{}` must be the same length as the existing columns.".format(
                    column
                )
            )
        elif column in internal.column_labels:
            raise ValueError(
                "The given column `{}` already exists.".format(name_like_string(column))
            )

        # Make sure the underlying Spark column names are the form of
        # `name_like_string(column_label)`.
        sdf = internal.spark_frame.select(
            [
                scol.alias(SPARK_INDEX_NAME_FORMAT(i))
                for i, scol in enumerate(internal.index_spark_columns)
            ]
            + [
                scol.alias(name_like_string(label))
                for scol, label in zip(internal.data_spark_columns, internal.column_labels)
            ]
        )
        sdf = attach_func(sdf, name_like_string(column))

        return DataFrame(
            InternalFrame(
                spark_frame=sdf,
                index_spark_columns=[
                    scol_for(sdf, SPARK_INDEX_NAME_FORMAT(i)) for i in range(internal.index_level)
                ],
                index_names=internal.index_names,
                index_fields=internal.index_fields,
                column_labels=internal.column_labels + [column],
                data_spark_columns=(
                    [scol_for(sdf, name_like_string(label)) for label in internal.column_labels]
                    + [scol_for(sdf, name_like_string(column))]
                ),
                data_fields=internal.data_fields
                + [
                    InternalField.from_struct_field(
                        StructField(name_like_string(column), LongType(), nullable=False)
                    )
                ],
                column_label_names=internal.column_label_names,
            ).resolved_copy
        )
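For the 'distributed' id type, the underlying primitive is PySpark's monotonically_increasing_id, as the docstring notes. A standalone sketch of what gets attached (the values depend on the partition layout, so they are not consecutive in general):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
# Monotonically increasing, but not necessarily consecutive, 64-bit ids.
spark.range(3).withColumn("row_id", F.monotonically_increasing_id()).show()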
Example #18
    def transform_batch(
        self, func: Callable[..., Union[pd.DataFrame, pd.Series]], *args: Any, **kwargs: Any
    ) -> DataFrameOrSeries:
        """
        Transform chunks with a function that takes a pandas DataFrame and outputs a pandas
        DataFrame. The pandas DataFrame given to the function is a batch used internally.
        The lengths of each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
            internally splits the input frame into multiple batches and calls `func` on
            each batch multiple times. Therefore, operations such as global aggregations
            are impossible. See the example below.

            >>> # This case does not return the length of whole frame but of the batch internally
            ... # used.
            ... def length(pdf) -> ps.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a NumPy compound type style
            as below:

            >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[
            ...         (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
            ...     return x + 1

        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame or Series

        See Also
        --------
        DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
        Series.pandas_on_spark.transform_batch: Transform the data as pandas chunks.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ps.DataFrame[int, [int, int]]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ps.DataFrame[("index", int), [('A', int), ('B', int)]]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)  # doctest: +NORMALIZE_WHITESPACE
               A  B
        index
        0      2  3
        1      4  5
        2      6  7

        >>> def plus_one_func(pdf) -> ps.Series[int]:
        ...     return pdf.B + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int64

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).pandas_on_spark.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that you should not transform the index. The index information will not change.

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import first_series
        from pyspark import pandas as ps

        assert callable(func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        should_retain_index = should_infer_schema
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
            return func(pdf).to_frame()

        def pandas_series_func(
            f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType
        ) -> "UserDefinedFunctionLike":
            ff = f

            @pandas_udf(returnType=return_type)  # type: ignore[call-overload]
            def udf(pdf: pd.DataFrame) -> pd.Series:
                return first_series(ff(pdf))

            return udf

        if should_infer_schema:
            # Here we execute the function with the first `limit + 1` rows to infer the
            # return type. If the input has at most `limit` rows, the pandas result is
            # used directly as a shortcut.
            log_advice(
                "If the type hints are not specified for `transform_batch`, "
                "it is expensive to infer the data type internally."
            )
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self._psdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed)
                )
            if len(transformed) != len(pdf):
                raise ValueError("transform_batch cannot produce aggregated results")
            psdf_or_psser = ps.from_pandas(transformed)

            if isinstance(psdf_or_psser, ps.Series):
                psser = cast(ps.Series, psdf_or_psser)

                field = psser._internal.data_fields[0].normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_series_func(output_func, return_type=field.spark_type)
                columns = self._psdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._psdf._internal.copy(
                    column_labels=psser._internal.column_labels,
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=psser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                psdf = cast(DataFrame, psdf_or_psser)
                if len(pdf) <= limit:
                    # Only take the shortcut when the function returns a frame, to avoid
                    # mixing operations on different DataFrames in the Series case.
                    return psdf

                index_fields = [
                    field.normalize_spark_type() for field in psdf._internal.index_fields
                ]
                data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields]

                return_schema = StructType(
                    [field.struct_field for field in index_fields + data_fields]
                )

                self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True  # type: ignore[arg-type]
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(  # type: ignore[call-overload]
                    output_func, returnType=return_schema
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                return DataFrame(
                    psdf._internal.with_new_sdf(
                        spark_frame=sdf, index_fields=index_fields, data_fields=data_fields
                    )
                )
        else:
            return_type = infer_return_type(original_func)
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            if is_return_series:
                field = InternalField(
                    dtype=cast(SeriesType, return_type).dtype,
                    struct_field=StructField(
                        name=SPARK_DEFAULT_SERIES_NAME,
                        dataType=cast(SeriesType, return_type).spark_type,
                    ),
                ).normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_series_func(output_func, return_type=field.spark_type)
                columns = self._psdf._internal.spark_columns
                internal = self._psdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                index_fields = cast(DataFrameType, return_type).index_fields
                index_fields = [index_field.normalize_spark_type() for index_field in index_fields]
                data_fields = [
                    field.normalize_spark_type()
                    for field in cast(DataFrameType, return_type).data_fields
                ]
                normalized_fields = index_fields + data_fields
                return_schema = StructType([field.struct_field for field in normalized_fields])
                should_retain_index = len(index_fields) > 0

                self_applied = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=should_retain_index  # type: ignore[arg-type]
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(  # type: ignore[call-overload]
                    output_func, returnType=return_schema
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                index_spark_columns = None
                index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None

                if should_retain_index:
                    index_spark_columns = [
                        scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
                    ]

                    if not any(
                        [
                            SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name)
                            for index_field in index_fields
                        ]
                    ):
                        index_names = [
                            (index_field.struct_field.name,) for index_field in index_fields
                        ]
                internal = InternalFrame(
                    spark_frame=sdf,
                    index_names=index_names,
                    index_spark_columns=index_spark_columns,
                    index_fields=index_fields,
                    data_fields=data_fields,
                )
                return DataFrame(internal)
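
The mechanics above are worth seeing in isolation: all input columns are packed
into a single struct column, a pandas UDF runs over each batch of it, and the
struct result is unpacked with ``selectExpr``. A minimal, self-contained sketch
of that round trip in plain PySpark (the session, data, and column names are
illustrative, not from the source):

    import pandas as pd
    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F
    from pyspark.sql.functions import pandas_udf

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1, 2), (3, 4), (5, 6)], ["A", "B"])

    @pandas_udf("A long, B long")
    def plus_one(batch: pd.DataFrame) -> pd.DataFrame:
        # `batch` is one internal chunk, never the whole frame.
        return batch + 1

    # Pack the columns into a struct, apply the UDF, then unpack the fields.
    applied = plus_one(F.struct(*sdf.columns)).alias("__temp_struct__")
    sdf.select(applied).selectExpr("__temp_struct__.*").show()
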
Example #19
0
    def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
        if isinstance(right, (list, tuple)):
            from pyspark.pandas.series import first_series, scol_for
            from pyspark.pandas.frame import DataFrame
            from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField

            len_right = len(right)
            if len(left) != len_right:
                raise ValueError("Lengths must be equal")

            sdf = left._internal.spark_frame
            structed_scol = F.struct(
                sdf[NATURAL_ORDER_COLUMN_NAME],
                *left._internal.index_spark_columns,
                left.spark.column,
            )
            # The size of the list is expected to be small.
            collected_structed_scol = F.collect_list(structed_scol)
            # Sort the array by NATURAL_ORDER_COLUMN so that we can guarantee the order.
            collected_structed_scol = F.array_sort(collected_structed_scol)
            right_values_scol = F.array(*(F.lit(x) for x in right))
            index_scol_names = left._internal.index_spark_column_names
            scol_name = left._internal.spark_column_name_for(
                left._internal.column_labels[0])
            # Compare the values of left and right by using zip_with function.
            cond = F.zip_with(
                collected_structed_scol,
                right_values_scol,
                lambda x, y: F.struct(
                    *[
                        x[index_scol_name].alias(index_scol_name)
                        for index_scol_name in index_scol_names
                    ],
                    F.when(x[scol_name].isNull() | y.isNull(), False)
                    .otherwise(x[scol_name] == y)
                    .alias(scol_name),
                ),
            ).alias(scol_name)
            # 1. `sdf_new` here looks like the below (the first field of each set is Index):
            # +----------------------------------------------------------+
            # |0                                                         |
            # +----------------------------------------------------------+
            # |[{0, false}, {1, true}, {2, false}, {3, true}, {4, false}]|
            # +----------------------------------------------------------+
            sdf_new = sdf.select(cond)
            # 2. `sdf_new` after the explode looks like the below:
            # +----------+
            # |       col|
            # +----------+
            # |{0, false}|
            # | {1, true}|
            # |{2, false}|
            # | {3, true}|
            # |{4, false}|
            # +----------+
            sdf_new = sdf_new.select(F.explode(scol_name))
            # 3. Here, the final `sdf_new` looks like the below:
            # +-----------------+-----+
            # |__index_level_0__|    0|
            # +-----------------+-----+
            # |                0|false|
            # |                1| true|
            # |                2|false|
            # |                3| true|
            # |                4|false|
            # +-----------------+-----+
            sdf_new = sdf_new.select("col.*")

            index_spark_columns = [
                scol_for(sdf_new, index_scol_name)
                for index_scol_name in index_scol_names
            ]
            data_spark_columns = [scol_for(sdf_new, scol_name)]

            internal = left._internal.copy(
                spark_frame=sdf_new,
                index_spark_columns=index_spark_columns,
                data_spark_columns=data_spark_columns,
                index_fields=[
                    InternalField.from_struct_field(index_field)
                    for index_field in sdf_new.select(
                        index_spark_columns).schema.fields
                ],
                data_fields=[
                    InternalField.from_struct_field(
                        sdf_new.select(data_spark_columns).schema.fields[0])
                ],
            )
            return first_series(DataFrame(internal))
        else:
            from pyspark.pandas.base import column_op

            return column_op(Column.__eq__)(left, right)
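
The heart of the list comparison above is ``F.zip_with`` pairing the collected,
order-sorted column values with an array literal built from ``right``. A hedged,
stand-alone sketch of that pairing on plain array columns (data and names are
illustrative, not from the source):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([([1, 2, None],)], "xs: array<int>")
    right = [1, 0, 3]

    cond = F.zip_with(
        F.col("xs"),
        F.array(*(F.lit(v) for v in right)),
        # Null on either side compares as False, mirroring the code above.
        lambda x, y: F.when(x.isNull() | y.isNull(), False).otherwise(x == y),
    )
    sdf.select(cond.alias("eq")).show(truncate=False)  # [true, false, false]
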
Example #20
0
    def transform(self, func: Callable[[Column], Column]) -> IndexOpsLike:
        """
        Applies a function that takes and returns a Spark column. It allows natively
        applying Spark functions and column APIs to the Spark column used internally
        in the Series or Index. The output length of the Spark column should be the
        same as the input's.

        .. note:: It requires the input and output to have the same length; therefore,
            aggregate Spark functions such as ``count`` do not work.

        Parameters
        ----------
        func : function
            Function to use for transforming the data by using Spark columns.

        Returns
        -------
        Series or Index

        Raises
        ------
        ValueError : If the output from the function is not a Spark column.

        Examples
        --------
        >>> from pyspark.sql.functions import log
        >>> df = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
        >>> df
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.a.spark.transform(lambda c: log(c))
        0    0.000000
        1    0.693147
        2    1.098612
        Name: a, dtype: float64

        >>> df.index.spark.transform(lambda c: c + 10)
        Int64Index([10, 11, 12], dtype='int64')

        >>> df.a.spark.transform(lambda c: c + df.b.spark.column)
        0    5
        1    7
        2    9
        Name: a, dtype: int64
        """
        from pyspark.pandas import MultiIndex

        if isinstance(self._data, MultiIndex):
            raise NotImplementedError(
                "MultiIndex does not support spark.transform yet.")
        output = func(self._data.spark.column)
        if not isinstance(output, Column):
            raise ValueError("The output of the function [%s] should be of a "
                             "pyspark.sql.Column; however, got [%s]." %
                             (func, type(output)))
        # Trigger the resolution so it throws an exception if anything goes wrong
        # within the function, for example,
        # `df1.a.spark.transform(lambda _: F.col("non-existent"))`.
        field = InternalField.from_struct_field(
            self._data._internal.spark_frame.select(output).schema.fields[0])
        return self._data._with_new_scol(scol=output, field=field)
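
The resolution trick at the end deserves a tiny illustration: Spark analyzes a
``select`` eagerly, so an invalid expression fails while the schema is being
resolved, not in the middle of a job. A sketch under that assumption (session
and data are illustrative):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F
    from pyspark.sql.utils import AnalysisException

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(1,)], ["a"])

    try:
        # Analysis of the unresolved column fails here, before any job runs.
        sdf.select(F.col("non-existent"))
    except AnalysisException as exc:
        print("resolution failed early:", type(exc).__name__)
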
Example #21
0
    def insert(self, loc: int, item: Any) -> Index:
        """
        Make new MultiIndex inserting new item at location.

        Follows Python list.append semantics for negative values.

        .. versionchanged:: 3.4.0
           Raise IndexError when loc is out of bounds to follow pandas 1.4+ behavior.

        Parameters
        ----------
        loc : int
        item : object

        Returns
        -------
        new_index : MultiIndex

        Examples
        --------
        >>> psmidx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> psmidx.insert(3, ("h", "j"))  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z'),
                    ('h', 'j')],
                   )

        For negative values

        >>> psmidx.insert(-2, ("h", "j"))  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('h', 'j'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )
        """
        validate_index_loc(self, loc)
        loc = loc + len(self) if loc < 0 else loc

        index_name: List[Label] = [
            (name,) for name in self._internal.index_spark_column_names
        ]
        sdf_before = self.to_frame(name=index_name)[:loc]._to_spark()
        sdf_middle = Index([item]).to_frame(name=index_name)._to_spark()
        sdf_after = self.to_frame(name=index_name)[loc:]._to_spark()
        sdf = sdf_before.union(sdf_middle).union(sdf_after)

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col)
                for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_fields=[
                InternalField(field.dtype)
                for field in self._internal.index_fields
            ],
        )
        return DataFrame(internal).index
Example #22
0
    def insert(self, loc: int, item: Any) -> Index:
        """
        Make new MultiIndex inserting new item at location.

        Follows Python list.append semantics for negative values.

        Parameters
        ----------
        loc : int
        item : object

        Returns
        -------
        new_index : MultiIndex

        Examples
        --------
        >>> psmidx = ps.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> psmidx.insert(3, ("h", "j"))  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('b', 'y'),
                    ('c', 'z'),
                    ('h', 'j')],
                   )

        For negative values

        >>> psmidx.insert(-2, ("h", "j"))  # doctest: +SKIP
        MultiIndex([('a', 'x'),
                    ('h', 'j'),
                    ('b', 'y'),
                    ('c', 'z')],
                   )
        """
        length = len(self)
        if loc < 0:
            loc = loc + length
            if loc < 0:
                raise IndexError(
                    "index {} is out of bounds for axis 0 with size {}".format(
                        (loc - length), length))
        else:
            if loc > length:
                raise IndexError(
                    "index {} is out of bounds for axis 0 with size {}".format(
                        loc, length))

        index_name = [
            (name,) for name in self._internal.index_spark_column_names
        ]  # type: List[Label]
        sdf_before = self.to_frame(name=index_name)[:loc].to_spark()
        sdf_middle = Index([item]).to_frame(name=index_name).to_spark()
        sdf_after = self.to_frame(name=index_name)[loc:].to_spark()
        sdf = sdf_before.union(sdf_middle).union(sdf_after)

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col)
                for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_fields=[
                InternalField(field.dtype)
                for field in self._internal.index_fields
            ],
        )
        return DataFrame(internal).index
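
Both ``insert`` variants implement the same bounds logic before slicing, which
is easy to check in pure Python. A small sketch of just that normalization (the
function name is made up for illustration):

    def normalize_loc(loc: int, length: int) -> int:
        # Mirror list.insert-style negative indexing with explicit bounds checks.
        if loc < 0:
            loc += length
            if loc < 0:
                raise IndexError(
                    "index {} is out of bounds for axis 0 with size {}".format(
                        loc - length, length))
        elif loc > length:
            raise IndexError(
                "index {} is out of bounds for axis 0 with size {}".format(
                    loc, length))
        return loc

    assert normalize_loc(-2, 3) == 1  # matches the negative-loc doctest above
    assert normalize_loc(3, 3) == 3   # inserting at the end is allowed
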
Example #23
0
    def transform_batch(self, func, *args, **kwargs) -> Union["DataFrame", "Series"]:
        """
        Transform chunks with a function that takes a pandas DataFrame and outputs a
        pandas DataFrame. The pandas DataFrame given to the function is a batch used
        internally. The length of each input and output should be the same.

        See also `Transform and apply a function
        <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

        .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
            internally splits the input frame into multiple batches and calls `func` with each
            batch multiple times. Therefore, operations such as global aggregations are impossible.
            See the example below.

            >>> # This case does not return the length of the whole frame but of the
            ... # batch used internally.
            ... def length(pdf) -> ps.DataFrame[int]:
            ...     return pd.DataFrame([len(pdf)] * len(pdf))
            ...
            >>> df = ps.DataFrame({'A': range(1000)})
            >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
                c0
            0   83
            1   83
            2   83
            ...

        .. note:: this API executes the function once to infer the type, which is
            potentially expensive, for instance, when the dataset is created after
            aggregations or sorting.

            To avoid this, specify the return type in ``func``, for instance, as below:

            >>> def plus_one(x) -> ps.DataFrame[float, float]:
            ...     return x + 1

            If the return type is specified, the output column names become
            `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
            DataFrame in ``func``.

            To specify the column names, you can assign them in a pandas-friendly style as below:

            >>> def plus_one(x) -> ps.DataFrame['a': float, 'b': float]:
            ...     return x + 1

            >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
            >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]:
            ...     return x + 1

            When the given function returns a DataFrame and has its return type annotated, the
            original index of the DataFrame will be lost and a default index will be attached
            to the result. Please be careful about configuring the default index. See also
            `Default Index Type
            <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

        Parameters
        ----------
        func : function
            Function to transform each pandas frame.
        *args
            Positional arguments to pass to func.
        **kwargs
            Keyword arguments to pass to func.

        Returns
        -------
        DataFrame or Series

        See Also
        --------
        DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
        Series.pandas_on_spark.transform_batch: Transform each pandas chunk of a Series.

        Examples
        --------
        >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
        >>> df
           A  B
        0  1  2
        1  3  4
        2  5  6

        >>> def plus_one_func(pdf) -> ps.DataFrame[int, int]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           c0  c1
        0   2   3
        1   4   5
        2   6   7

        >>> def plus_one_func(pdf) -> ps.DataFrame['A': int, 'B': int]:
        ...     return pdf + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> def plus_one_func(pdf) -> ps.Series[int]:
        ...     return pdf.B + 1
        >>> df.pandas_on_spark.transform_batch(plus_one_func)
        0    3
        1    5
        2    7
        dtype: int64

        You can also omit the type hints so pandas-on-Spark infers the return schema as below:

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
           A  B
        0  2  3
        1  4  5
        2  6  7

        >>> (df * -1).pandas_on_spark.transform_batch(abs)
           A  B
        0  1  2
        1  3  4
        2  5  6

        Note that you should not transform the index. The index information will not change.

        >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
        0    3
        1    5
        2    7
        Name: B, dtype: int64

        You can also specify extra arguments as below.

        >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
        0     8
        1    10
        2    12
        Name: B, dtype: int64
        """
        from pyspark.pandas.groupby import GroupBy
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.series import first_series
        from pyspark import pandas as ps

        assert callable(func), "the first argument should be a callable function."
        spec = inspect.getfullargspec(func)
        return_sig = spec.annotations.get("return", None)
        should_infer_schema = return_sig is None
        original_func = func
        func = lambda o: original_func(o, *args, **kwargs)

        names = self._psdf._internal.to_internal_spark_frame.schema.names

        def pandas_concat(series):
            # Struct input is only given as a pandas DataFrame from Spark 3.0 (SPARK-27240);
            # this works around older versions by concatenating the input Series into a frame.
            pdf = pd.concat(series, axis=1)
            pdf.columns = names
            return pdf

        def apply_func(pdf):
            return func(pdf).to_frame()

        def pandas_extract(pdf, name):
            # Struct output is likewise only supported from Spark 3.0 (SPARK-23836);
            # extracting a single named column works around returning a whole frame.
            return pdf[name]

        def pandas_series_func(f):
            ff = f
            return lambda *series: first_series(ff(*series))

        def pandas_frame_func(f, field_name):
            ff = f
            return lambda *series: pandas_extract(ff(pandas_concat(series)), field_name)

        if should_infer_schema:
            # Here we execute the function with the first `limit + 1` rows to infer the
            # return type. If the input has at most `limit` rows, the pandas result is
            # used directly as a shortcut.
            limit = ps.get_option("compute.shortcut_limit")
            pdf = self._psdf.head(limit + 1)._to_internal_pandas()
            transformed = func(pdf)
            if not isinstance(transformed, (pd.DataFrame, pd.Series)):
                raise ValueError(
                    "The given function should return a frame; however, "
                    "the return type was %s." % type(transformed)
                )
            if len(transformed) != len(pdf):
                raise ValueError("transform_batch cannot produce aggregated results")
            psdf_or_psser = ps.from_pandas(transformed)

            if isinstance(psdf_or_psser, ps.Series):
                psser = cast(ps.Series, psdf_or_psser)

                field = psser._internal.data_fields[0].normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_udf(  # type: ignore
                    returnType=field.spark_type, functionType=PandasUDFType.SCALAR
                )(pandas_series_func(output_func))
                columns = self._psdf._internal.spark_columns
                # TODO: Index will be lost in this case.
                internal = self._psdf._internal.copy(
                    column_labels=psser._internal.column_labels,
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=psser._internal.column_label_names,
                )
                return first_series(DataFrame(internal))
            else:
                psdf = cast(DataFrame, psdf_or_psser)
                if len(pdf) <= limit:
                    # Only take the shortcut when the function returns a frame, to avoid
                    # mixing operations on different DataFrames in the Series case.
                    return psdf

                index_fields = [
                    field.normalize_spark_type() for field in psdf._internal.index_fields
                ]
                data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields]

                return_schema = StructType(
                    [field.struct_field for field in index_fields + data_fields]
                )

                self_applied = DataFrame(self._psdf._internal.resolved_copy)  # type: DataFrame

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=True
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)(
                    output_func
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                return DataFrame(
                    psdf._internal.with_new_sdf(
                        spark_frame=sdf, index_fields=index_fields, data_fields=data_fields
                    )
                )
        else:
            return_type = infer_return_type(original_func)
            is_return_series = isinstance(return_type, SeriesType)
            is_return_dataframe = isinstance(return_type, DataFrameType)
            if not is_return_dataframe and not is_return_series:
                raise TypeError(
                    "The given function should specify a frame or series as its type "
                    "hints; however, the return type was %s." % return_sig
                )
            if is_return_series:
                field = InternalField(
                    dtype=cast(SeriesType, return_type).dtype,
                    struct_field=StructField(
                        name=SPARK_DEFAULT_SERIES_NAME,
                        dataType=cast(SeriesType, return_type).spark_type,
                    ),
                ).normalize_spark_type()

                return_schema = StructType([field.struct_field])
                output_func = GroupBy._make_pandas_df_builder_func(
                    self._psdf, apply_func, return_schema, retain_index=False
                )

                pudf = pandas_udf(  # type: ignore
                    returnType=field.spark_type, functionType=PandasUDFType.SCALAR
                )(pandas_series_func(output_func))
                columns = self._psdf._internal.spark_columns
                internal = self._psdf._internal.copy(
                    column_labels=[None],
                    data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                    data_fields=[field],
                    column_label_names=None,
                )
                return first_series(DataFrame(internal))
            else:
                data_fields = [
                    field.normalize_spark_type()
                    for field in cast(DataFrameType, return_type).fields
                ]
                return_schema = StructType([field.struct_field for field in data_fields])

                self_applied = DataFrame(self._psdf._internal.resolved_copy)

                output_func = GroupBy._make_pandas_df_builder_func(
                    self_applied, func, return_schema, retain_index=False
                )
                columns = self_applied._internal.spark_columns

                pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)(
                    output_func
                )
                temp_struct_column = verify_temp_column_name(
                    self_applied._internal.spark_frame, "__temp_struct__"
                )
                applied = pudf(F.struct(*columns)).alias(temp_struct_column)
                sdf = self_applied._internal.spark_frame.select(applied)
                sdf = sdf.selectExpr("%s.*" % temp_struct_column)

                internal = InternalFrame(
                    spark_frame=sdf, index_spark_columns=None, data_fields=data_fields
                )
                return DataFrame(internal)
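
The two helper shims above exist only because scalar pandas UDFs gained struct
input (SPARK-27240) and struct output (SPARK-23836) in Spark 3.0; before that,
columns travel as separate Series. A pure-pandas sketch of what the shims do
(names and data are illustrative):

    import pandas as pd

    names = ["A", "B"]

    def pandas_concat(series):
        # Reassemble per-column Series into the single frame `func` expects.
        pdf = pd.concat(series, axis=1)
        pdf.columns = names
        return pdf

    def pandas_extract(pdf, name):
        # Pull one named column back out of the frame `func` returned.
        return pdf[name]

    a, b = pd.Series([1, 3, 5]), pd.Series([2, 4, 6])
    pdf = pandas_concat([a, b])
    print(pandas_extract(pdf + 1, "A").tolist())  # [2, 4, 6]
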