def transform_batch( self, func: Callable[..., Union[pd.DataFrame, pd.Series]], *args: Any, **kwargs: Any ) -> DataFrameOrSeries: """ Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas DataFrame given to the function is of a batch used internally. The length of each input and output should be the same. See also `Transform and apply a function <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_. .. note:: the `func` is unable to access to the whole input frame. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each batch multiple times. Therefore, operations such as global aggregations are impossible. See the example below. >>> # This case does not return the length of whole frame but of the batch internally ... # used. ... def length(pdf) -> ps.DataFrame[int]: ... return pd.DataFrame([len(pdf)] * len(pdf)) ... >>> df = ps.DataFrame({'A': range(1000)}) >>> df.pandas_on_spark.transform_batch(length) # doctest: +SKIP c0 0 83 1 83 2 83 ... .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]: ... return x + 1 If the return type is specified, the output column names become `c0, c1, c2 ... cn`. These names are positionally mapped to the returned DataFrame in ``func``. To specify the column names, you can assign them in a NumPy compound type style as below: >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]: ... return x + 1 >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) >>> def plus_one(x) -> ps.DataFrame[ ... (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]: ... return x + 1 Parameters ---------- func : function Function to transform each pandas frame. *args Positional arguments to pass to func. **kwargs Keyword arguments to pass to func. Returns ------- DataFrame or Series See Also -------- DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations. Series.pandas_on_spark.transform_batch: transform the search as each pandas chunks. Examples -------- >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B']) >>> df A B 0 1 2 1 3 4 2 5 6 >>> def plus_one_func(pdf) -> ps.DataFrame[int, [int, int]]: ... return pdf + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) c0 c1 0 2 3 1 4 5 2 6 7 >>> def plus_one_func(pdf) -> ps.DataFrame[("index", int), [('A', int), ('B', int)]]: ... return pdf + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) # doctest: +NORMALIZE_WHITESPACE A B index 0 2 3 1 4 5 2 6 7 >>> def plus_one_func(pdf) -> ps.Series[int]: ... return pdf.B + 1 >>> df.pandas_on_spark.transform_batch(plus_one_func) 0 3 1 5 2 7 dtype: int64 You can also omit the type hints so pandas-on-Spark infers the return schema as below: >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1) A B 0 2 3 1 4 5 2 6 7 >>> (df * -1).pandas_on_spark.transform_batch(abs) A B 0 1 2 1 3 4 2 5 6 Note that you should not transform the index. The index information will not change. >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1) 0 3 1 5 2 7 Name: B, dtype: int64 You can also specify extra arguments as below. >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3) 0 8 1 10 2 12 Name: B, dtype: int64 """ from pyspark.pandas.groupby import GroupBy from pyspark.pandas.frame import DataFrame from pyspark.pandas.series import first_series from pyspark import pandas as ps assert callable(func), "the first argument should be a callable function." spec = inspect.getfullargspec(func) return_sig = spec.annotations.get("return", None) should_infer_schema = return_sig is None should_retain_index = should_infer_schema original_func = func func = lambda o: original_func(o, *args, **kwargs) def apply_func(pdf: pd.DataFrame) -> pd.DataFrame: return func(pdf).to_frame() def pandas_series_func( f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType ) -> "UserDefinedFunctionLike": ff = f @pandas_udf(returnType=return_type) # type: ignore[call-overload] def udf(pdf: pd.DataFrame) -> pd.Series: return first_series(ff(pdf)) return udf if should_infer_schema: # Here we execute with the first 1000 to get the return type. # If the records were less than 1000, it uses pandas API directly for a shortcut. log_advice( "If the type hints is not specified for `transform_batch`, " "it is expensive to infer the data type internally." ) limit = ps.get_option("compute.shortcut_limit") pdf = self._psdf.head(limit + 1)._to_internal_pandas() transformed = func(pdf) if not isinstance(transformed, (pd.DataFrame, pd.Series)): raise ValueError( "The given function should return a frame; however, " "the return type was %s." % type(transformed) ) if len(transformed) != len(pdf): raise ValueError("transform_batch cannot produce aggregated results") psdf_or_psser = ps.from_pandas(transformed) if isinstance(psdf_or_psser, ps.Series): psser = cast(ps.Series, psdf_or_psser) field = psser._internal.data_fields[0].normalize_spark_type() return_schema = StructType([field.struct_field]) output_func = GroupBy._make_pandas_df_builder_func( self._psdf, apply_func, return_schema, retain_index=False ) pudf = pandas_series_func(output_func, return_type=field.spark_type) columns = self._psdf._internal.spark_columns # TODO: Index will be lost in this case. internal = self._psdf._internal.copy( column_labels=psser._internal.column_labels, data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)], data_fields=[field], column_label_names=psser._internal.column_label_names, ) return first_series(DataFrame(internal)) else: psdf = cast(DataFrame, psdf_or_psser) if len(pdf) <= limit: # only do the short cut when it returns a frame to avoid # operations on different dataframes in case of series. return psdf index_fields = [ field.normalize_spark_type() for field in psdf._internal.index_fields ] data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields] return_schema = StructType( [field.struct_field for field in index_fields + data_fields] ) self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy) output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=True # type: ignore[arg-type] ) columns = self_applied._internal.spark_columns pudf = pandas_udf( # type: ignore[call-overload] output_func, returnType=return_schema ) temp_struct_column = verify_temp_column_name( self_applied._internal.spark_frame, "__temp_struct__" ) applied = pudf(F.struct(*columns)).alias(temp_struct_column) sdf = self_applied._internal.spark_frame.select(applied) sdf = sdf.selectExpr("%s.*" % temp_struct_column) return DataFrame( psdf._internal.with_new_sdf( spark_frame=sdf, index_fields=index_fields, data_fields=data_fields ) ) else: return_type = infer_return_type(original_func) is_return_series = isinstance(return_type, SeriesType) is_return_dataframe = isinstance(return_type, DataFrameType) if not is_return_dataframe and not is_return_series: raise TypeError( "The given function should specify a frame or series as its type " "hints; however, the return type was %s." % return_sig ) if is_return_series: field = InternalField( dtype=cast(SeriesType, return_type).dtype, struct_field=StructField( name=SPARK_DEFAULT_SERIES_NAME, dataType=cast(SeriesType, return_type).spark_type, ), ).normalize_spark_type() return_schema = StructType([field.struct_field]) output_func = GroupBy._make_pandas_df_builder_func( self._psdf, apply_func, return_schema, retain_index=False ) pudf = pandas_series_func(output_func, return_type=field.spark_type) columns = self._psdf._internal.spark_columns internal = self._psdf._internal.copy( column_labels=[None], data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)], data_fields=[field], column_label_names=None, ) return first_series(DataFrame(internal)) else: index_fields = cast(DataFrameType, return_type).index_fields index_fields = [index_field.normalize_spark_type() for index_field in index_fields] data_fields = [ field.normalize_spark_type() for field in cast(DataFrameType, return_type).data_fields ] normalized_fields = index_fields + data_fields return_schema = StructType([field.struct_field for field in normalized_fields]) should_retain_index = len(index_fields) > 0 self_applied = DataFrame(self._psdf._internal.resolved_copy) output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=should_retain_index # type: ignore[arg-type] ) columns = self_applied._internal.spark_columns pudf = pandas_udf( # type: ignore[call-overload] output_func, returnType=return_schema ) temp_struct_column = verify_temp_column_name( self_applied._internal.spark_frame, "__temp_struct__" ) applied = pudf(F.struct(*columns)).alias(temp_struct_column) sdf = self_applied._internal.spark_frame.select(applied) sdf = sdf.selectExpr("%s.*" % temp_struct_column) index_spark_columns = None index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None if should_retain_index: index_spark_columns = [ scol_for(sdf, index_field.struct_field.name) for index_field in index_fields ] if not any( [ SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name) for index_field in index_fields ] ): index_names = [ (index_field.struct_field.name,) for index_field in index_fields ] internal = InternalFrame( spark_frame=sdf, index_names=index_names, index_spark_columns=index_spark_columns, index_fields=index_fields, data_fields=data_fields, ) return DataFrame(internal)
def apply_batch( self, func: Callable[..., pd.DataFrame], args: Tuple = (), **kwds: Any ) -> "DataFrame": """ Apply a function that takes pandas DataFrame and outputs pandas DataFrame. The pandas DataFrame given to the function is of a batch used internally. See also `Transform and apply a function <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_. .. note:: the `func` is unable to access to the whole input frame. pandas-on-Spark internally splits the input series into multiple batches and calls `func` with each batch multiple times. Therefore, operations such as global aggregations are impossible. See the example below. >>> # This case does not return the length of whole frame but of the batch internally ... # used. ... def length(pdf) -> ps.DataFrame[int, [int]]: ... return pd.DataFrame([len(pdf)]) ... >>> df = ps.DataFrame({'A': range(1000)}) >>> df.pandas_on_spark.apply_batch(length) # doctest: +SKIP c0 0 83 1 83 2 83 ... 10 83 11 83 .. note:: this API executes the function once to infer the type which is potentially expensive, for instance, when the dataset is created after aggregations or sorting. To avoid this, specify return type in ``func``, for instance, as below: >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]: ... return x + 1 If the return type is specified, the output column names become `c0, c1, c2 ... cn`. These names are positionally mapped to the returned DataFrame in ``func``. To specify the column names, you can assign them in a NumPy compound type style as below: >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]: ... return x + 1 >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]}) >>> def plus_one(x) -> ps.DataFrame[ ... (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]: ... return x + 1 Parameters ---------- func : function Function to apply to each pandas frame. args : tuple Positional arguments to pass to `func` in addition to the array/series. **kwds Additional keyword arguments to pass as keywords arguments to `func`. Returns ------- DataFrame See Also -------- DataFrame.apply: For row/columnwise operations. DataFrame.applymap: For elementwise operations. DataFrame.aggregate: Only perform aggregating type operations. DataFrame.transform: Only perform transforming type operations. Series.pandas_on_spark.transform_batch: transform the search as each pandas chunks. Examples -------- >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B']) >>> df A B 0 1 2 1 3 4 2 5 6 >>> def query_func(pdf) -> ps.DataFrame[int, [int, int]]: ... return pdf.query('A == 1') >>> df.pandas_on_spark.apply_batch(query_func) c0 c1 0 1 2 >>> def query_func(pdf) -> ps.DataFrame[("idx", int), [("A", int), ("B", int)]]: ... return pdf.query('A == 1') >>> df.pandas_on_spark.apply_batch(query_func) # doctest: +NORMALIZE_WHITESPACE A B idx 0 1 2 You can also omit the type hints so pandas-on-Spark infers the return schema as below: >>> df.pandas_on_spark.apply_batch(lambda pdf: pdf.query('A == 1')) A B 0 1 2 You can also specify extra arguments. >>> def calculation(pdf, y, z) -> ps.DataFrame[int, [int, int]]: ... return pdf ** y + z >>> df.pandas_on_spark.apply_batch(calculation, args=(10,), z=20) c0 c1 0 21 1044 1 59069 1048596 2 9765645 60466196 You can also use ``np.ufunc`` and built-in functions as input. >>> df.pandas_on_spark.apply_batch(np.add, args=(10,)) A B 0 11 12 1 13 14 2 15 16 >>> (df * -1).pandas_on_spark.apply_batch(abs) A B 0 1 2 1 3 4 2 5 6 """ # TODO: codes here partially duplicate `DataFrame.apply`. Can we deduplicate? from pyspark.pandas.groupby import GroupBy from pyspark.pandas.frame import DataFrame from pyspark import pandas as ps if not isinstance(func, FunctionType): assert callable(func), "the first argument should be a callable function." f = func func = lambda *args, **kwargs: f(*args, **kwargs) spec = inspect.getfullargspec(func) return_sig = spec.annotations.get("return", None) should_infer_schema = return_sig is None original_func = func func = lambda o: original_func(o, *args, **kwds) self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy) if should_infer_schema: # Here we execute with the first 1000 to get the return type. # If the records were less than 1000, it uses pandas API directly for a shortcut. log_advice( "If the type hints is not specified for `apply_batch`, " "it is expensive to infer the data type internally." ) limit = ps.get_option("compute.shortcut_limit") pdf = self_applied.head(limit + 1)._to_internal_pandas() applied = func(pdf) if not isinstance(applied, pd.DataFrame): raise ValueError( "The given function should return a frame; however, " "the return type was %s." % type(applied) ) psdf: DataFrame = DataFrame(applied) if len(pdf) <= limit: return psdf index_fields = [field.normalize_spark_type() for field in psdf._internal.index_fields] data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields] return_schema = StructType([field.struct_field for field in index_fields + data_fields]) output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=True ) sdf = self_applied._internal.spark_frame.mapInPandas( lambda iterator: map(output_func, iterator), schema=return_schema ) # If schema is inferred, we can restore indexes too. internal = psdf._internal.with_new_sdf( spark_frame=sdf, index_fields=index_fields, data_fields=data_fields ) else: return_type = infer_return_type(original_func) is_return_dataframe = isinstance(return_type, DataFrameType) if not is_return_dataframe: raise TypeError( "The given function should specify a frame as its type " "hints; however, the return type was %s." % return_sig ) index_fields = cast(DataFrameType, return_type).index_fields should_retain_index = len(index_fields) > 0 return_schema = cast(DataFrameType, return_type).spark_type output_func = GroupBy._make_pandas_df_builder_func( self_applied, func, return_schema, retain_index=should_retain_index ) sdf = self_applied._internal.to_internal_spark_frame.mapInPandas( lambda iterator: map(output_func, iterator), schema=return_schema ) index_spark_columns = None index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None if should_retain_index: index_spark_columns = [ scol_for(sdf, index_field.struct_field.name) for index_field in index_fields ] if not any( [ SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name) for index_field in index_fields ] ): index_names = [(index_field.struct_field.name,) for index_field in index_fields] internal = InternalFrame( spark_frame=sdf, index_names=index_names, index_spark_columns=index_spark_columns, index_fields=index_fields, data_fields=cast(DataFrameType, return_type).data_fields, ) return DataFrame(internal)