def _is_monotonic_decreasing(self) -> Series:
    window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)

    cond = SF.lit(True)
    has_not_null = SF.lit(True)
    for scol in self._internal.index_spark_columns[::-1]:
        data_type = self._internal.spark_type_for(scol)
        prev = F.lag(scol, 1).over(window)
        compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
        # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
        # Therefore, we should check `has_not_null` over all the levels.
        has_not_null = has_not_null & scol.isNotNull()
        cond = F.when(scol.eqNullSafe(prev), cond).otherwise(
            compare(scol, prev, Column.__lt__)
        )

    cond = has_not_null & (prev.isNull() | cond)

    cond_name = verify_temp_column_name(
        self._internal.spark_frame.select(self._internal.index_spark_columns),
        "__is_monotonic_decreasing_cond__",
    )

    sdf = self._internal.spark_frame.select(
        self._internal.index_spark_columns + [cond.alias(cond_name)]
    )

    internal = InternalFrame(
        spark_frame=sdf,
        index_spark_columns=[
            scol_for(sdf, col) for col in self._internal.index_spark_column_names
        ],
        index_names=self._internal.index_names,
        index_fields=self._internal.index_fields,
    )

    return first_series(DataFrame(internal))
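
# Usage sketch (not part of the implementation above): this helper backs the public
# `is_monotonic_decreasing` property of a pandas-on-Spark MultiIndex, which reduces the
# per-row condition Series produced here to a single boolean. The sample data is
# illustrative only.
import pyspark.pandas as ps

psmidx = ps.MultiIndex.from_tuples([("z", 3), ("y", 2), ("x", 1)])
print(psmidx.is_monotonic_decreasing)  # expected: True, since every level decreases
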
def predict(self, data: Union[DataFrame, pd.DataFrame]) -> Union[Series, pd.Series]:
    """
    Returns a prediction on the data.

    If the data is a pandas-on-Spark DataFrame, the return is a pandas-on-Spark Series.

    If the data is a pandas DataFrame, the return is the expected output of the
    underlying pyfunc object (typically a pandas Series or a numpy array).
    """
    if isinstance(data, pd.DataFrame):
        return self._model.predict(data)
    elif isinstance(data, DataFrame):
        return_col = self._model_udf(*data._internal.data_spark_columns)
        # TODO: the columns should be named according to the mlflow spec
        # However, this is only possible with spark >= 3.0
        # s = F.struct(*data.columns)
        # return_col = self._model_udf(s)
        column_labels: List[Label] = [
            (col,) for col in data._internal.spark_frame.select(return_col).columns
        ]
        internal = data._internal.copy(
            column_labels=column_labels,
            data_spark_columns=[return_col],
            data_fields=None,
        )
        return first_series(DataFrame(internal))
    else:
        raise ValueError("unknown data type: {}".format(type(data).__name__))
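
# Usage sketch for the wrapper above, assuming mlflow is installed and a model has been
# logged. `load_model` from pyspark.pandas.mlflow returns a wrapper whose `predict`
# dispatches on the input type as implemented above. The model URI is a placeholder,
# not a real run.
import pandas as pd
import pyspark.pandas as ps
from pyspark.pandas.mlflow import load_model

model = load_model("runs:/<run_id>/model")  # hypothetical MLflow run URI
pdf = pd.DataFrame({"x1": [1.0, 2.0], "x2": [3.0, 4.0]})

local_pred = model.predict(pdf)                 # pandas in -> pyfunc output out
dist_pred = model.predict(ps.from_pandas(pdf))  # pandas-on-Spark in -> pandas-on-Spark Series out
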
def _transform_batch(self, func, return_type: Optional[Union[SeriesType, ScalarType]]):
    from pyspark.pandas.groupby import GroupBy
    from pyspark.pandas.series import Series, first_series
    from pyspark import pandas as ps

    if not isinstance(func, types.FunctionType):
        f = func
        func = lambda *args, **kwargs: f(*args, **kwargs)

    if return_type is None:
        # TODO: In this case, it avoids the shortcut for now (but only infers schema)
        #  because it returns a series from a different DataFrame and it has a different
        #  anchor. We should fix this to allow the shortcut or only allow to infer
        #  schema.
        limit = ps.get_option("compute.shortcut_limit")
        pser = self._kser.head(limit + 1)._to_internal_pandas()
        transformed = pser.transform(func)
        kser = Series(transformed)  # type: Series
        spark_return_type = force_decimal_precision_scale(
            as_nullable_spark_type(kser.spark.data_type)
        )
        dtype = kser.dtype
    else:
        spark_return_type = return_type.spark_type
        dtype = return_type.dtype

    kdf = self._kser.to_frame()
    columns = kdf._internal.spark_column_names

    def pandas_concat(series):
        # The input can only be a DataFrame for struct from Spark 3.0.
        # This is a workaround to make the input a frame. See SPARK-27240
        pdf = pd.concat(series, axis=1)
        pdf.columns = columns
        return pdf

    def apply_func(pdf):
        return func(first_series(pdf)).to_frame()

    return_schema = StructType([StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)])
    output_func = GroupBy._make_pandas_df_builder_func(
        kdf, apply_func, return_schema, retain_index=False
    )

    pudf = pandas_udf(
        lambda *series: first_series(output_func(pandas_concat(series))),
        returnType=spark_return_type,
        functionType=PandasUDFType.SCALAR,
    )

    return self._kser._with_new_scol(
        scol=pudf(*kdf._internal.spark_columns).alias(
            self._kser._internal.spark_column_names[0]
        ),
        dtype=dtype,
    )
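
# Usage sketch of the public entry point that reaches _transform_batch above, assuming the
# Series `pandas_on_spark.transform_batch` accessor. With a return type hint the schema
# inference step is skipped; without one, the first `compute.shortcut_limit` rows are used
# to infer it, as in the `return_type is None` branch above.
import pandas as pd
import pyspark.pandas as ps

psser = ps.Series([1, 2, 3])

def plus_one(pser) -> ps.Series[int]:
    return pser + 1

print(psser.pandas_on_spark.transform_batch(plus_one))
print(psser.pandas_on_spark.transform_batch(lambda pser: pser * 2))  # schema inferred
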
def indexer_between_time(
    self,
    start_time: Union[datetime.time, str],
    end_time: Union[datetime.time, str],
    include_start: bool = True,
    include_end: bool = True,
) -> Index:
    """
    Return index locations of values between particular times of day
    (example: 9:00-9:30AM).

    Parameters
    ----------
    start_time, end_time : datetime.time, str
        Time passed either as object (datetime.time) or as string in
        appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p",
        "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").
    include_start : bool, default True
    include_end : bool, default True

    Returns
    -------
    values_between_time : Index of integers

    Examples
    --------
    >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
    >>> psidx  # doctest: +NORMALIZE_WHITESPACE
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00'],
                  dtype='datetime64[ns]', freq=None)

    >>> psidx.indexer_between_time("00:01", "00:02").sort_values()
    Int64Index([1, 2], dtype='int64')

    >>> psidx.indexer_between_time("00:01", "00:02", include_end=False)
    Int64Index([1], dtype='int64')

    >>> psidx.indexer_between_time("00:01", "00:02", include_start=False)
    Int64Index([2], dtype='int64')
    """

    @no_type_check
    def pandas_between_time(pdf) -> ps.DataFrame[int]:
        return pdf.between_time(start_time, end_time, include_start, include_end)

    psdf = self.to_frame()[[]]
    id_column_name = verify_temp_column_name(psdf, "__id_column__")
    psdf = psdf.pandas_on_spark.attach_id_column("distributed-sequence", id_column_name)
    with ps.option_context("compute.default_index_type", "distributed"):
        # The attached index in the statement below will be dropped soon,
        # so we enforce the "distributed" default index type.
        psdf = psdf.pandas_on_spark.apply_batch(pandas_between_time)
    return ps.Index(first_series(psdf).rename(self.name))
def analyzed(self) -> "ps.Series":
    """
    Returns a new Series with the analyzed Spark DataFrame.

    After multiple operations, the underlying Spark plan could grow huge
    and make the Spark planner take a long time to finish the planning.

    This function is a workaround to avoid such a huge plan.

    .. note:: After analyzed, operations between the analyzed Series and the original one
        will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.

    Returns
    -------
    Series

    Examples
    --------
    >>> ser = ps.Series([1, 2, 3])
    >>> ser
    0    1
    1    2
    2    3
    dtype: int64

    The analyzed one should return the same value.

    >>> ser.spark.analyzed
    0    1
    1    2
    2    3
    dtype: int64

    However, it won't work with the same anchor Series.

    >>> ser + ser.spark.analyzed
    Traceback (most recent call last):
    ...
    ValueError: ... enable 'compute.ops_on_diff_frames' option.

    >>> with ps.option_context('compute.ops_on_diff_frames', True):
    ...     (ser + ser.spark.analyzed).sort_index()
    0    2
    1    4
    2    6
    dtype: int64
    """
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.series import first_series

    return first_series(DataFrame(self._data._internal.resolved_copy))
def indexer_at_time(self, time: Union[datetime.time, str], asof: bool = False) -> Index:
    """
    Return index locations of values at particular time of day
    (example: 9:30AM).

    Parameters
    ----------
    time : datetime.time or str
        Time passed in either as object (datetime.time) or as string in
        appropriate format ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S",
        "%H%M%S", "%I:%M:%S%p", "%I%M%S%p").

    Returns
    -------
    values_at_time : Index of integers

    Examples
    --------
    >>> psidx = ps.date_range("2000-01-01", periods=3, freq="T")
    >>> psidx  # doctest: +NORMALIZE_WHITESPACE
    DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 00:01:00',
                   '2000-01-01 00:02:00'],
                  dtype='datetime64[ns]', freq=None)

    >>> psidx.indexer_at_time("00:00")
    Int64Index([0], dtype='int64')

    >>> psidx.indexer_at_time("00:01")
    Int64Index([1], dtype='int64')
    """
    if asof:
        raise NotImplementedError("'asof' argument is not supported")

    @no_type_check
    def pandas_at_time(pdf) -> ps.DataFrame[int]:
        return pdf.at_time(time, asof)

    psdf = self.to_frame()[[]]
    id_column_name = verify_temp_column_name(psdf, "__id_column__")
    psdf = psdf.pandas_on_spark.attach_id_column("distributed-sequence", id_column_name)
    with ps.option_context("compute.default_index_type", "distributed"):
        # The attached index in the statement below will be dropped soon,
        # so we enforce the "distributed" default index type.
        psdf = psdf.pandas_on_spark.apply_batch(pandas_at_time)
    return ps.Index(first_series(psdf).rename(self.name))
def predict(self, data):
    """
    Returns a prediction on the data.

    If the data is a pandas-on-Spark DataFrame, the return is a pandas-on-Spark Series.

    If the data is a pandas DataFrame, the return is the expected output of the
    underlying pyfunc object (typically a pandas Series or a numpy array).
    """
    if isinstance(data, pd.DataFrame):
        return self._model.predict(data)
    if isinstance(data, DataFrame):
        return_col = self._model_udf(*data._internal.data_spark_columns)
        # TODO: the columns should be named according to the mlflow spec
        # However, this is only possible with spark >= 3.0
        # s = F.struct(*data.columns)
        # return_col = self._model_udf(s)
        column_labels = [
            (col,) for col in data._internal.spark_frame.select(return_col).columns
        ]
        internal = data._internal.copy(
            column_labels=column_labels,
            data_spark_columns=[return_col],
            data_dtypes=None,
        )
        return first_series(DataFrame(internal))
def pudf(*series: pd.Series) -> pd.Series:
    return first_series(output_func(pandas_concat(*series)))


def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
    return func(first_series(pdf)).to_frame()


def udf(pdf: pd.DataFrame) -> pd.Series:
    return first_series(ff(pdf))
def transform_batch(
    self, func: Callable[..., Union[pd.DataFrame, pd.Series]], *args: Any, **kwargs: Any
) -> DataFrameOrSeries:
    """
    Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame.
    The pandas DataFrame given to the function is of a batch used internally. The length of
    each input and output should be the same.

    See also `Transform and apply a function
    <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

    .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
        internally splits the input series into multiple batches and calls `func` with each
        batch multiple times. Therefore, operations such as global aggregations are
        impossible. See the example below.

        >>> # This case does not return the length of whole frame but of the batch internally
        ... # used.
        ... def length(pdf) -> ps.DataFrame[int]:
        ...     return pd.DataFrame([len(pdf)] * len(pdf))
        ...
        >>> df = ps.DataFrame({'A': range(1000)})
        >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
            c0
        0   83
        1   83
        2   83
        ...

    .. note:: this API executes the function once to infer the type which is
        potentially expensive, for instance, when the dataset is created
        after aggregations or sorting.

        To avoid this, specify the return type in ``func``, for instance, as below:

        >>> def plus_one(x) -> ps.DataFrame[int, [float, float]]:
        ...     return x + 1

        If the return type is specified, the output column names become
        `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
        DataFrame in ``func``.

        To specify the column names, you can assign them in a NumPy compound type style
        as below:

        >>> def plus_one(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:
        ...     return x + 1

        >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
        >>> def plus_one(x) -> ps.DataFrame[
        ...         (pdf.index.name, pdf.index.dtype), zip(pdf.dtypes, pdf.columns)]:
        ...     return x + 1

    Parameters
    ----------
    func : function
        Function to transform each pandas frame.
    *args
        Positional arguments to pass to func.
    **kwargs
        Keyword arguments to pass to func.

    Returns
    -------
    DataFrame or Series

    See Also
    --------
    DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
    Series.pandas_on_spark.transform_batch: transform each pandas chunk.

    Examples
    --------
    >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
    >>> df
       A  B
    0  1  2
    1  3  4
    2  5  6

    >>> def plus_one_func(pdf) -> ps.DataFrame[int, [int, int]]:
    ...     return pdf + 1
    >>> df.pandas_on_spark.transform_batch(plus_one_func)
       c0  c1
    0   2   3
    1   4   5
    2   6   7

    >>> def plus_one_func(pdf) -> ps.DataFrame[("index", int), [('A', int), ('B', int)]]:
    ...     return pdf + 1
    >>> df.pandas_on_spark.transform_batch(plus_one_func)  # doctest: +NORMALIZE_WHITESPACE
           A  B
    index
    0      2  3
    1      4  5
    2      6  7

    >>> def plus_one_func(pdf) -> ps.Series[int]:
    ...     return pdf.B + 1
    >>> df.pandas_on_spark.transform_batch(plus_one_func)
    0    3
    1    5
    2    7
    dtype: int64

    You can also omit the type hints so pandas-on-Spark infers the return schema as below:

    >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
       A  B
    0  2  3
    1  4  5
    2  6  7

    >>> (df * -1).pandas_on_spark.transform_batch(abs)
       A  B
    0  1  2
    1  3  4
    2  5  6

    Note that you should not transform the index. The index information will not change.

    >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
    0    3
    1    5
    2    7
    Name: B, dtype: int64

    You can also specify extra arguments as below.

    >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
    0     8
    1    10
    2    12
    Name: B, dtype: int64
    """
    from pyspark.pandas.groupby import GroupBy
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.series import first_series
    from pyspark import pandas as ps

    assert callable(func), "the first argument should be a callable function."
    spec = inspect.getfullargspec(func)
    return_sig = spec.annotations.get("return", None)
    should_infer_schema = return_sig is None
    should_retain_index = should_infer_schema
    original_func = func
    func = lambda o: original_func(o, *args, **kwargs)

    def apply_func(pdf: pd.DataFrame) -> pd.DataFrame:
        return func(pdf).to_frame()

    def pandas_series_func(
        f: Callable[[pd.DataFrame], pd.DataFrame], return_type: DataType
    ) -> "UserDefinedFunctionLike":
        ff = f

        @pandas_udf(returnType=return_type)  # type: ignore[call-overload]
        def udf(pdf: pd.DataFrame) -> pd.Series:
            return first_series(ff(pdf))

        return udf

    if should_infer_schema:
        # Here we execute with the first 1000 to get the return type.
        # If the records were less than 1000, it uses pandas API directly for a shortcut.
        log_advice(
            "If the type hints is not specified for `transform_batch`, "
            "it is expensive to infer the data type internally."
        )
        limit = ps.get_option("compute.shortcut_limit")
        pdf = self._psdf.head(limit + 1)._to_internal_pandas()
        transformed = func(pdf)
        if not isinstance(transformed, (pd.DataFrame, pd.Series)):
            raise ValueError(
                "The given function should return a frame; however, "
                "the return type was %s." % type(transformed)
            )
        if len(transformed) != len(pdf):
            raise ValueError("transform_batch cannot produce aggregated results")
        psdf_or_psser = ps.from_pandas(transformed)

        if isinstance(psdf_or_psser, ps.Series):
            psser = cast(ps.Series, psdf_or_psser)

            field = psser._internal.data_fields[0].normalize_spark_type()

            return_schema = StructType([field.struct_field])
            output_func = GroupBy._make_pandas_df_builder_func(
                self._psdf, apply_func, return_schema, retain_index=False
            )

            pudf = pandas_series_func(output_func, return_type=field.spark_type)
            columns = self._psdf._internal.spark_columns
            # TODO: Index will be lost in this case.
            internal = self._psdf._internal.copy(
                column_labels=psser._internal.column_labels,
                data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                data_fields=[field],
                column_label_names=psser._internal.column_label_names,
            )
            return first_series(DataFrame(internal))
        else:
            psdf = cast(DataFrame, psdf_or_psser)
            if len(pdf) <= limit:
                # only do the short cut when it returns a frame to avoid
                # operations on different dataframes in case of series.
                return psdf

            index_fields = [
                field.normalize_spark_type() for field in psdf._internal.index_fields
            ]
            data_fields = [field.normalize_spark_type() for field in psdf._internal.data_fields]

            return_schema = StructType(
                [field.struct_field for field in index_fields + data_fields]
            )

            self_applied: DataFrame = DataFrame(self._psdf._internal.resolved_copy)

            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=True  # type: ignore[arg-type]
            )
            columns = self_applied._internal.spark_columns

            pudf = pandas_udf(  # type: ignore[call-overload]
                output_func, returnType=return_schema
            )
            temp_struct_column = verify_temp_column_name(
                self_applied._internal.spark_frame, "__temp_struct__"
            )
            applied = pudf(F.struct(*columns)).alias(temp_struct_column)
            sdf = self_applied._internal.spark_frame.select(applied)
            sdf = sdf.selectExpr("%s.*" % temp_struct_column)

            return DataFrame(
                psdf._internal.with_new_sdf(
                    spark_frame=sdf, index_fields=index_fields, data_fields=data_fields
                )
            )
    else:
        return_type = infer_return_type(original_func)
        is_return_series = isinstance(return_type, SeriesType)
        is_return_dataframe = isinstance(return_type, DataFrameType)
        if not is_return_dataframe and not is_return_series:
            raise TypeError(
                "The given function should specify a frame or series as its type "
                "hints; however, the return type was %s." % return_sig
            )
        if is_return_series:
            field = InternalField(
                dtype=cast(SeriesType, return_type).dtype,
                struct_field=StructField(
                    name=SPARK_DEFAULT_SERIES_NAME,
                    dataType=cast(SeriesType, return_type).spark_type,
                ),
            ).normalize_spark_type()

            return_schema = StructType([field.struct_field])
            output_func = GroupBy._make_pandas_df_builder_func(
                self._psdf, apply_func, return_schema, retain_index=False
            )

            pudf = pandas_series_func(output_func, return_type=field.spark_type)
            columns = self._psdf._internal.spark_columns
            internal = self._psdf._internal.copy(
                column_labels=[None],
                data_spark_columns=[pudf(F.struct(*columns)).alias(field.name)],
                data_fields=[field],
                column_label_names=None,
            )
            return first_series(DataFrame(internal))
        else:
            index_fields = cast(DataFrameType, return_type).index_fields
            index_fields = [index_field.normalize_spark_type() for index_field in index_fields]
            data_fields = [
                field.normalize_spark_type()
                for field in cast(DataFrameType, return_type).data_fields
            ]
            normalized_fields = index_fields + data_fields
            return_schema = StructType([field.struct_field for field in normalized_fields])
            should_retain_index = len(index_fields) > 0

            self_applied = DataFrame(self._psdf._internal.resolved_copy)

            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=should_retain_index  # type: ignore[arg-type]
            )
            columns = self_applied._internal.spark_columns

            pudf = pandas_udf(  # type: ignore[call-overload]
                output_func, returnType=return_schema
            )
            temp_struct_column = verify_temp_column_name(
                self_applied._internal.spark_frame, "__temp_struct__"
            )
            applied = pudf(F.struct(*columns)).alias(temp_struct_column)
            sdf = self_applied._internal.spark_frame.select(applied)
            sdf = sdf.selectExpr("%s.*" % temp_struct_column)

            index_spark_columns = None
            index_names: Optional[List[Optional[Tuple[Any, ...]]]] = None

            if should_retain_index:
                index_spark_columns = [
                    scol_for(sdf, index_field.struct_field.name) for index_field in index_fields
                ]

                if not any(
                    [
                        SPARK_INDEX_NAME_PATTERN.match(index_field.struct_field.name)
                        for index_field in index_fields
                    ]
                ):
                    index_names = [
                        (index_field.struct_field.name,) for index_field in index_fields
                    ]
            internal = InternalFrame(
                spark_frame=sdf,
                index_names=index_names,
                index_spark_columns=index_spark_columns,
                index_fields=index_fields,
                data_fields=data_fields,
            )
            return DataFrame(internal)
def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, (list, tuple)):
        from pyspark.pandas.series import first_series, scol_for
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField

        len_right = len(right)
        if len(left) != len(right):
            raise ValueError("Lengths must be equal")

        sdf = left._internal.spark_frame
        structed_scol = F.struct(
            sdf[NATURAL_ORDER_COLUMN_NAME],
            *left._internal.index_spark_columns,
            left.spark.column,
        )
        # The size of the list is expected to be small.
        collected_structed_scol = F.collect_list(structed_scol)
        # Sort the array by NATURAL_ORDER_COLUMN so that we can guarantee the order.
        collected_structed_scol = F.array_sort(collected_structed_scol)
        right_values_scol = F.array(*(F.lit(x) for x in right))
        index_scol_names = left._internal.index_spark_column_names
        scol_name = left._internal.spark_column_name_for(left._internal.column_labels[0])
        # Compare the values of left and right by using zip_with function.
        cond = F.zip_with(
            collected_structed_scol,
            right_values_scol,
            lambda x, y: F.struct(
                *[
                    x[index_scol_name].alias(index_scol_name)
                    for index_scol_name in index_scol_names
                ],
                F.when(x[scol_name].isNull() | y.isNull(), False)
                .otherwise(x[scol_name] == y)
                .alias(scol_name),
            ),
        ).alias(scol_name)

        # 1. `sdf_new` here looks like the below (the first field of each set is Index):
        # +----------------------------------------------------------+
        # |0                                                         |
        # +----------------------------------------------------------+
        # |[{0, false}, {1, true}, {2, false}, {3, true}, {4, false}]|
        # +----------------------------------------------------------+
        sdf_new = sdf.select(cond)

        # 2. `sdf_new` after the explode looks like the below:
        # +----------+
        # |       col|
        # +----------+
        # |{0, false}|
        # | {1, true}|
        # |{2, false}|
        # | {3, true}|
        # |{4, false}|
        # +----------+
        sdf_new = sdf_new.select(F.explode(scol_name))

        # 3. Here, the final `sdf_new` looks like the below:
        # +-----------------+-----+
        # |__index_level_0__|    0|
        # +-----------------+-----+
        # |                0|false|
        # |                1| true|
        # |                2|false|
        # |                3| true|
        # |                4|false|
        # +-----------------+-----+
        sdf_new = sdf_new.select("col.*")

        index_spark_columns = [
            scol_for(sdf_new, index_scol_name) for index_scol_name in index_scol_names
        ]
        data_spark_columns = [scol_for(sdf_new, scol_name)]

        internal = left._internal.copy(
            spark_frame=sdf_new,
            index_spark_columns=index_spark_columns,
            data_spark_columns=data_spark_columns,
            index_fields=[
                InternalField.from_struct_field(index_field)
                for index_field in sdf_new.select(index_spark_columns).schema.fields
            ],
            data_fields=[
                InternalField.from_struct_field(
                    sdf_new.select(data_spark_columns).schema.fields[0]
                )
            ],
        )
        return first_series(DataFrame(internal))
    else:
        from pyspark.pandas.base import column_op

        return column_op(Column.__eq__)(left, right)
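
# Usage sketch for the list/tuple branch above: in versions where this path is available,
# comparing a pandas-on-Spark Series against a Python list of the same length goes through
# the zip_with-based code and yields an element-wise boolean Series; nulls on either side
# compare as False.
import pyspark.pandas as ps

psser = ps.Series([1, 2, 3, 4])
print((psser == [1, 3, 3, 5]).sort_index())
# expected roughly: True, False, True, False
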
def apply_func(pdf):
    return func(first_series(pdf)).to_frame()


def pandas_series_func(f):
    ff = f
    return lambda *series: first_series(ff(*series))
def transform_batch(self, func, *args, **kwargs) -> Union["DataFrame", "Series"]:
    """
    Transform chunks with a function that takes pandas DataFrame and outputs pandas DataFrame.
    The pandas DataFrame given to the function is of a batch used internally. The length of
    each input and output should be the same.

    See also `Transform and apply a function
    <https://koalas.readthedocs.io/en/latest/user_guide/transform_apply.html>`_.

    .. note:: the `func` is unable to access the whole input frame. pandas-on-Spark
        internally splits the input series into multiple batches and calls `func` with each
        batch multiple times. Therefore, operations such as global aggregations are
        impossible. See the example below.

        >>> # This case does not return the length of whole frame but of the batch internally
        ... # used.
        ... def length(pdf) -> ps.DataFrame[int]:
        ...     return pd.DataFrame([len(pdf)] * len(pdf))
        ...
        >>> df = ps.DataFrame({'A': range(1000)})
        >>> df.pandas_on_spark.transform_batch(length)  # doctest: +SKIP
            c0
        0   83
        1   83
        2   83
        ...

    .. note:: this API executes the function once to infer the type which is
        potentially expensive, for instance, when the dataset is created
        after aggregations or sorting.

        To avoid this, specify the return type in ``func``, for instance, as below:

        >>> def plus_one(x) -> ps.DataFrame[float, float]:
        ...     return x + 1

        If the return type is specified, the output column names become
        `c0, c1, c2 ... cn`. These names are positionally mapped to the returned
        DataFrame in ``func``.

        To specify the column names, you can assign them in a pandas friendly style as below:

        >>> def plus_one(x) -> ps.DataFrame['a': float, 'b': float]:
        ...     return x + 1

        >>> pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 4, 5]})
        >>> def plus_one(x) -> ps.DataFrame[zip(pdf.dtypes, pdf.columns)]:
        ...     return x + 1

        When the given function returns DataFrame and has the return type annotated, the
        original index of the DataFrame will be lost and then a default index will be
        attached to the result. Please be careful about configuring the default index.
        See also `Default Index Type
        <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

    Parameters
    ----------
    func : function
        Function to transform each pandas frame.
    *args
        Positional arguments to pass to func.
    **kwargs
        Keyword arguments to pass to func.

    Returns
    -------
    DataFrame or Series

    See Also
    --------
    DataFrame.pandas_on_spark.apply_batch: For row/columnwise operations.
    Series.pandas_on_spark.transform_batch: transform each pandas chunk.

    Examples
    --------
    >>> df = ps.DataFrame([(1, 2), (3, 4), (5, 6)], columns=['A', 'B'])
    >>> df
       A  B
    0  1  2
    1  3  4
    2  5  6

    >>> def plus_one_func(pdf) -> ps.DataFrame[int, int]:
    ...     return pdf + 1
    >>> df.pandas_on_spark.transform_batch(plus_one_func)
       c0  c1
    0   2   3
    1   4   5
    2   6   7

    >>> def plus_one_func(pdf) -> ps.DataFrame['A': int, 'B': int]:
    ...     return pdf + 1
    >>> df.pandas_on_spark.transform_batch(plus_one_func)
       A  B
    0  2  3
    1  4  5
    2  6  7

    >>> def plus_one_func(pdf) -> ps.Series[int]:
    ...     return pdf.B + 1
    >>> df.pandas_on_spark.transform_batch(plus_one_func)
    0    3
    1    5
    2    7
    dtype: int64

    You can also omit the type hints so pandas-on-Spark infers the return schema as below:

    >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf + 1)
       A  B
    0  2  3
    1  4  5
    2  6  7

    >>> (df * -1).pandas_on_spark.transform_batch(abs)
       A  B
    0  1  2
    1  3  4
    2  5  6

    Note that you should not transform the index. The index information will not change.

    >>> df.pandas_on_spark.transform_batch(lambda pdf: pdf.B + 1)
    0    3
    1    5
    2    7
    Name: B, dtype: int64

    You can also specify extra arguments as below.

    >>> df.pandas_on_spark.transform_batch(lambda pdf, a, b, c: pdf.B + a + b + c, 1, 2, c=3)
    0     8
    1    10
    2    12
    Name: B, dtype: int64
    """
    from pyspark.pandas.groupby import GroupBy
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.series import first_series
    from pyspark import pandas as ps

    assert callable(func), "the first argument should be a callable function."
    spec = inspect.getfullargspec(func)
    return_sig = spec.annotations.get("return", None)
    should_infer_schema = return_sig is None
    original_func = func
    func = lambda o: original_func(o, *args, **kwargs)

    names = self._psdf._internal.to_internal_spark_frame.schema.names

    def pandas_concat(series):
        # The input can only be a DataFrame for struct from Spark 3.0.
        # This is a workaround to make the input a frame. See SPARK-27240
        pdf = pd.concat(series, axis=1)
        pdf.columns = names
        return pdf

    def apply_func(pdf):
        return func(pdf).to_frame()

    def pandas_extract(pdf, name):
        # This is for output to work around a DataFrame for struct
        # from Spark 3.0. See SPARK-23836
        return pdf[name]

    def pandas_series_func(f):
        ff = f
        return lambda *series: first_series(ff(*series))

    def pandas_frame_func(f, field_name):
        ff = f
        return lambda *series: pandas_extract(ff(pandas_concat(series)), field_name)

    if should_infer_schema:
        # Here we execute with the first 1000 to get the return type.
        # If the records were less than 1000, it uses pandas API directly for a shortcut.
        limit = ps.get_option("compute.shortcut_limit")
        pdf = self._psdf.head(limit + 1)._to_internal_pandas()
        transformed = func(pdf)
        if not isinstance(transformed, (pd.DataFrame, pd.Series)):
            raise ValueError(
                "The given function should return a frame; however, "
                "the return type was %s." % type(transformed)
            )
        if len(transformed) != len(pdf):
            raise ValueError("transform_batch cannot produce aggregated results")
        psdf_or_psser = ps.from_pandas(transformed)
        if isinstance(psdf_or_psser, ps.Series):
            psser = cast(ps.Series, psdf_or_psser)

            spark_return_type = force_decimal_precision_scale(
                as_nullable_spark_type(psser.spark.data_type)
            )
            return_schema = StructType(
                [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)]
            )
            output_func = GroupBy._make_pandas_df_builder_func(
                self._psdf, apply_func, return_schema, retain_index=False
            )

            pudf = pandas_udf(returnType=spark_return_type, functionType=PandasUDFType.SCALAR)(
                pandas_series_func(output_func)
            )
            columns = self._psdf._internal.spark_columns
            # TODO: Index will be lost in this case.
            internal = self._psdf._internal.copy(
                column_labels=psser._internal.column_labels,
                data_spark_columns=[
                    pudf(F.struct(*columns)).alias(psser._internal.data_spark_column_names[0])
                ],
                data_dtypes=psser._internal.data_dtypes,
                column_label_names=psser._internal.column_label_names,
            )
            return first_series(DataFrame(internal))
        else:
            psdf = cast(DataFrame, psdf_or_psser)
            if len(pdf) <= limit:
                # only do the short cut when it returns a frame to avoid
                # operations on different dataframes in case of series.
                return psdf

            # Force nullability.
            return_schema = force_decimal_precision_scale(
                as_nullable_spark_type(psdf._internal.to_internal_spark_frame.schema)
            )

            self_applied = DataFrame(self._psdf._internal.resolved_copy)  # type: DataFrame

            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=True
            )
            columns = self_applied._internal.spark_columns

            pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)(
                output_func
            )
            temp_struct_column = verify_temp_column_name(
                self_applied._internal.spark_frame, "__temp_struct__"
            )
            applied = pudf(F.struct(*columns)).alias(temp_struct_column)
            sdf = self_applied._internal.spark_frame.select(applied)
            sdf = sdf.selectExpr("%s.*" % temp_struct_column)
            return DataFrame(psdf._internal.with_new_sdf(sdf))
    else:
        return_type = infer_return_type(original_func)
        is_return_series = isinstance(return_type, SeriesType)
        is_return_dataframe = isinstance(return_type, DataFrameType)
        if not is_return_dataframe and not is_return_series:
            raise TypeError(
                "The given function should specify a frame or series as its type "
                "hints; however, the return type was %s." % return_sig
            )
        if is_return_series:
            spark_return_type = force_decimal_precision_scale(
                as_nullable_spark_type(cast(SeriesType, return_type).spark_type)
            )
            return_schema = StructType(
                [StructField(SPARK_DEFAULT_SERIES_NAME, spark_return_type)]
            )
            output_func = GroupBy._make_pandas_df_builder_func(
                self._psdf, apply_func, return_schema, retain_index=False
            )

            pudf = pandas_udf(returnType=spark_return_type, functionType=PandasUDFType.SCALAR)(
                pandas_series_func(output_func)
            )
            columns = self._psdf._internal.spark_columns
            internal = self._psdf._internal.copy(
                column_labels=[None],
                data_spark_columns=[pudf(F.struct(*columns)).alias(SPARK_DEFAULT_SERIES_NAME)],
                data_dtypes=[cast(SeriesType, return_type).dtype],
                column_label_names=None,
            )
            return first_series(DataFrame(internal))
        else:
            return_schema = cast(DataFrameType, return_type).spark_type

            self_applied = DataFrame(self._psdf._internal.resolved_copy)

            output_func = GroupBy._make_pandas_df_builder_func(
                self_applied, func, return_schema, retain_index=False
            )
            columns = self_applied._internal.spark_columns

            pudf = pandas_udf(returnType=return_schema, functionType=PandasUDFType.SCALAR)(
                output_func
            )
            temp_struct_column = verify_temp_column_name(
                self_applied._internal.spark_frame, "__temp_struct__"
            )
            applied = pudf(F.struct(*columns)).alias(temp_struct_column)
            sdf = self_applied._internal.spark_frame.select(applied)
            sdf = sdf.selectExpr("%s.*" % temp_struct_column)
            internal = InternalFrame(
                spark_frame=sdf,
                index_spark_columns=None,
                data_dtypes=cast(DataFrameType, return_type).dtypes,
            )
            return DataFrame(internal)
def sum(self) -> Series:
    return first_series(self._downsample("sum").fillna(0.0))


def max(self) -> Series:
    return first_series(self._downsample("max"))


def mean(self) -> Series:
    return first_series(self._downsample("mean"))


def std(self) -> Series:
    return first_series(self._downsample("std"))
def apply(self, func) -> "ps.Series":
    """
    Applies a function that takes and returns a Spark column. It allows natively applying
    Spark functions and column APIs with the Spark column internally used in Series or
    Index.

    .. note:: It forces to lose the index and end up with using the default index. It is
        preferred to use :meth:`Series.spark.transform` or :meth:`DataFrame.spark.apply`
        with specifying the `index_col`.

    .. note:: It does not require to have the same length of the input and output.
        However, it requires to create a new DataFrame internally which will require
        to set `compute.ops_on_diff_frames` to compute even with the same origin
        DataFrame that is expensive, whereas :meth:`Series.spark.transform` does not
        require it.

    Parameters
    ----------
    func : function
        Function to apply the function against the data by using Spark columns.

    Returns
    -------
    Series

    Raises
    ------
    ValueError : If the output from the function is not a Spark column.

    Examples
    --------
    >>> from pyspark import pandas as ps
    >>> from pyspark.sql.functions import count, lit
    >>> df = ps.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"])
    >>> df
       a  b
    0  1  4
    1  2  5
    2  3  6

    >>> df.a.spark.apply(lambda c: count(c))
    0    3
    Name: a, dtype: int64

    >>> df.a.spark.apply(lambda c: c + df.b.spark.column)
    0    5
    1    7
    2    9
    Name: a, dtype: int64
    """
    from pyspark.pandas.frame import DataFrame
    from pyspark.pandas.series import Series, first_series
    from pyspark.pandas.internal import HIDDEN_COLUMNS

    output = func(self._data.spark.column)
    if not isinstance(output, Column):
        raise ValueError(
            "The output of the function [%s] should be of a "
            "pyspark.sql.Column; however, got [%s]." % (func, type(output))
        )
    assert isinstance(self._data, Series)
    sdf = self._data._internal.spark_frame.drop(*HIDDEN_COLUMNS).select(output)
    # Lose index.
    return first_series(DataFrame(sdf)).rename(self._data.name)
def var(self) -> Series:
    return first_series(self._downsample("var"))
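
# Usage sketch for the downsample aggregations above (sum/max/mean/std/var), assuming the
# pandas-on-Spark Series.resample API with a datetime index. Note that `sum` fills empty
# bins with 0.0, matching the fillna(0.0) above; the other aggregations leave them missing.
import pandas as pd
import pyspark.pandas as ps

pidx = pd.date_range("2000-01-01", periods=6, freq="T")
psser = ps.Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], index=pidx)

print(psser.resample("3T").sum().sort_index())
print(psser.resample("3T").mean().sort_index())
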
def pandas_series_func(f, by_pass):
    ff = f
    if by_pass:
        return lambda *series: first_series(ff(*series))
    else:
        return lambda *series: first_series(ff(pandas_concat(series)))


def udf(*series: pd.Series) -> pd.Series:
    return first_series(ff(*series))