def mapInPandas(
    self, func: "PandasMapIterFunction", schema: Union[StructType, str]
) -> "DataFrame":
    """
    Maps an iterator of batches in the current :class:`DataFrame` using a Python native
    function that takes and outputs a pandas DataFrame, and returns the result as a
    :class:`DataFrame`.

    The function should take an iterator of `pandas.DataFrame`\\s and return
    another iterator of `pandas.DataFrame`\\s. All columns are passed
    together as an iterator of `pandas.DataFrame`\\s to the function and the
    returned iterator of `pandas.DataFrame`\\s is combined as a
    :class:`DataFrame`. The size of each `pandas.DataFrame` can be controlled by
    `spark.sql.execution.arrow.maxRecordsPerBatch`.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    func : function
        a Python native function that takes an iterator of `pandas.DataFrame`\\s, and
        outputs an iterator of `pandas.DataFrame`\\s.
    schema : :class:`pyspark.sql.types.DataType` or str
        the return type of the `func` in PySpark. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

    Examples
    --------
    >>> from pyspark.sql.functions import pandas_udf
    >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
    >>> def filter_func(iterator):
    ...     for pdf in iterator:
    ...         yield pdf[pdf.id == 1]
    >>> df.mapInPandas(filter_func, df.schema).show()  # doctest: +SKIP
    +---+---+
    | id|age|
    +---+---+
    |  1| 21|
    +---+---+

    Notes
    -----
    This API is experimental.

    See Also
    --------
    pyspark.sql.functions.pandas_udf
    """
    from pyspark.sql import DataFrame
    from pyspark.sql.pandas.functions import pandas_udf

    assert isinstance(self, DataFrame)

    # The usage of the pandas_udf is internal so type checking is disabled.
    udf = pandas_udf(
        func, returnType=schema, functionType=PythonEvalType.SQL_MAP_PANDAS_ITER_UDF
    )  # type: ignore[call-overload]
    udf_column = udf(*[self[col] for col in self.columns])
    jdf = self._jdf.mapInPandas(udf_column._jc.expr())  # type: ignore[operator]
    return DataFrame(jdf, self.sql_ctx)
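# A minimal sketch, assuming a running SparkSession named `spark`, of the batch
# sizing behaviour mentioned in the docstring above: each pandas.DataFrame the
# function receives holds at most spark.sql.execution.arrow.maxRecordsPerBatch
# rows per partition. The `batch_size` column name is illustrative, not part of
# the API.
spark.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", "3")

df = spark.range(10)

def tag_batch_size(iterator):
    for pdf in iterator:
        # len(pdf) is the number of rows in this Arrow batch
        yield pdf.assign(batch_size=len(pdf))

df.mapInPandas(tag_batch_size, "id long, batch_size long").show()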
def test_scalar_udf_type_hint(self):
    df = self.spark.range(10).selectExpr("id", "id as v")

    def plus_one(v: Union[pd.Series, pd.DataFrame]) -> pd.Series:
        return v + 1

    plus_one = pandas_udf("long")(plus_one)

    actual = df.select(plus_one(df.v).alias("plus_one"))
    expected = df.selectExpr("(v + 1) as plus_one")
    assert_frame_equal(expected.toPandas(), actual.toPandas())
def test_group_agg_udf_type_hint(self):
    df = self.spark.range(10).selectExpr("id", "id as v")

    def weighted_mean(v: pd.Series, w: pd.Series) -> float:
        return np.average(v, weights=w)

    weighted_mean = pandas_udf("double")(weighted_mean)

    actual = df.groupby("id").agg(weighted_mean(df.v, lit(1.0))).sort("id")
    expected = df.groupby("id").agg(mean(df.v).alias("weighted_mean(v, 1.0)")).sort("id")
    assert_frame_equal(expected.toPandas(), actual.toPandas())
def test_scalar_iter_udf_type_hint(self):
    df = self.spark.range(10).selectExpr("id", "id as v")

    def plus_one(itr: Iterator[pd.Series]) -> Iterator[pd.Series]:
        for s in itr:
            yield s + 1

    plus_one = pandas_udf("long")(plus_one)

    actual = df.select(plus_one(df.v).alias("plus_one"))
    expected = df.selectExpr("(v + 1) as plus_one")
    assert_frame_equal(expected.toPandas(), actual.toPandas())
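# The three tests above rely on pandas_udf inferring the UDF variant from the
# Python type hints alone. A minimal sketch, assuming Spark 3.0+, of the eval
# type each hint combination resolves to:
from typing import Iterator

import pandas as pd
from pyspark.rdd import PythonEvalType
from pyspark.sql.pandas.functions import pandas_udf

@pandas_udf("long")
def plus_one_scalar(v: pd.Series) -> pd.Series:
    return v + 1

@pandas_udf("long")
def plus_one_iter(itr: Iterator[pd.Series]) -> Iterator[pd.Series]:
    for s in itr:
        yield s + 1

# Series -> Series is a scalar UDF; Iterator[Series] -> Iterator[Series]
# is a scalar-iterator UDF.
assert plus_one_scalar.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF
assert plus_one_iter.evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF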
def __init__(self, spark: SparkSession, df: pyspark.sql.DataFrame, window_seconds: int = 10):
    self.spark = spark
    # patch time window
    self.df = patch_time_windows(df=df, window_seconds=window_seconds)
    # extract packet rate
    # self.df = (
    #     self.df
    #     .withColumn('packet_rate', col('packet') / col('duration'))
    # )
    # extract byte rate
    # self.df = (
    #     self.df
    #     .withColumn('byte_rate', col('num_of_bytes') / col('duration'))
    # )
    # udf functions of extraction methods
    self.extract_num_flow_udf = pandas_udf(self.extract_num_flow, 'double')
    self.mean_udf = pandas_udf(self.mean, 'double')
    self.std_udf = pandas_udf(self.std, 'double')
    self.entropy_udf = pandas_udf(self.entropy, 'double')
    self.port_proportion_udf = pandas_udf(self.port_proportion, 'array<double>')
    self.build_label_udf = pandas_udf(self.build_label, 'string')
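# Hypothetical stand-ins for the extraction methods registered above
# (self.mean, self.std, self.entropy); the real implementations are not shown
# in this snippet. Each maps the pandas.Series of one time window to a scalar,
# matching the 'double' return type declared in the pandas_udf calls.
import numpy as np
import pandas as pd

def mean(v: pd.Series) -> float:
    # window average
    return float(v.mean())

def std(v: pd.Series) -> float:
    # window standard deviation
    return float(v.std())

def entropy(v: pd.Series) -> float:
    # Shannon entropy of the empirical value distribution in the window
    p = v.value_counts(normalize=True)
    return float(-(p * np.log2(p)).sum())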
def test_group_agg_udf_type_hint(self):
    df = self.spark.range(10).selectExpr("id", "id as v")
    exec(
        "import numpy as np\n"
        "def weighted_mean(v: pd.Series, w: pd.Series) -> float:\n"
        "    return np.average(v, weights=w)",
        self.local)

    weighted_mean = pandas_udf("double")(self.local["weighted_mean"])

    actual = df.groupby('id').agg(weighted_mean(df.v, lit(1.0))).sort('id')
    expected = df.groupby('id').agg(
        mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
    assert_frame_equal(expected.toPandas(), actual.toPandas())
def test_scalar_udf_type_hint(self):
    df = self.spark.range(10).selectExpr("id", "id as v")
    exec(
        "import typing\n"
        "def plus_one(v: typing.Union[pd.Series, pd.DataFrame]) -> pd.Series:\n"
        "    return v + 1",
        self.local)

    plus_one = pandas_udf("long")(self.local["plus_one"])

    actual = df.select(plus_one(df.v).alias("plus_one"))
    expected = df.selectExpr("(v + 1) as plus_one")
    assert_frame_equal(expected.toPandas(), actual.toPandas())
def test_scalar_iter_udf_type_hint(self):
    df = self.spark.range(10).selectExpr("id", "id as v")
    exec(
        "import typing\n"
        "def plus_one(itr: typing.Iterator[pd.Series]) -> typing.Iterator[pd.Series]:\n"
        "    for s in itr:\n"
        "        yield s + 1",
        self.local)

    plus_one = pandas_udf("long")(self.local["plus_one"])

    actual = df.select(plus_one(df.v).alias("plus_one"))
    expected = df.selectExpr("(v + 1) as plus_one")
    assert_frame_equal(expected.toPandas(), actual.toPandas())
def mapInPandas(self, func, schema):
    """
    Maps an iterator of batches in the current :class:`DataFrame` using a Python native
    function that takes and outputs a pandas DataFrame, and returns the result as a
    :class:`DataFrame`.

    The function should take an iterator of `pandas.DataFrame`\\s and return
    another iterator of `pandas.DataFrame`\\s. All columns are passed
    together as an iterator of `pandas.DataFrame`\\s to the function and the
    returned iterator of `pandas.DataFrame`\\s is combined as a
    :class:`DataFrame`. The size of each `pandas.DataFrame` can be controlled by
    `spark.sql.execution.arrow.maxRecordsPerBatch`.

    :param func: a Python native function that takes an iterator of
        `pandas.DataFrame`\\s, and outputs an iterator of `pandas.DataFrame`\\s.
    :param schema: the return type of the `func` in PySpark. The value can be either a
        :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

    >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
    >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
    >>> def filter_func(batch_iter):
    ...     for pdf in batch_iter:
    ...         yield pdf[pdf.id == 1]
    >>> df.mapInPandas(filter_func, df.schema).show()  # doctest: +SKIP
    +---+---+
    | id|age|
    +---+---+
    |  1| 21|
    +---+---+

    .. seealso:: :meth:`pyspark.sql.functions.pandas_udf`

    .. note:: Experimental
    """
    from pyspark.sql import DataFrame
    from pyspark.sql.pandas.functions import pandas_udf

    assert isinstance(self, DataFrame)

    udf = pandas_udf(
        func, returnType=schema, functionType=PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
    udf_column = udf(*[self[col] for col in self.columns])
    jdf = self._jdf.mapInPandas(udf_column._jc.expr())
    return DataFrame(jdf, self.sql_ctx)
from collections import OrderedDict

import numpy as np

from pyspark.sql import functions as F, Column
from pyspark.sql.pandas.functions import pandas_udf
from pyspark.sql.types import DoubleType, LongType, BooleanType
from pyspark.pandas.base import IndexOpsMixin
from pyspark.pandas.spark import functions as SF

unary_np_spark_mappings = OrderedDict(
    {
        "abs": F.abs,
        "absolute": F.abs,
        "arccos": F.acos,
        "arccosh": pandas_udf(lambda s: np.arccosh(s), DoubleType()),  # type: ignore[call-overload]
        "arcsin": F.asin,
        "arcsinh": pandas_udf(lambda s: np.arcsinh(s), DoubleType()),  # type: ignore[call-overload]
        "arctan": F.atan,
        "arctanh": pandas_udf(lambda s: np.arctanh(s), DoubleType()),  # type: ignore[call-overload]
        "bitwise_not": F.bitwiseNOT,
        "cbrt": F.cbrt,
        "ceil": F.ceil,
from collections import OrderedDict
from typing import TYPE_CHECKING

import numpy as np

from pyspark.sql import functions as F, Column
from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import DoubleType, LongType, BooleanType

if TYPE_CHECKING:
    from pyspark.pandas.base import IndexOpsMixin

unary_np_spark_mappings = OrderedDict(
    {
        "abs": F.abs,
        "absolute": F.abs,
        "arccos": F.acos,
        "arccosh": pandas_udf(lambda s: np.arccosh(s), DoubleType(), PandasUDFType.SCALAR),
        "arcsin": F.asin,
        "arcsinh": pandas_udf(lambda s: np.arcsinh(s), DoubleType(), PandasUDFType.SCALAR),
        "arctan": F.atan,
        "arctanh": pandas_udf(lambda s: np.arctanh(s), DoubleType(), PandasUDFType.SCALAR),
        "bitwise_not": F.bitwiseNOT,
        "cbrt": F.cbrt,
        "ceil": F.ceil,
        # It requires complex type which pandas-on-Spark does not support yet
        "conj": lambda _: NotImplemented,
        "conjugate": lambda _: NotImplemented,  # It requires complex type
        "cos": F.cos,
        "cosh": pandas_udf(lambda s: np.cosh(s), DoubleType(), PandasUDFType.SCALAR),
        "deg2rad": pandas_udf(lambda s: np.deg2rad(s), DoubleType(), PandasUDFType.SCALAR),
        "degrees": F.degrees,
        "exp": F.exp,
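# Illustrative usage of the mapping above: look up a NumPy ufunc by name and
# apply the corresponding Spark expression to a column. This dispatch is a
# simplified sketch, not the actual pandas-on-Spark code path, and it assumes
# the full mapping dict from the original file (the snippet above is truncated).
import numpy as np
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.range(1, 6).withColumnRenamed("id", "x")

# np.arccosh.__name__ == "arccosh", which resolves to the pandas_udf entry
arccosh_expr = unary_np_spark_mappings[np.arccosh.__name__]
sdf.select(arccosh_expr(sdf["x"]).alias("arccosh_x")).show()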