Example 1
    def mapInPandas(
        self, func: "PandasMapIterFunction", schema: Union[StructType, str]
    ) -> "DataFrame":
        """
        Maps an iterator of batches in the current :class:`DataFrame` using a Python native
        function that takes and outputs a pandas DataFrame, and returns the result as a
        :class:`DataFrame`.

        The function should take an iterator of `pandas.DataFrame`\\s and return
        another iterator of `pandas.DataFrame`\\s. All columns are passed
        together as an iterator of `pandas.DataFrame`\\s to the function, and the
        returned iterator of `pandas.DataFrame`\\s is combined into a :class:`DataFrame`.
        The size of each `pandas.DataFrame` can be controlled by
        `spark.sql.execution.arrow.maxRecordsPerBatch`.

        .. versionadded:: 3.0.0

        Parameters
        ----------
        func : function
            a Python native function that takes an iterator of `pandas.DataFrame`\\s, and
            outputs an iterator of `pandas.DataFrame`\\s.
        schema : :class:`pyspark.sql.types.DataType` or str
            the return type of the `func` in PySpark. The value can be either a
            :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        Examples
        --------
        >>> from pyspark.sql.functions import pandas_udf
        >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
        >>> def filter_func(iterator):
        ...     for pdf in iterator:
        ...         yield pdf[pdf.id == 1]
        >>> df.mapInPandas(filter_func, df.schema).show()  # doctest: +SKIP
        +---+---+
        | id|age|
        +---+---+
        |  1| 21|
        +---+---+

        Notes
        -----
        This API is experimental.

        See Also
        --------
        pyspark.sql.functions.pandas_udf
        """
        from pyspark.sql import DataFrame
        from pyspark.sql.pandas.functions import pandas_udf

        assert isinstance(self, DataFrame)

        # The usage of the pandas_udf is internal so type checking is disabled.
        udf = pandas_udf(
            func, returnType=schema, functionType=PythonEvalType.SQL_MAP_PANDAS_ITER_UDF
        )  # type: ignore[call-overload]
        udf_column = udf(*[self[col] for col in self.columns])
        jdf = self._jdf.mapInPandas(udf_column._jc.expr())  # type: ignore[operator]
        return DataFrame(jdf, self.sql_ctx)
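
The docstring above also allows `schema` to be given as a DDL-formatted type string instead of a :class:`pyspark.sql.types.DataType`. A minimal self-contained sketch of that variant; the `double_age` function and its columns are illustrative and not part of the original example:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))

def double_age(batch_iter):
    # Receives an iterator of pandas.DataFrames and yields transformed batches.
    for pdf in batch_iter:
        pdf["age"] = pdf["age"] * 2
        yield pdf

# The return type is passed as a DDL-formatted string rather than df.schema.
df.mapInPandas(double_age, "id long, age long").show()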
Example 2
    def test_scalar_udf_type_hint(self):
        df = self.spark.range(10).selectExpr("id", "id as v")

        def plus_one(v: Union[pd.Series, pd.DataFrame]) -> pd.Series:
            return v + 1

        plus_one = pandas_udf("long")(plus_one)
        actual = df.select(plus_one(df.v).alias("plus_one"))
        expected = df.selectExpr("(v + 1) as plus_one")
        assert_frame_equal(expected.toPandas(), actual.toPandas())
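
For comparison, the same type-hinted scalar UDF can be declared with `pandas_udf` used as a decorator; a minimal sketch with its own SparkSession rather than the test fixture used above:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.getOrCreate()
df = spark.range(10).selectExpr("id", "id as v")

@pandas_udf("long")
def plus_one(v: pd.Series) -> pd.Series:
    # Series-to-Series type hints mark this as a scalar pandas UDF.
    return v + 1

df.select(plus_one(df.v).alias("plus_one")).show()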
Example 3
    def test_group_agg_udf_type_hint(self):
        df = self.spark.range(10).selectExpr("id", "id as v")

        def weighted_mean(v: pd.Series, w: pd.Series) -> float:
            return np.average(v, weights=w)

        weighted_mean = pandas_udf("double")(weighted_mean)

        actual = df.groupby("id").agg(weighted_mean(df.v, lit(1.0))).sort("id")
        expected = df.groupby("id").agg(mean(df.v).alias("weighted_mean(v, 1.0)")).sort("id")
        assert_frame_equal(expected.toPandas(), actual.toPandas())
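
A Series-to-scalar pandas UDF such as `weighted_mean` is treated as a grouped aggregate UDF, so besides `groupby().agg()` it can also be applied over an unbounded window; a minimal sketch with its own SparkSession, not taken from the test suite:

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import pandas_udf, lit

spark = SparkSession.builder.getOrCreate()
df = spark.range(10).selectExpr("id", "id as v")

@pandas_udf("double")
def weighted_mean(v: pd.Series, w: pd.Series) -> float:
    # Series-to-scalar type hints mark this as a grouped aggregate UDF.
    return float(np.average(v, weights=w))

# Grouped aggregate pandas UDFs also work over unbounded window frames.
w = Window.partitionBy("id").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
df.withColumn("wm", weighted_mean(df.v, lit(1.0)).over(w)).show()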
Example 4
    def test_scalar_iter_udf_type_hint(self):
        df = self.spark.range(10).selectExpr("id", "id as v")

        def plus_one(itr: Iterator[pd.Series]) -> Iterator[pd.Series]:
            for s in itr:
                yield s + 1

        plus_one = pandas_udf("long")(plus_one)

        actual = df.select(plus_one(df.v).alias("plus_one"))
        expected = df.selectExpr("(v + 1) as plus_one")
        assert_frame_equal(expected.toPandas(), actual.toPandas())
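
The iterator variant is mainly useful when the UDF needs expensive one-time setup per task (for example, loading a model) that should not be repeated for every batch; a minimal sketch where that setup is replaced by a plain constant:

from typing import Iterator

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import pandas_udf

spark = SparkSession.builder.getOrCreate()
df = spark.range(10).selectExpr("id", "id as v")

@pandas_udf("long")
def plus_offset(batches: Iterator[pd.Series]) -> Iterator[pd.Series]:
    offset = 1  # stand-in for one-time, per-task initialization
    for s in batches:
        yield s + offset

df.select(plus_offset(df.v).alias("plus_one")).show()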
Example 5
    def __init__(self,
                 spark: SparkSession,
                 df: pyspark.sql.DataFrame,
                 window_seconds: int = 10):
        self.spark = spark

        # patch time window
        self.df = patch_time_windows(df=df, window_seconds=window_seconds)

        # extract packet rate
        #self.df = (
        #self.df
        #.withColumn('packet_rate', col('packet') / col('duration'))
        #)

        # extract byte rate
        #self.df = (
        #self.df
        #.withColumn('byte_rate', col('num_of_bytes') / col('duration'))
        #)

        # udf functions of extraction methods
        self.extract_num_flow_udf = pandas_udf(self.extract_num_flow, 'double')
        self.mean_udf = pandas_udf(self.mean, 'double')
        self.std_udf = pandas_udf(self.std, 'double')
        self.entropy_udf = pandas_udf(self.entropy, 'double')
        self.port_proportion_udf = pandas_udf(self.port_proportion,
                                              'array<double>')
        self.build_label_udf = pandas_udf(self.build_label, 'string')
Example 6
    def test_group_agg_udf_type_hint(self):
        df = self.spark.range(10).selectExpr("id", "id as v")
        exec(
            "import numpy as np\n"
            "def weighted_mean(v: pd.Series, w: pd.Series) -> float:\n"
            "    return np.average(v, weights=w)", self.local)

        weighted_mean = pandas_udf("double")(self.local["weighted_mean"])

        actual = df.groupby('id').agg(weighted_mean(df.v, lit(1.0))).sort('id')
        expected = df.groupby('id').agg(
            mean(df.v).alias('weighted_mean(v, 1.0)')).sort('id')
        assert_frame_equal(expected.toPandas(), actual.toPandas())
Example 7
    def test_scalar_udf_type_hint(self):
        df = self.spark.range(10).selectExpr("id", "id as v")

        exec(
            "import typing\n"
            "def plus_one(v: typing.Union[pd.Series, pd.DataFrame]) -> pd.Series:\n"
            "    return v + 1", self.local)

        plus_one = pandas_udf("long")(self.local["plus_one"])

        actual = df.select(plus_one(df.v).alias("plus_one"))
        expected = df.selectExpr("(v + 1) as plus_one")
        assert_frame_equal(expected.toPandas(), actual.toPandas())
Example 8
    def test_scalar_iter_udf_type_hint(self):
        df = self.spark.range(10).selectExpr("id", "id as v")

        exec(
            "import typing\n"
            "def plus_one(itr: typing.Iterator[pd.Series]) -> typing.Iterator[pd.Series]:\n"
            "    for s in itr:\n"
            "        yield s + 1", self.local)

        plus_one = pandas_udf("long")(self.local["plus_one"])

        actual = df.select(plus_one(df.v).alias("plus_one"))
        expected = df.selectExpr("(v + 1) as plus_one")
        assert_frame_equal(expected.toPandas(), actual.toPandas())
Example 9
    def mapInPandas(self, func, schema):
        """
        Maps an iterator of batches in the current :class:`DataFrame` using a Python native
        function that takes and outputs a pandas DataFrame, and returns the result as a
        :class:`DataFrame`.

        The function should take an iterator of `pandas.DataFrame`\\s and return
        another iterator of `pandas.DataFrame`\\s. All columns are passed
        together as an iterator of `pandas.DataFrame`\\s to the function, and the
        returned iterator of `pandas.DataFrame`\\s is combined into a :class:`DataFrame`.
        The size of each `pandas.DataFrame` can be controlled by
        `spark.sql.execution.arrow.maxRecordsPerBatch`.

        :param func: a Python native function that takes an iterator of `pandas.DataFrame`\\s, and
            outputs an iterator of `pandas.DataFrame`\\s.
        :param schema: the return type of the `func` in PySpark. The value can be either a
            :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.

        >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
        >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))
        >>> def filter_func(batch_iter):
        ...     for pdf in batch_iter:
        ...         yield pdf[pdf.id == 1]
        >>> df.mapInPandas(filter_func, df.schema).show()  # doctest: +SKIP
        +---+---+
        | id|age|
        +---+---+
        |  1| 21|
        +---+---+

        .. seealso:: :meth:`pyspark.sql.functions.pandas_udf`

        .. note:: Experimental
        """
        from pyspark.sql import DataFrame
        from pyspark.sql.pandas.functions import pandas_udf

        assert isinstance(self, DataFrame)

        udf = pandas_udf(func,
                         returnType=schema,
                         functionType=PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
        udf_column = udf(*[self[col] for col in self.columns])
        jdf = self._jdf.mapInPandas(udf_column._jc.expr())
        return DataFrame(jdf, self.sql_ctx)
Example 10
from collections import OrderedDict

import numpy as np
from pyspark.sql import functions as F, Column
from pyspark.sql.pandas.functions import pandas_udf
from pyspark.sql.types import DoubleType, LongType, BooleanType

from pyspark.pandas.base import IndexOpsMixin
from pyspark.pandas.spark import functions as SF

unary_np_spark_mappings = OrderedDict({
    "abs": F.abs,
    "absolute": F.abs,
    "arccos": F.acos,
    "arccosh": pandas_udf(lambda s: np.arccosh(s), DoubleType()),  # type: ignore[call-overload]
    "arcsin": F.asin,
    "arcsinh": pandas_udf(lambda s: np.arcsinh(s), DoubleType()),  # type: ignore[call-overload]
    "arctan": F.atan,
    "arctanh": pandas_udf(lambda s: np.arctanh(s), DoubleType()),  # type: ignore[call-overload]
    "bitwise_not": F.bitwiseNOT,
    "cbrt": F.cbrt,
    "ceil":
Example 11
from collections import OrderedDict
from typing import TYPE_CHECKING

import numpy as np
from pyspark.sql import functions as F, Column
from pyspark.sql.pandas.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import DoubleType, LongType, BooleanType

if TYPE_CHECKING:
    from pyspark.pandas.base import IndexOpsMixin


unary_np_spark_mappings = OrderedDict(
    {
        "abs": F.abs,
        "absolute": F.abs,
        "arccos": F.acos,
        "arccosh": pandas_udf(lambda s: np.arccosh(s), DoubleType(), PandasUDFType.SCALAR),
        "arcsin": F.asin,
        "arcsinh": pandas_udf(lambda s: np.arcsinh(s), DoubleType(), PandasUDFType.SCALAR),
        "arctan": F.atan,
        "arctanh": pandas_udf(lambda s: np.arctanh(s), DoubleType(), PandasUDFType.SCALAR),
        "bitwise_not": F.bitwiseNOT,
        "cbrt": F.cbrt,
        "ceil": F.ceil,
        # It requires complex type which pandas-on-Spark does not support yet
        "conj": lambda _: NotImplemented,
        "conjugate": lambda _: NotImplemented,  # It requires complex type
        "cos": F.cos,
        "cosh": pandas_udf(lambda s: np.cosh(s), DoubleType(), PandasUDFType.SCALAR),
        "deg2rad": pandas_udf(lambda s: np.deg2rad(s), DoubleType(), PandasUDFType.SCALAR),
        "degrees": F.degrees,
        "exp": F.exp,