コード例 #1
0
ファイル: worker.py プロジェクト: XpressAI/spark
def wrap_bounded_window_agg_pandas_udf(f, return_type):
    arrow_return_type = to_arrow_type(return_type)

    def wrapped(begin_index, end_index, *series):
        import pandas as pd
        result = []

        # Index operation is faster on np.ndarray,
        # So we turn the index series into np array
        # here for performance
        begin_array = begin_index.values
        end_array = end_index.values

        for i in range(len(begin_array)):
            # Note: Create a slice from a series for each window is
            #       actually pretty expensive. However, there
            #       is no easy way to reduce cost here.
            # Note: s.iloc[i : j] is about 30% faster than s[i: j], with
            #       the caveat that the created slices shares the same
            #       memory with s. Therefore, user are not allowed to
            #       change the value of input series inside the window
            #       function. It is rare that user needs to modify the
            #       input series in the window function, and therefore,
            #       it is be a reasonable restriction.
            # Note: Calling reset_index on the slices will increase the cost
            #       of creating slices by about 100%. Therefore, for performance
            #       reasons we don't do it here.
            series_slices = [
                s.iloc[begin_array[i]:end_array[i]] for s in series
            ]
            result.append(f(*series_slices))
        return pd.Series(result)

    return lambda *a: (wrapped(*a), arrow_return_type)
コード例 #2
0
ファイル: worker.py プロジェクト: zhengruifeng/spark
def wrap_grouped_map_pandas_udf(f, return_type, argspec):
    def wrapped(key_series, value_series):
        import pandas as pd

        if len(argspec.args) == 1:
            result = f(pd.concat(value_series, axis=1))
        elif len(argspec.args) == 2:
            key = tuple(s[0] for s in key_series)
            result = f(key, pd.concat(value_series, axis=1))

        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                "Return type of the user-defined function should be "
                "pandas.DataFrame, but is {}".format(type(result))
            )
        # the number of columns of result have to match the return type
        # but it is fine for result to have no columns at all if it is empty
        if not (
            len(result.columns) == len(return_type) or len(result.columns) == 0 and result.empty
        ):
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type), len(result.columns))
            )
        return result

    return lambda k, v: [(wrapped(k, v), to_arrow_type(return_type))]
コード例 #3
0
ファイル: worker.py プロジェクト: XpressAI/spark
def wrap_cogrouped_map_pandas_udf(f, return_type, argspec):
    def wrapped(left_key_series, left_value_series, right_key_series,
                right_value_series):
        import pandas as pd

        left_df = pd.concat(left_value_series, axis=1)
        right_df = pd.concat(right_value_series, axis=1)

        if len(argspec.args) == 2:
            result = f(left_df, right_df)
        elif len(argspec.args) == 3:
            key_series = left_key_series if not left_df.empty else right_key_series
            key = tuple(s[0] for s in key_series)
            result = f(key, left_df, right_df)
        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                "Return type of the user-defined function should be "
                "pandas.DataFrame, but is {}".format(type(result)))
        if not len(result.columns) == len(return_type):
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type),
                                                 len(result.columns)))
        return result

    return lambda kl, vl, kr, vr: [(wrapped(kl, vl, kr, vr),
                                    to_arrow_type(return_type))]
コード例 #4
0
ファイル: worker.py プロジェクト: XpressAI/spark
def wrap_grouped_agg_pandas_udf(f, return_type):
    arrow_return_type = to_arrow_type(return_type)

    def wrapped(*series):
        import pandas as pd
        result = f(*series)
        return pd.Series([result])

    return lambda *a: (wrapped(*a), arrow_return_type)
コード例 #5
0
def wrap_pandas_iter_udf(f, return_type):
    arrow_return_type = to_arrow_type(return_type)

    def verify_result_type(result):
        if not hasattr(result, "__len__"):
            pd_type = "Pandas.DataFrame" if type(return_type) == StructType else "Pandas.Series"
            raise TypeError("Return type of the user-defined function should be "
                            "{}, but is {}".format(pd_type, type(result)))
        return result

    return lambda *iterator: map(lambda res: (res, arrow_return_type),
                                 map(verify_result_type, f(*iterator)))
コード例 #6
0
ファイル: worker.py プロジェクト: XpressAI/spark
def wrap_unbounded_window_agg_pandas_udf(f, return_type):
    # This is similar to grouped_agg_pandas_udf, the only difference
    # is that window_agg_pandas_udf needs to repeat the return value
    # to match window length, where grouped_agg_pandas_udf just returns
    # the scalar value.
    arrow_return_type = to_arrow_type(return_type)

    def wrapped(*series):
        import pandas as pd
        result = f(*series)
        return pd.Series([result]).repeat(len(series[0]))

    return lambda *a: (wrapped(*a), arrow_return_type)
コード例 #7
0
ファイル: typehints.py プロジェクト: Halo9Pan/dive-spark
def spark_type_to_pandas_dtype(
    spark_type: types.DataType, *, use_extension_dtypes: bool = False
) -> Dtype:
    """Return the given Spark DataType to pandas dtype."""

    if use_extension_dtypes and extension_dtypes_available:
        # IntegralType
        if isinstance(spark_type, types.ByteType):
            return Int8Dtype()
        elif isinstance(spark_type, types.ShortType):
            return Int16Dtype()
        elif isinstance(spark_type, types.IntegerType):
            return Int32Dtype()
        elif isinstance(spark_type, types.LongType):
            return Int64Dtype()

        if extension_object_dtypes_available:
            # BooleanType
            if isinstance(spark_type, types.BooleanType):
                return BooleanDtype()
            # StringType
            elif isinstance(spark_type, types.StringType):
                return StringDtype()

        # FractionalType
        if extension_float_dtypes_available:
            if isinstance(spark_type, types.FloatType):
                return Float32Dtype()
            elif isinstance(spark_type, types.DoubleType):
                return Float64Dtype()

    if isinstance(
        spark_type,
        (
            types.DateType,
            types.NullType,
            types.ArrayType,
            types.MapType,
            types.StructType,
            types.UserDefinedType,
        ),
    ):
        return np.dtype("object")
    elif isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    else:
        return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
コード例 #8
0
def wrap_scalar_pandas_udf(f, return_type):
    arrow_return_type = to_arrow_type(return_type)

    def verify_result_type(result):
        if not hasattr(result, "__len__"):
            pd_type = "Pandas.DataFrame" if type(return_type) == StructType else "Pandas.Series"
            raise TypeError("Return type of the user-defined function should be "
                            "{}, but is {}".format(pd_type, type(result)))
        return result

    def verify_result_length(result, length):
        if len(result) != length:
            raise RuntimeError("Result vector from pandas_udf was not the required length: "
                               "expected %d, got %d" % (length, len(result)))
        return result

    return lambda *a: (verify_result_length(
        verify_result_type(f(*a)), len(a[0])), arrow_return_type)
コード例 #9
0
    def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
        """
        Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
        to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
        data types will be used to coerce the data in Pandas to Arrow conversion.
        """
        from pyspark.sql import SparkSession
        from pyspark.sql.dataframe import DataFrame

        assert isinstance(self, SparkSession)

        from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
        from pyspark.sql.types import TimestampType
        from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type
        from pyspark.sql.pandas.utils import require_minimum_pandas_version, \
            require_minimum_pyarrow_version

        require_minimum_pandas_version()
        require_minimum_pyarrow_version()

        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
        import pyarrow as pa

        # Create the Spark schema from list of names passed in with Arrow types
        if isinstance(schema, (list, tuple)):
            arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
            struct = StructType()
            prefer_timestamp_ntz = is_timestamp_ntz_preferred()
            for name, field in zip(schema, arrow_schema):
                struct.add(name,
                           from_arrow_type(field.type, prefer_timestamp_ntz),
                           nullable=field.nullable)
            schema = struct

        # Determine arrow types to coerce data when creating batches
        if isinstance(schema, StructType):
            arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
        elif isinstance(schema, DataType):
            raise ValueError(
                "Single data type %s is not supported with Arrow" %
                str(schema))
        else:
            # Any timestamps must be coerced to be compatible with Spark
            arrow_types = [
                to_arrow_type(TimestampType())
                if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                for t in pdf.dtypes
            ]

        # Slice the DataFrame to be batched
        step = -(-len(pdf) // self.sparkContext.defaultParallelism
                 )  # round int up
        pdf_slices = (pdf.iloc[start:start + step]
                      for start in range(0, len(pdf), step))

        # Create list of Arrow (columns, type) for serializer dump_stream
        arrow_data = [[(c, t)
                       for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)
                       ] for pdf_slice in pdf_slices]

        jsqlContext = self._wrapped._jsqlContext

        safecheck = self._wrapped._conf.arrowSafeTypeConversion()
        col_by_name = True  # col by name only applies to StructType columns, can't happen here
        ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

        def reader_func(temp_filename):
            return self._jvm.PythonSQLUtils.readArrowStreamFromFile(
                jsqlContext, temp_filename)

        def create_RDD_server():
            return self._jvm.ArrowRDDServer(jsqlContext)

        # Create Spark DataFrame from Arrow stream file, using one batch per partition
        jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func,
                                          create_RDD_server)
        jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(),
                                                   jsqlContext)
        df = DataFrame(jdf, self._wrapped)
        df._schema = schema
        return df
コード例 #10
0
    def returnType(self) -> DataType:
        # This makes sure this is called after SparkContext is initialized.
        # ``_parse_datatype_string`` accesses to JVM for parsing a DDL formatted string.
        if self._returnType_placeholder is None:
            if isinstance(self._returnType, DataType):
                self._returnType_placeholder = self._returnType
            else:
                self._returnType_placeholder = _parse_datatype_string(
                    self._returnType)

        if self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF or \
                self.evalType == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF:
            try:
                to_arrow_type(self._returnType_placeholder)
            except TypeError:
                raise NotImplementedError(
                    "Invalid return type with scalar Pandas UDFs: %s is "
                    "not supported" % str(self._returnType_placeholder))
        elif self.evalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
            if isinstance(self._returnType_placeholder, StructType):
                try:
                    to_arrow_type(self._returnType_placeholder)
                except TypeError:
                    raise NotImplementedError(
                        "Invalid return type with grouped map Pandas UDFs or "
                        "at groupby.applyInPandas: %s is not supported" %
                        str(self._returnType_placeholder))
            else:
                raise TypeError(
                    "Invalid return type for grouped map Pandas "
                    "UDFs or at groupby.applyInPandas: return type must be a "
                    "StructType.")
        elif self.evalType == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF:
            if isinstance(self._returnType_placeholder, StructType):
                try:
                    to_arrow_type(self._returnType_placeholder)
                except TypeError:
                    raise NotImplementedError(
                        "Invalid return type in mapInPandas: "
                        "%s is not supported" %
                        str(self._returnType_placeholder))
            else:
                raise TypeError("Invalid return type in mapInPandas: "
                                "return type must be a StructType.")
        elif self.evalType == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
            if isinstance(self._returnType_placeholder, StructType):
                try:
                    to_arrow_type(self._returnType_placeholder)
                except TypeError:
                    raise NotImplementedError(
                        "Invalid return type in cogroup.applyInPandas: "
                        "%s is not supported" %
                        str(self._returnType_placeholder))
            else:
                raise TypeError(
                    "Invalid return type in cogroup.applyInPandas: "
                    "return type must be a StructType.")
        elif self.evalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
            try:
                # StructType is not yet allowed as a return type, explicitly check here to fail fast
                if isinstance(self._returnType_placeholder, StructType):
                    raise TypeError
                to_arrow_type(self._returnType_placeholder)
            except TypeError:
                raise NotImplementedError(
                    "Invalid  return type with grouped aggregate Pandas UDFs: "
                    "%s is not supported" % str(self._returnType_placeholder))

        return self._returnType_placeholder