def wrap_bounded_window_agg_pandas_udf(f, return_type):
    """Wrap an aggregating pandas UDF so it is evaluated once per bounded window.

    The returned callable takes ``(begin_index, end_index, *series)``. For every
    position ``i`` it calls ``f`` with the slices
    ``s.iloc[begin_index[i]:end_index[i]]`` of each input series and collects the
    results. It returns a tuple of ``(pandas.Series of results, arrow type)``,
    where the arrow type is ``to_arrow_type(return_type)``.
    """
    arrow_return_type = to_arrow_type(return_type)

    def wrapped(begin_index, end_index, *series):
        import pandas as pd

        # Indexing an np.ndarray is faster than indexing a Series, so the
        # window bounds are unwrapped once up front.
        window_starts = begin_index.values
        window_stops = end_index.values

        # Notes on the slicing below:
        # - Building a slice of every series for every window is fairly
        #   expensive, but there is no easy way to reduce that cost.
        # - s.iloc[i:j] is about 30% faster than s[i:j]. The caveat is that the
        #   slices share memory with s, so the user function must not modify
        #   its inputs. Needing to do so is rare, which makes this a reasonable
        #   restriction.
        # - reset_index on the slices would roughly double the cost of creating
        #   them, so it is deliberately not called.
        per_window = []
        for position, start in enumerate(window_starts):
            stop = window_stops[position]
            window_args = [s.iloc[start:stop] for s in series]
            per_window.append(f(*window_args))
        return pd.Series(per_window)

    return lambda *a: (wrapped(*a), arrow_return_type)
def wrap_grouped_map_pandas_udf(f, return_type, argspec):
    """Wrap a grouped-map pandas UDF and validate the DataFrame it returns.

    ``f`` is called as ``f(pdf)`` when ``argspec.args`` has one entry and as
    ``f(key, pdf)`` when it has two. ``pdf`` is the column-wise concatenation of
    the group's value series and ``key`` is a tuple built from the first element
    of every key series.

    The returned callable takes ``(key_series, value_series)`` and returns a
    one-element list ``[(result, to_arrow_type(return_type))]``.

    Raises (from the returned callable):
        ValueError: ``argspec.args`` has neither one nor two entries.
        TypeError: ``f`` did not return a ``pandas.DataFrame``.
        RuntimeError: the result's column count differs from
            ``len(return_type)``, unless the result is empty and has no columns.
    """

    def wrapped(key_series, value_series):
        import pandas as pd

        arg_count = len(argspec.args)
        if arg_count == 1:
            result = f(pd.concat(value_series, axis=1))
        elif arg_count == 2:
            key = tuple(s[0] for s in key_series)
            result = f(key, pd.concat(value_series, axis=1))
        else:
            # Without this branch `result` would stay unbound and surface as an
            # opaque UnboundLocalError further down.
            raise ValueError(
                "Invalid function: the grouped map function must take either one "
                "argument (data) or two arguments (key, data), but it takes "
                "{} argument(s)".format(arg_count)
            )

        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                "Return type of the user-defined function should be "
                "pandas.DataFrame, but is {}".format(type(result))
            )

        # The number of columns of the result has to match the return type,
        # but it is fine for the result to have no columns at all if it is empty.
        column_count = len(result.columns)
        if column_count != len(return_type) and not (column_count == 0 and result.empty):
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type), column_count)
            )
        return result

    return lambda k, v: [(wrapped(k, v), to_arrow_type(return_type))]
def wrap_cogrouped_map_pandas_udf(f, return_type, argspec):
    """Wrap a cogrouped-map pandas UDF and validate the DataFrame it returns.

    ``f`` is called as ``f(left_df, right_df)`` when ``argspec.args`` has two
    entries and as ``f(key, left_df, right_df)`` when it has three. Each
    DataFrame is the column-wise concatenation of that side's value series.
    ``key`` is a tuple of the first element of every key series, taken from the
    left side unless the left DataFrame is empty, in which case the right side's
    key series are used.

    The returned callable takes
    ``(left_keys, left_values, right_keys, right_values)`` and returns a
    one-element list ``[(result, to_arrow_type(return_type))]``.

    Raises (from the returned callable):
        ValueError: ``argspec.args`` has neither two nor three entries.
        TypeError: ``f`` did not return a ``pandas.DataFrame``.
        RuntimeError: the result's column count differs from
            ``len(return_type)``.
    """

    def wrapped(left_key_series, left_value_series, right_key_series, right_value_series):
        import pandas as pd

        left_df = pd.concat(left_value_series, axis=1)
        right_df = pd.concat(right_value_series, axis=1)

        arg_count = len(argspec.args)
        if arg_count == 2:
            result = f(left_df, right_df)
        elif arg_count == 3:
            # An empty left side has no row to read the key from, so fall back
            # to the right side's key series.
            key_series = left_key_series if not left_df.empty else right_key_series
            key = tuple(s[0] for s in key_series)
            result = f(key, left_df, right_df)
        else:
            # Without this branch `result` would stay unbound and surface as an
            # opaque UnboundLocalError further down.
            raise ValueError(
                "Invalid function: the cogrouped map function must take either two "
                "arguments (left, right) or three arguments (key, left, right), "
                "but it takes {} argument(s)".format(arg_count)
            )

        if not isinstance(result, pd.DataFrame):
            raise TypeError(
                "Return type of the user-defined function should be "
                "pandas.DataFrame, but is {}".format(type(result))
            )
        if len(result.columns) != len(return_type):
            raise RuntimeError(
                "Number of columns of the returned pandas.DataFrame "
                "doesn't match specified schema. "
                "Expected: {} Actual: {}".format(len(return_type), len(result.columns))
            )
        return result

    return lambda kl, vl, kr, vr: [(wrapped(kl, vl, kr, vr), to_arrow_type(return_type))]
def wrap_grouped_agg_pandas_udf(f, return_type):
    """Wrap a grouped-aggregate pandas UDF.

    The returned callable passes its positional arguments straight to ``f`` and
    returns ``(one-element pandas.Series holding f's result, arrow type)``,
    where the arrow type is ``to_arrow_type(return_type)``.
    """
    arrow_return_type = to_arrow_type(return_type)

    def wrapped(*series):
        import pandas as pd

        # The aggregate yields a single value per group; box it in a Series.
        return pd.Series([f(*series)])

    return lambda *a: (wrapped(*a), arrow_return_type)
def wrap_pandas_iter_udf(f, return_type):
    """Wrap an iterator-style pandas UDF.

    The returned callable invokes ``f(*iterator)`` and lazily yields
    ``(result, arrow type)`` for every element ``f`` produces, where the arrow
    type is ``to_arrow_type(return_type)``.

    Raises (while iterating the output):
        TypeError: an element produced by ``f`` has no ``__len__``.
    """
    arrow_return_type = to_arrow_type(return_type)

    def verify_result_type(result):
        # Anything without a length cannot be a Series/DataFrame.
        if not hasattr(result, "__len__"):
            # isinstance (rather than an exact type comparison) so StructType
            # subclasses are reported as DataFrame-returning as well.
            pd_type = (
                "Pandas.DataFrame" if isinstance(return_type, StructType) else "Pandas.Series"
            )
            raise TypeError(
                "Return type of the user-defined function should be "
                "{}, but is {}".format(pd_type, type(result))
            )
        return result

    # f(*iterator) is evaluated as soon as the lambda is called; only the
    # per-element verification is deferred to iteration time.
    return lambda *iterator: (
        (verify_result_type(res), arrow_return_type) for res in f(*iterator)
    )
def wrap_unbounded_window_agg_pandas_udf(f, return_type):
    """Wrap an aggregating pandas UDF evaluated over an unbounded window.

    This works like the grouped-aggregate wrapper, except that the single
    aggregated value is repeated ``len(series[0])`` times to match the window
    length. The grouped-aggregate wrapper returns it once.

    The returned callable yields ``(pandas.Series, arrow type)``, where the
    arrow type is ``to_arrow_type(return_type)``.
    """
    arrow_return_type = to_arrow_type(return_type)

    def wrapped(*series):
        import pandas as pd

        single_value = pd.Series([f(*series)])
        # The first input series determines how many rows to emit.
        return single_value.repeat(len(series[0]))

    return lambda *a: (wrapped(*a), arrow_return_type)
def spark_type_to_pandas_dtype(
    spark_type: types.DataType, *, use_extension_dtypes: bool = False
) -> Dtype:
    """Map a Spark DataType to the pandas/numpy dtype used to represent it.

    When ``use_extension_dtypes`` is set and the matching availability flags are
    true, integral, boolean, string and fractional Spark types map to pandas
    extension dtypes. Otherwise date, null, array, map, struct and user-defined
    types map to ``object``, timestamps map to ``datetime64[ns]``, and anything
    else uses the pandas dtype of its Arrow type.
    """
    if use_extension_dtypes and extension_dtypes_available:
        # IntegralType
        integral_dtypes = (
            (types.ByteType, Int8Dtype),
            (types.ShortType, Int16Dtype),
            (types.IntegerType, Int32Dtype),
            (types.LongType, Int64Dtype),
        )
        for spark_cls, dtype_cls in integral_dtypes:
            if isinstance(spark_type, spark_cls):
                return dtype_cls()

        if extension_object_dtypes_available:
            # BooleanType / StringType
            object_dtypes = (
                (types.BooleanType, BooleanDtype),
                (types.StringType, StringDtype),
            )
            for spark_cls, dtype_cls in object_dtypes:
                if isinstance(spark_type, spark_cls):
                    return dtype_cls()

        # FractionalType
        if extension_float_dtypes_available:
            fractional_dtypes = (
                (types.FloatType, Float32Dtype),
                (types.DoubleType, Float64Dtype),
            )
            for spark_cls, dtype_cls in fractional_dtypes:
                if isinstance(spark_type, spark_cls):
                    return dtype_cls()

    object_backed_types = (
        types.DateType,
        types.NullType,
        types.ArrayType,
        types.MapType,
        types.StructType,
        types.UserDefinedType,
    )
    if isinstance(spark_type, object_backed_types):
        return np.dtype("object")
    if isinstance(spark_type, types.TimestampType):
        return np.dtype("datetime64[ns]")
    # Everything else: go through Arrow to find the pandas representation.
    return np.dtype(to_arrow_type(spark_type).to_pandas_dtype())
def wrap_scalar_pandas_udf(f, return_type):
    """Wrap a scalar pandas UDF and validate the shape of its result.

    The returned callable passes its positional arguments to ``f`` and returns
    ``(result, arrow type)``, where the arrow type is
    ``to_arrow_type(return_type)``.

    Raises (from the returned callable):
        TypeError: the result of ``f`` has no ``__len__``.
        RuntimeError: the result's length differs from the length of the first
            argument.
    """
    arrow_return_type = to_arrow_type(return_type)

    def verify_result_type(result):
        # Anything without a length cannot be a Series/DataFrame.
        if not hasattr(result, "__len__"):
            # isinstance (rather than an exact type comparison) so StructType
            # subclasses are reported as DataFrame-returning as well.
            pd_type = (
                "Pandas.DataFrame" if isinstance(return_type, StructType) else "Pandas.Series"
            )
            raise TypeError(
                "Return type of the user-defined function should be "
                "{}, but is {}".format(pd_type, type(result))
            )
        return result

    def verify_result_length(result, length):
        if len(result) != length:
            raise RuntimeError(
                "Result vector from pandas_udf was not the required length: "
                "expected %d, got %d" % (length, len(result))
            )
        return result

    # The expected length is taken from the first input argument.
    return lambda *a: (
        verify_result_length(verify_result_type(f(*a)), len(a[0])),
        arrow_return_type,
    )
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.

    Parameters
    ----------
    pdf : pandas.DataFrame
        The data to convert.
    schema : list, tuple, StructType or other
        A list/tuple of column names (types are then taken from the Arrow schema
        inferred from ``pdf``), or a ``StructType`` whose field types are used for
        coercion. For any other value only datetime columns are coerced to the
        Spark timestamp type.
    timezone
        Passed through to ``ArrowStreamPandasSerializer``.

    Returns
    -------
    DataFrame
        The new DataFrame, with ``_schema`` set to the resolved schema.

    Raises
    ------
    ValueError
        If ``schema`` is a single non-struct ``DataType``.
    """
    from pyspark.sql import SparkSession
    from pyspark.sql.dataframe import DataFrame

    assert isinstance(self, SparkSession)

    from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
    from pyspark.sql.types import TimestampType
    from pyspark.sql.pandas.types import from_arrow_type, to_arrow_type
    from pyspark.sql.pandas.utils import (
        require_minimum_pandas_version,
        require_minimum_pyarrow_version,
    )

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
    import pyarrow as pa

    # Create the Spark schema from list of names passed in with Arrow types
    if isinstance(schema, (list, tuple)):
        arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
        struct = StructType()
        prefer_timestamp_ntz = is_timestamp_ntz_preferred()
        for name, field in zip(schema, arrow_schema):
            struct.add(
                name,
                from_arrow_type(field.type, prefer_timestamp_ntz),
                nullable=field.nullable,
            )
        schema = struct

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [
            to_arrow_type(TimestampType())
            if is_datetime64_dtype(t) or is_datetime64tz_dtype(t)
            else None
            for t in pdf.dtypes
        ]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf.iloc[start:start + step] for start in range(0, len(pdf), step))

    # Create list of Arrow (columns, type) for serializer dump_stream.
    # DataFrame.items() is used instead of iteritems(): iteritems() was removed
    # in pandas 2.0, while items() yields the same (name, Series) pairs on old
    # and new pandas alike.
    arrow_data = [
        [(c, t) for (_, c), t in zip(pdf_slice.items(), arrow_types)]
        for pdf_slice in pdf_slices
    ]

    jsqlContext = self._wrapped._jsqlContext

    safecheck = self._wrapped._conf.arrowSafeTypeConversion()
    col_by_name = True  # col by name only applies to StructType columns, can't happen here
    ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df
def returnType(self) -> DataType:
    """Resolve, validate and return this UDF's return type.

    The declared return type is resolved once and cached in
    ``_returnType_placeholder``. A ``DataType`` is used as is; anything else is
    parsed with ``_parse_datatype_string``. The resolved type is then checked
    against the UDF's ``evalType`` on every call.

    Raises:
        NotImplementedError: ``to_arrow_type`` rejects the resolved type with a
            ``TypeError``, or a grouped aggregate UDF declares a ``StructType``.
        TypeError: a grouped map, map-in-pandas or cogrouped map UDF declares
            a return type that is not a ``StructType``.
    """
    # Resolution is deferred to here so that it runs after SparkContext is
    # initialized: ``_parse_datatype_string`` accesses the JVM to parse a
    # DDL-formatted string.
    if self._returnType_placeholder is None:
        declared = self._returnType
        self._returnType_placeholder = (
            declared if isinstance(declared, DataType) else _parse_datatype_string(declared)
        )

    resolved = self._returnType_placeholder
    eval_type = self.evalType

    if eval_type in (
        PythonEvalType.SQL_SCALAR_PANDAS_UDF,
        PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
    ):
        try:
            to_arrow_type(resolved)
        except TypeError:
            raise NotImplementedError(
                "Invalid return type with scalar Pandas UDFs: %s is not supported"
                % str(resolved)
            )
    elif eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        if not isinstance(resolved, StructType):
            raise TypeError(
                "Invalid return type for grouped map Pandas UDFs or at "
                "groupby.applyInPandas: return type must be a StructType."
            )
        try:
            to_arrow_type(resolved)
        except TypeError:
            raise NotImplementedError(
                "Invalid return type with grouped map Pandas UDFs or at "
                "groupby.applyInPandas: %s is not supported" % str(resolved)
            )
    elif eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF:
        if not isinstance(resolved, StructType):
            raise TypeError(
                "Invalid return type in mapInPandas: return type must be a StructType."
            )
        try:
            to_arrow_type(resolved)
        except TypeError:
            raise NotImplementedError(
                "Invalid return type in mapInPandas: %s is not supported" % str(resolved)
            )
    elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        if not isinstance(resolved, StructType):
            raise TypeError(
                "Invalid return type in cogroup.applyInPandas: "
                "return type must be a StructType."
            )
        try:
            to_arrow_type(resolved)
        except TypeError:
            raise NotImplementedError(
                "Invalid return type in cogroup.applyInPandas: %s is not supported"
                % str(resolved)
            )
    elif eval_type == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF:
        try:
            # StructType is not yet allowed as a return type; raising here
            # routes it through the same handler to fail fast.
            if isinstance(resolved, StructType):
                raise TypeError
            to_arrow_type(resolved)
        except TypeError:
            raise NotImplementedError(
                "Invalid return type with grouped aggregate Pandas UDFs: "
                "%s is not supported" % str(resolved)
            )

    return self._returnType_placeholder