def read_udfs(pickleSer, infile, eval_type): num_udfs = read_int(infile) udfs = {} call_udf = [] for i in range(num_udfs): arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type) udfs['f%d' % i] = udf args = ["a[%d]" % o for o in arg_offsets] call_udf.append("f%d(%s)" % (i, ", ".join(args))) # Create function like this: # lambda a: (f0(a0), f1(a1, a2), f2(a3)) # In the special case of a single UDF this will return a single result rather # than a tuple of results; this is the format that the JVM side expects. mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) mapper = eval(mapper_str, udfs) func = lambda _, it: map(mapper, it) if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF): timezone = utf8_deserializer.loads(infile) ser = ArrowStreamPandasSerializer(timezone) else: ser = BatchedSerializer(PickleSerializer(), 100) # profiling is not supported for UDF return func, None, ser, ser
def read_udfs(pickleSer, infile, eval_type): runner_conf = {} if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF, PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF): # Load conf used for pandas_udf evaluation num_conf = read_int(infile) for i in range(num_conf): k = utf8_deserializer.loads(infile) v = utf8_deserializer.loads(infile) runner_conf[k] = v # NOTE: if timezone is set here, that implies respectSessionTimeZone is True timezone = runner_conf.get("spark.sql.session.timeZone", None) ser = ArrowStreamPandasSerializer(timezone) else: ser = BatchedSerializer(PickleSerializer(), 100) num_udfs = read_int(infile) udfs = {} call_udf = [] mapper_str = "" if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF: # Create function like this: # lambda a: f([a[0]], [a[0], a[1]]) # We assume there is only one UDF here because grouped map doesn't # support combining multiple UDFs. assert num_udfs == 1 # See FlatMapGroupsInPandasExec for how arg_offsets are used to # distinguish between grouping attributes and data attributes arg_offsets, udf = read_single_udf( pickleSer, infile, eval_type, runner_conf, udf_index=0) udfs['f'] = udf split_offset = arg_offsets[0] + 1 arg0 = ["a[%d]" % o for o in arg_offsets[1: split_offset]] arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]] mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0), ", ".join(arg1)) else: # Create function like this: # lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3])) # In the special case of a single UDF this will return a single result rather # than a tuple of results; this is the format that the JVM side expects. for i in range(num_udfs): arg_offsets, udf = read_single_udf( pickleSer, infile, eval_type, runner_conf, udf_index=i) udfs['f%d' % i] = udf args = ["a[%d]" % o for o in arg_offsets] call_udf.append("f%d(%s)" % (i, ", ".join(args))) mapper_str = "lambda a: (%s)" % (", ".join(call_udf)) mapper = eval(mapper_str, udfs) func = lambda _, it: map(mapper, it) # profiling is not supported for UDF return func, None, ser, ser
def _create_from_pandas_with_arrow(self, pdf, schema, timezone): """ Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the data types will be used to coerce the data in Pandas to Arrow conversion. """ from distutils.version import LooseVersion from pyspark.serializers import ArrowStreamPandasSerializer from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType from pyspark.sql.utils import require_minimum_pandas_version, \ require_minimum_pyarrow_version require_minimum_pandas_version() require_minimum_pyarrow_version() from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype import pyarrow as pa # Create the Spark schema from list of names passed in with Arrow types if isinstance(schema, (list, tuple)): if LooseVersion(pa.__version__) < LooseVersion("0.12.0"): temp_batch = pa.RecordBatch.from_pandas(pdf[0:100], preserve_index=False) arrow_schema = temp_batch.schema else: arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False) struct = StructType() for name, field in zip(schema, arrow_schema): struct.add(name, from_arrow_type(field.type), nullable=field.nullable) schema = struct # Determine arrow types to coerce data when creating batches if isinstance(schema, StructType): arrow_types = [to_arrow_type(f.dataType) for f in schema.fields] elif isinstance(schema, DataType): raise ValueError("Single data type %s is not supported with Arrow" % str(schema)) else: # Any timestamps must be coerced to be compatible with Spark arrow_types = [to_arrow_type(TimestampType()) if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None for t in pdf.dtypes] # Slice the DataFrame to be batched step = -(-len(pdf) // self.sparkContext.defaultParallelism) # round int up pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step)) # Create list of Arrow (columns, type) for serializer dump_stream arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)] for pdf_slice in pdf_slices] jsqlContext = self._wrapped._jsqlContext safecheck = self._wrapped._conf.arrowSafeTypeConversion() col_by_name = True # col by name only applies to StructType columns, can't happen here ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name) def reader_func(temp_filename): return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename) def create_RDD_server(): return self._jvm.ArrowRDDServer(jsqlContext) # Create Spark DataFrame from Arrow stream file, using one batch per partition jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server) jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext) df = DataFrame(jdf, self._wrapped) df._schema = schema return df