Example #1
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v
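        # (wire format: read_int consumes a 4-byte count, then each conf entry
        # is a pair of length-prefixed UTF-8 strings read by utf8_deserializer)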

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get(
            "spark.sql.execution.pandas.arrowSafeTypeConversion",
            "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
            ser = CogroupUDFSerializer(timezone, safecheck,
                                       assign_cols_by_name)
        else:
            # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
            # pandas Series. See SPARK-27240.
            df_for_struct = (
                eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
                or eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
                or eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
            ser = ArrowStreamPandasUDFSerializer(timezone, safecheck,
                                                 assign_cols_by_name,
                                                 df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        if is_scalar_iter:
            assert num_udfs == 1, "One SCALAR_ITER UDF expected here."
        if is_map_iter:
            assert num_udfs == 1, "One MAP_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)

        def func(_, iterator):
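            # A one-element list gives the nested map_batch closure a mutable
            # cell for the running row count.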
            num_input_rows = [0]

            def map_batch(batch):
                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows[0] += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                assert is_map_iter or num_output_rows <= num_input_rows[0], \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError(
                        "SQL_SCALAR_PANDAS_ITER_UDF should exhaust the input "
                        "iterator.")

            if is_scalar_iter and num_output_rows != num_input_rows[0]:
                raise RuntimeError(
                    "The number of output rows of a pandas iterator UDF should "
                    "equal the number of input rows; the input has %d rows but "
                    "the output has %d rows." %
                    (num_input_rows[0], num_output_rows))

        # profiling is not supported for UDF
        return func, None, ser, ser

    def extract_key_value_indexes(grouped_arg_offsets):
        """
        Helper function to extract the key and value indexes from arg_offsets for the grouped and
        cogrouped pandas udfs. See BasePandasGroupExec.resolveArgOffsets for the equivalent Scala code.

        :param grouped_arg_offsets:  List containing the key and value indexes of columns of the
            DataFrames to be passed to the udf. It consists of n repeating groups where n is the
            number of DataFrames.  Each group has the following format:
                group[0]: length of group
                group[1]: length of key indexes
                group[2 .. group[1]+2]: key attributes
                group[group[1]+3 .. group[0]]: value attributes
        """
        parsed = []
        idx = 0
        while idx < len(grouped_arg_offsets):
            offsets_len = grouped_arg_offsets[idx]
            idx += 1
            offsets = grouped_arg_offsets[idx:idx + offsets_len]
            split_index = offsets[0] + 1
            offset_keys = offsets[1:split_index]
            offset_values = offsets[split_index:]
            parsed.append([offset_keys, offset_values])
            idx += offsets_len
        return parsed
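
    # Worked example (illustrative values, not taken from the wire protocol):
    # a single grouped DataFrame with one key column at offset 0 and value
    # columns at offsets 0 and 1 arrives as grouped_arg_offsets = [4, 1, 0, 0, 1];
    # the loop reads offsets_len=4, offsets=[1, 0, 0, 1], split_index=2, and
    # extract_key_value_indexes returns [[[0], [0, 1]]].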

    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)
        udfs['f'] = udf
        parsed_offsets = extract_key_value_indexes(arg_offsets)
        keys = ["a[%d]" % (o, ) for o in parsed_offsets[0][0]]
        vals = ["a[%d]" % (o, ) for o in parsed_offsets[0][1]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(keys),
                                                  ", ".join(vals))
    elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because cogrouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1
        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)
        udfs['f'] = udf
        parsed_offsets = extract_key_value_indexes(arg_offsets)
        df1_keys = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][0]]
        df1_vals = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][1]]
        df2_keys = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][0]]
        df2_vals = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][1]]
        mapper_str = "lambda a: f([%s], [%s], [%s], [%s])" % (
            ", ".join(df1_keys), ", ".join(df1_vals), ", ".join(df2_keys),
            ", ".join(df2_vals))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer,
                                               infile,
                                               eval_type,
                                               runner_conf,
                                               udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
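
The mapper_str/eval pattern above is the core composition trick: the worker
builds the source of a one-line lambda that fans the batch columns out to each
UDF, then compiles it once with eval so per-batch dispatch is a single call. A
minimal standalone sketch of the same pattern (f0, f1 and the column offsets
are hypothetical stand-ins for what read_single_udf would return):

# Hedged, self-contained sketch of the mapper_str/eval composition; the
# "UDFs" here are plain lambdas over Python lists rather than pandas Series.
udfs = {
    'f0': lambda col: [x * 2 for x in col],
    'f1': lambda c1, c2: [x + y for x, y in zip(c1, c2)],
}
call_udf = ["f0(a[0])", "f1(a[1], a[2])"]
mapper_str = "lambda a: (%s)" % ", ".join(call_udf)
# mapper_str == "lambda a: (f0(a[0]), f1(a[1], a[2]))"
mapper = eval(mapper_str, udfs)

batch = ([1, 2], [10, 20], [30, 40])  # one "batch" with three columns
print(mapper(batch))                  # ([2, 4], [40, 60])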
Example #2
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get(
            "spark.sql.execution.pandas.arrowSafeTypeConversion",
            "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
        # pandas Series. See SPARK-27240.
        df_for_struct = (
            eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
            or eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
            or eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
        ser = ArrowStreamPandasUDFSerializer(timezone, safecheck,
                                             assign_cols_by_name,
                                             df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        if is_scalar_iter:
            assert num_udfs == 1, "One SCALAR_ITER UDF expected here."
        if is_map_iter:
            assert num_udfs == 1, "One MAP_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)

        def func(_, iterator):
            num_input_rows = [0]

            def map_batch(batch):
                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows[0] += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                assert is_map_iter or num_output_rows <= num_input_rows[0], \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError(
                        "SQL_SCALAR_PANDAS_ITER_UDF should exhaust the input "
                        "iterator.")

            if is_scalar_iter and num_output_rows != num_input_rows[0]:
                raise RuntimeError(
                    "The number of output rows of a pandas iterator UDF should "
                    "equal the number of input rows; the input has %d rows but "
                    "the output has %d rows." %
                    (num_input_rows[0], num_output_rows))

        # profiling is not supported for UDF
        return func, None, ser, ser

    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1:split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
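        # Worked example (illustrative offsets): arg_offsets = [1, 0, 0, 1]
        # means one key column at offset 0 followed by value columns 0 and 1,
        # so arg0 == ["a[0]"], arg1 == ["a[0]", "a[1]"], and the mapper built
        # below becomes "lambda a: f([a[0]], [a[0], a[1]])" (cf. the comment
        # at the top of this branch).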
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0),
                                                  ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer,
                                               infile,
                                               eval_type,
                                               runner_conf,
                                               udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
Example #3
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get(
            "spark.sql.execution.pandas.arrowSafeTypeConversion",
            "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
        # pandas Series. See SPARK-27240.
        df_for_struct = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
        ser = ArrowStreamPandasUDFSerializer(timezone, safecheck,
                                             assign_cols_by_name,
                                             df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1:split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0),
                                                  ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer,
                                               infile,
                                               eval_type,
                                               runner_conf,
                                               udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
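
In all three versions the return value has the same shape, (func, profiler,
deserializer, serializer), with profiling unsupported (hence the None). A
minimal runnable sketch of how a caller might consume that tuple; FakeSer and
fake_read_udfs are made-up stand-ins, not PySpark APIs, and the real worker
wires these to the JVM socket streams instead of lists:

# Stand-in serializer: load_stream yields "batches" from an input sequence,
# dump_stream collects results instead of writing to a socket.
class FakeSer:
    def load_stream(self, stream):
        yield from stream

    def dump_stream(self, iterator, stream):
        stream.extend(iterator)

def fake_read_udfs():
    # Same return shape as read_udfs above: (func, profiler, deser, ser).
    func = lambda _, it: map(lambda a: (a[0] * 2,), it)
    ser = FakeSer()
    return func, None, ser, ser

func, profiler, deserializer, serializer = fake_read_udfs()
infile, outfile = [(1,), (2,)], []
serializer.dump_stream(func(0, deserializer.load_stream(infile)), outfile)
print(outfile)  # [(2,), (4,)]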