Ejemplo n.º 1
0
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get("spark.sql.execution.pandas.convertToArrowArraySafely",
                                    "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
            ser = CogroupUDFSerializer(timezone, safecheck, assign_cols_by_name)
        else:
            # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
            # pandas Series. See SPARK-27240.
            df_for_struct = (eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF or
                             eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF or
                             eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
            ser = ArrowStreamPandasUDFSerializer(timezone, safecheck, assign_cols_by_name,
                                                 df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        if is_scalar_iter:
            assert num_udfs == 1, "One SCALAR_ITER UDF expected here."
        if is_map_iter:
            assert num_udfs == 1, "One MAP_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(
            pickleSer, infile, eval_type, runner_conf, udf_index=0)

        def func(_, iterator):
            num_input_rows = 0

            def map_batch(batch):
                nonlocal num_input_rows

                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                # This assert is for Scalar Iterator UDF to fail fast.
                # The length of the entire input can only be explicitly known
                # by consuming the input iterator in user side. Therefore,
                # it's very unlikely the output length is higher than
                # input length.
                assert is_map_iter or num_output_rows <= num_input_rows, \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError("pandas iterator UDF should exhaust the input "
                                       "iterator.")

                if num_output_rows != num_input_rows:
                    raise RuntimeError(
                        "The length of output in Scalar iterator pandas UDF should be "
                        "the same with the input's; however, the length of output was %d and the "
                        "length of input was %d." % (num_output_rows, num_input_rows))

        # profiling is not supported for UDF
        return func, None, ser, ser

    def extract_key_value_indexes(grouped_arg_offsets):
        """
        Helper function to extract the key and value indexes from arg_offsets for the grouped and
        cogrouped pandas udfs. See BasePandasGroupExec.resolveArgOffsets for equivalent scala code.

        Parameters
        ----------
        grouped_arg_offsets:  list
            List containing the key and value indexes of columns of the
            DataFrames to be passed to the udf. It consists of n repeating groups where n is the
            number of DataFrames.  Each group has the following format:
                group[0]: length of group
                group[1]: length of key indexes
                group[2.. group[1] +2]: key attributes
                group[group[1] +3 group[0]]: value attributes
        """
        parsed = []
        idx = 0
        while idx < len(grouped_arg_offsets):
            offsets_len = grouped_arg_offsets[idx]
            idx += 1
            offsets = grouped_arg_offsets[idx: idx + offsets_len]
            split_index = offsets[0] + 1
            offset_keys = offsets[1: split_index]
            offset_values = offsets[split_index:]
            parsed.append([offset_keys, offset_values])
            idx += offsets_len
        return parsed

    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)
        parsed_offsets = extract_key_value_indexes(arg_offsets)

        # Create function like this:
        #   mapper a: f([a[0]], [a[0], a[1]])
        def mapper(a):
            keys = [a[o] for o in parsed_offsets[0][0]]
            vals = [a[o] for o in parsed_offsets[0][1]]
            return f(keys, vals)
    elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because cogrouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1
        arg_offsets, f = read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=0)

        parsed_offsets = extract_key_value_indexes(arg_offsets)

        def mapper(a):
            df1_keys = [a[0][o] for o in parsed_offsets[0][0]]
            df1_vals = [a[0][o] for o in parsed_offsets[0][1]]
            df2_keys = [a[1][o] for o in parsed_offsets[1][0]]
            df2_vals = [a[1][o] for o in parsed_offsets[1][1]]
            return f(df1_keys, df1_vals, df2_keys, df2_vals)
    else:
        udfs = []
        for i in range(num_udfs):
            udfs.append(read_single_udf(pickleSer, infile, eval_type, runner_conf, udf_index=i))

        def mapper(a):
            result = tuple(f(*[a[o] for o in arg_offsets]) for (arg_offsets, f) in udfs)
            # In the special case of a single UDF this will return a single result rather
            # than a tuple of results; this is the format that the JVM side expects.
            if len(result) == 1:
                return result[0]
            else:
                return result

    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
Ejemplo n.º 2
0
    def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None,
        environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None,
        gateway=None):
        """
        Create a new SparkContext. At least the master and app name should be set,
        either through the named parameters here or through C{conf}.

        @param master: Cluster URL to connect to
               (e.g. mesos://host:port, spark://host:port, local[4]).
        @param appName: A name for your job, to display on the cluster web UI.
        @param sparkHome: Location where Spark is installed on cluster nodes.
        @param pyFiles: Collection of .zip or .py files to send to the cluster
               and add to PYTHONPATH.  These can be paths on the local file
               system or HDFS, HTTP, HTTPS, or FTP URLs.
        @param environment: A dictionary of environment variables to set on
               worker nodes.
        @param batchSize: The number of Python objects represented as a single
               Java object.  Set 1 to disable batching or -1 to use an
               unlimited batch size.
        @param serializer: The serializer for RDDs.
        @param conf: A L{SparkConf} object setting Spark properties.
        @param gateway: Use an existing gateway and JVM, otherwise a new JVM
               will be instatiated.


        >>> from pyspark.context import SparkContext
        >>> sc = SparkContext('local', 'test')

        >>> sc2 = SparkContext('local', 'test2') # doctest: +IGNORE_EXCEPTION_DETAIL
        Traceback (most recent call last):
            ...
        ValueError:...
        """
        if rdd._extract_concise_traceback() is not None:
            self._callsite = rdd._extract_concise_traceback()
        else:
            tempNamedTuple = namedtuple("Callsite", "function file linenum")
            self._callsite = tempNamedTuple(function=None, file=None, linenum=None)
        SparkContext._ensure_initialized(self, gateway=gateway)

        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 1:
            self.serializer = self._unbatched_serializer
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.iteritems():
                self._conf.setExecutorEnv(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)
        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        # Create the Java SparkContext through Py4J
        self._jsc = self._initialize_context(self._conf._jconf)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
                self._jvm.java.util.ArrayList(),
                self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.append(root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                self._python_includes.append(filename)
                sys.path.append(path)
                if not dirname in sys.path:
                    sys.path.append(dirname)

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir).getAbsolutePath()
Ejemplo n.º 3
0
    def _do_init(self, master, appName, sparkHome, pyFiles, environment,
                 batchSize, serializer, conf, jsc, profiler_cls):
        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.iteritems():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception(
                "An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)
        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                if filename.lower().endswith(
                        "zip") or filename.lower().endswith("egg"):
                    self._python_includes.append(filename)
                    sys.path.insert(
                        1, os.path.join(SparkFiles.getRootDirectory(),
                                        filename))

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(
            self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(
                profiler_cls, dump_path)
        else:
            self.profiler_collector = None
Ejemplo n.º 4
0
    def _do_init(self, master, appName, sparkHome, pyFiles, environment,
                 batchSize, serializer, conf, jsc, profiler_cls):
        self.environment = environment or {}
        # java gateway must have been launched at this point.
        if conf is not None and conf._jconf is not None:
            # conf has been initialized in JVM properly, so use conf directly. This represent the
            # scenario that JVM has been launched before SparkConf is created (e.g. SparkContext is
            # created and then stopped, and we create a new SparkConf and new SparkContext again)
            self._conf = conf
        else:
            self._conf = SparkConf(_jvm=SparkContext._jvm)
            if conf is not None:
                for k, v in conf.getAll():
                    self._conf.set(k, v)

        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception(
                "An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)

        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        self.environment["PYTHONHASHSEED"] = os.environ.get(
            "PYTHONHASHSEED", "0")

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)
        # Reset the SparkConf to the one actually used by the SparkContext in JVM.
        self._conf = SparkConf(_jconf=self._jsc.sc().conf())

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port)
        self._jsc.sc().register(self._javaAccumulator)

        self.pythonExec = self._jvm.scala.Option.apply(
            os.environ.get("PYSPARK_PYTHON"))
        self.pythonVer = "%d.%d" % sys.version_info[:2]

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = BroadcastPickleRegistry()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                try:
                    filepath = os.path.join(SparkFiles.getRootDirectory(),
                                            filename)
                    if not os.path.exists(filepath):
                        # In case of YARN with shell mode, 'spark.submit.pyFiles' files are
                        # not added via SparkContext.addFile. Here we check if the file exists,
                        # try to copy and then add it to the path. See SPARK-21945.
                        shutil.copyfile(path, filepath)
                    if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
                        self._python_includes.append(filename)
                        sys.path.insert(1, filepath)
                except Exception:
                    warnings.warn(
                        "Failed to add file [%s] speficied in 'spark.submit.pyFiles' to "
                        "Python path:\n  %s" % (path, "\n  ".join(sys.path)),
                        RuntimeWarning)

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(
            self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(
                profiler_cls, dump_path)
        else:
            self.profiler_collector = None

        # create a signal handler which would be invoked on receiving SIGINT
        def signal_handler(signal, frame):
            self.cancelAllJobs()
            raise KeyboardInterrupt()

        # see http://stackoverflow.com/questions/23206787/
        if isinstance(threading.current_thread(), threading._MainThread):
            signal.signal(signal.SIGINT, signal_handler)
Ejemplo n.º 5
0
 def _rdd(self):
     if self._lazy_rdd is None:
         jrdd = self._jdf.javaToPython()
         self._lazy_rdd = RDD(jrdd, self._sc,
                              BatchedSerializer(PickleSerializer()))
     return self._lazy_rdd
Ejemplo n.º 6
0
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get(
            "spark.sql.execution.pandas.arrowSafeTypeConversion",
            "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
        # pandas Series. See SPARK-27240.
        df_for_struct = (eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
                         or eval_type
                         == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF)
        ser = ArrowStreamPandasUDFSerializer(timezone, safecheck,
                                             assign_cols_by_name,
                                             df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    if eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF:
        assert num_udfs == 1, "One SQL_SCALAR_PANDAS_ITER_UDF expected here."

        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)

        def func(_, iterator):
            num_input_rows = [0]

            def map_batch(batch):
                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows[0] += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                assert num_output_rows <= num_input_rows[0], \
                    "Pandas SCALAR_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)
            try:
                if sys.version >= '3':
                    iterator.__next__()
                else:
                    iterator.next()
            except StopIteration:
                pass
            else:
                raise RuntimeError(
                    "SQL_SCALAR_PANDAS_ITER_UDF should exhaust the input iterator."
                )

            if num_output_rows != num_input_rows[0]:
                raise RuntimeError(
                    "The number of output rows of pandas iterator UDF should be "
                    "the same with input rows. The input rows number is %d but the "
                    "output rows number is %d." %
                    (num_input_rows[0], num_output_rows))

        # profiling is not supported for UDF
        return func, None, ser, ser

    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1:split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0),
                                                  ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer,
                                               infile,
                                               eval_type,
                                               runner_conf,
                                               udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
Ejemplo n.º 7
0
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        ser = ArrowStreamPandasSerializer(timezone)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)
    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type,
                                           runner_conf)
        udfs['f'] = udf
        split_offset = arg_offsets[0] + 1
        arg0 = ["a[%d]" % o for o in arg_offsets[1:split_offset]]
        arg1 = ["a[%d]" % o for o in arg_offsets[split_offset:]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(arg0),
                                                  ", ".join(arg1))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer, infile, eval_type,
                                               runner_conf)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
Ejemplo n.º 8
0
 def __init__(self, memory_limit, serializer=None):
     self.memory_limit = memory_limit
     self.local_dirs = _get_local_dirs("sort")
     self.serializer = serializer or BatchedSerializer(PickleSerializer(), 1024)
     self._spilled_bytes = 0
Ejemplo n.º 9
0
def read_udfs(pickleSer, infile, eval_type):
    runner_conf = {}

    if eval_type in (PythonEvalType.SQL_SCALAR_PANDAS_UDF,
                     PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_MAP_PANDAS_ITER_UDF,
                     PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
                     PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF,
                     PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF):

        # Load conf used for pandas_udf evaluation
        num_conf = read_int(infile)
        for i in range(num_conf):
            k = utf8_deserializer.loads(infile)
            v = utf8_deserializer.loads(infile)
            runner_conf[k] = v

        # NOTE: if timezone is set here, that implies respectSessionTimeZone is True
        timezone = runner_conf.get("spark.sql.session.timeZone", None)
        safecheck = runner_conf.get(
            "spark.sql.execution.pandas.arrowSafeTypeConversion",
            "false").lower() == 'true'
        # Used by SQL_GROUPED_MAP_PANDAS_UDF and SQL_SCALAR_PANDAS_UDF when returning StructType
        assign_cols_by_name = runner_conf.get(
            "spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName", "true")\
            .lower() == "true"

        if eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
            ser = CogroupUDFSerializer(timezone, safecheck,
                                       assign_cols_by_name)
        else:
            # Scalar Pandas UDF handles struct type arguments as pandas DataFrames instead of
            # pandas Series. See SPARK-27240.
            df_for_struct = (
                eval_type == PythonEvalType.SQL_SCALAR_PANDAS_UDF
                or eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
                or eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF)
            ser = ArrowStreamPandasUDFSerializer(timezone, safecheck,
                                                 assign_cols_by_name,
                                                 df_for_struct)
    else:
        ser = BatchedSerializer(PickleSerializer(), 100)

    num_udfs = read_int(infile)

    is_scalar_iter = eval_type == PythonEvalType.SQL_SCALAR_PANDAS_ITER_UDF
    is_map_iter = eval_type == PythonEvalType.SQL_MAP_PANDAS_ITER_UDF

    if is_scalar_iter or is_map_iter:
        if is_scalar_iter:
            assert num_udfs == 1, "One SCALAR_ITER UDF expected here."
        if is_map_iter:
            assert num_udfs == 1, "One MAP_ITER UDF expected here."

        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)

        def func(_, iterator):
            num_input_rows = [0]

            def map_batch(batch):
                udf_args = [batch[offset] for offset in arg_offsets]
                num_input_rows[0] += len(udf_args[0])
                if len(udf_args) == 1:
                    return udf_args[0]
                else:
                    return tuple(udf_args)

            iterator = map(map_batch, iterator)
            result_iter = udf(iterator)

            num_output_rows = 0
            for result_batch, result_type in result_iter:
                num_output_rows += len(result_batch)
                assert is_map_iter or num_output_rows <= num_input_rows[0], \
                    "Pandas MAP_ITER UDF outputted more rows than input rows."
                yield (result_batch, result_type)

            if is_scalar_iter:
                try:
                    next(iterator)
                except StopIteration:
                    pass
                else:
                    raise RuntimeError(
                        "SQL_SCALAR_PANDAS_ITER_UDF should exhaust the input "
                        "iterator.")

            if is_scalar_iter and num_output_rows != num_input_rows[0]:
                raise RuntimeError(
                    "The number of output rows of pandas iterator UDF should be "
                    "the same with input rows. The input rows number is %d but the "
                    "output rows number is %d." %
                    (num_input_rows[0], num_output_rows))

        # profiling is not supported for UDF
        return func, None, ser, ser

    def extract_key_value_indexes(grouped_arg_offsets):
        """
        Helper function to extract the key and value indexes from arg_offsets for the grouped and
        cogrouped pandas udfs. See BasePandasGroupExec.resolveArgOffsets for equivalent scala code.

        :param grouped_arg_offsets:  List containing the key and value indexes of columns of the
            DataFrames to be passed to the udf. It consists of n repeating groups where n is the
            number of DataFrames.  Each group has the following format:
                group[0]: length of group
                group[1]: length of key indexes
                group[2.. group[1] +2]: key attributes
                group[group[1] +3 group[0]]: value attributes
        """
        parsed = []
        idx = 0
        while idx < len(grouped_arg_offsets):
            offsets_len = grouped_arg_offsets[idx]
            idx += 1
            offsets = grouped_arg_offsets[idx:idx + offsets_len]
            split_index = offsets[0] + 1
            offset_keys = offsets[1:split_index]
            offset_values = offsets[split_index:]
            parsed.append([offset_keys, offset_values])
            idx += offsets_len
        return parsed

    udfs = {}
    call_udf = []
    mapper_str = ""
    if eval_type == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF:
        # Create function like this:
        #   lambda a: f([a[0]], [a[0], a[1]])

        # We assume there is only one UDF here because grouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1

        # See FlatMapGroupsInPandasExec for how arg_offsets are used to
        # distinguish between grouping attributes and data attributes
        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)
        udfs['f'] = udf
        parsed_offsets = extract_key_value_indexes(arg_offsets)
        keys = ["a[%d]" % (o, ) for o in parsed_offsets[0][0]]
        vals = ["a[%d]" % (o, ) for o in parsed_offsets[0][1]]
        mapper_str = "lambda a: f([%s], [%s])" % (", ".join(keys),
                                                  ", ".join(vals))
    elif eval_type == PythonEvalType.SQL_COGROUPED_MAP_PANDAS_UDF:
        # We assume there is only one UDF here because cogrouped map doesn't
        # support combining multiple UDFs.
        assert num_udfs == 1
        arg_offsets, udf = read_single_udf(pickleSer,
                                           infile,
                                           eval_type,
                                           runner_conf,
                                           udf_index=0)
        udfs['f'] = udf
        parsed_offsets = extract_key_value_indexes(arg_offsets)
        df1_keys = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][0]]
        df1_vals = ["a[0][%d]" % (o, ) for o in parsed_offsets[0][1]]
        df2_keys = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][0]]
        df2_vals = ["a[1][%d]" % (o, ) for o in parsed_offsets[1][1]]
        mapper_str = "lambda a: f([%s], [%s], [%s], [%s])" % (
            ", ".join(df1_keys), ", ".join(df1_vals), ", ".join(df2_keys),
            ", ".join(df2_vals))
    else:
        # Create function like this:
        #   lambda a: (f0(a[0]), f1(a[1], a[2]), f2(a[3]))
        # In the special case of a single UDF this will return a single result rather
        # than a tuple of results; this is the format that the JVM side expects.
        for i in range(num_udfs):
            arg_offsets, udf = read_single_udf(pickleSer,
                                               infile,
                                               eval_type,
                                               runner_conf,
                                               udf_index=i)
            udfs['f%d' % i] = udf
            args = ["a[%d]" % o for o in arg_offsets]
            call_udf.append("f%d(%s)" % (i, ", ".join(args)))
        mapper_str = "lambda a: (%s)" % (", ".join(call_udf))

    mapper = eval(mapper_str, udfs)
    func = lambda _, it: map(mapper, it)

    # profiling is not supported for UDF
    return func, None, ser, ser
Ejemplo n.º 10
0
    def _do_init(self, master, appName, sparkHome, pyFiles, environment,
                 batchSize, serializer, conf, jsc, profiler_cls):
        self.environment = environment or {}
        self._conf = conf or SparkConf(_jvm=self._jvm)
        self._batchSize = batchSize  # -1 represents an unlimited batch size
        self._unbatched_serializer = serializer
        if batchSize == 0:
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf
        if master:
            self._conf.setMaster(master)
        if appName:
            self._conf.setAppName(appName)
        if sparkHome:
            self._conf.setSparkHome(sparkHome)
        if environment:
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters
        if not self._conf.contains("spark.master"):
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):
            raise Exception(
                "An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)

        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v
        if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ:
            # disable randomness of hash of string in worker, if this is not
            # launched by spark-submit
            self.environment["PYTHONHASHSEED"] = "0"

        # Create the Java SparkContext through Py4J
        self._jsc = jsc or self._initialize_context(self._conf._jconf)
        # Reset the SparkConf to the one actually used by the SparkContext in JVM.
        self._conf = SparkConf(_jconf=self._jsc.sc().conf())

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jsc.accumulator(
            self._jvm.java.util.ArrayList(),
            self._jvm.PythonAccumulatorParam(host, port))

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')
        self.pythonVer = "%d.%d" % sys.version_info[:2]

        # Broadcast's __reduce__ method stores Broadcast instances here.
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
        self._pickled_broadcast_vars = set()

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)

        # Deploy any code dependencies specified in the constructor
        self._python_includes = list()
        for path in (pyFiles or []):
            self.addPyFile(path)

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):
            if path != "":
                (dirname, filename) = os.path.split(path)
                if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:
                    self._python_includes.append(filename)
                    sys.path.insert(
                        1, os.path.join(SparkFiles.getRootDirectory(),
                                        filename))

        # Create a temporary directory inside spark.local.dir:
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(
            self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(
                profiler_cls, dump_path)
        else:
            self.profiler_collector = None

        # create a signal handler which would be invoked on receiving SIGINT
        def signal_handler(signal, frame):
            self.cancelAllJobs()
            raise KeyboardInterrupt()

        # see http://stackoverflow.com/questions/23206787/
        if isinstance(threading.current_thread(), threading._MainThread):
            signal.signal(signal.SIGINT, signal_handler)
Ejemplo n.º 11
0
 def func(sc, *a, **kw):
     jrdd = f(sc, *a, **kw)
     return RDD(sc._jvm.SerDe.javaToPython(jrdd), sc,
                BatchedSerializer(PickleSerializer(), 1024))
Ejemplo n.º 12
0
    def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer,
                 conf, jsc, profiler_cls):
        self.environment = environment or {}#要在工作节点上设置的环境变量
        # java gateway must have been launched at this point.
		#这个时候java的网关必须已经启动
        if conf is not None and conf._jconf is not None:
            # conf has been initialized in JVM properly, so use conf directly. This represent the
            # scenario that JVM has been launched before SparkConf is created (e.g. SparkContext is
            # created and then stopped, and we create a new SparkConf and new SparkContext again)
			#conf已在JVM上被正确初始化,所以直接使用的conf。
			#这说明在创建SparkConf之前已经启动了JVM(比如SparkContext被创建了,然后又被停止了,那么我们再创建一个新的SparkConf和新的SparkContext)
            self._conf = conf
        else:
            self._conf = SparkConf(_jvm=SparkContext._jvm)
            if conf is not None:
                for k, v in conf.getAll():
                    self._conf.set(k, v)

        self._batchSize = batchSize  # -1 represents an unlimited batch size :-1表示没有限制块大小
        self._unbatched_serializer = serializer #没有分块的序列化对象
        if batchSize == 0:#如果块大小为0(不知道分块大小)则将序列化对象自动分块,否则为已分块的序列化对象
            self.serializer = AutoBatchedSerializer(self._unbatched_serializer)
        else:
            self.serializer = BatchedSerializer(self._unbatched_serializer,
                                                batchSize)

        # Set any parameters passed directly to us on the conf 设置在conf上直接传给我们的参数
        if master:
            self._conf.setMaster(master)#配置master属性
        if appName:
            self._conf.setAppName(appName)#配置appName属性
        if sparkHome:
            self._conf.setSparkHome(sparkHome)#配置sparkHome属性
        if environment:#配置环境
            for key, value in environment.items():
                self._conf.setExecutorEnv(key, value)
        for key, value in DEFAULT_CONFIGS.items():
            self._conf.setIfMissing(key, value)

        # Check that we have at least the required parameters 
		#检查我们是否有必须具备的参数,从这里可以看出master和appName属性是最小配置,是必须要配置的
        if not self._conf.contains("spark.master"):#检查是否配置了master属性,如果没有就抛出异常提示
            raise Exception("A master URL must be set in your configuration")
        if not self._conf.contains("spark.app.name"):#检查是否配置了appName属性,如果没有就抛出异常提示
            raise Exception("An application name must be set in your configuration")

        # Read back our properties from the conf in case we loaded some of them from
        # the classpath or an external config file
		#从conf中读回我们需要的属性,以防我们从其他的一些classpath或者外部的配置文件中加载这些属性并被覆盖
        self.master = self._conf.get("spark.master")
        self.appName = self._conf.get("spark.app.name")
        self.sparkHome = self._conf.get("spark.home", None)

        for (k, v) in self._conf.getAll():
            if k.startswith("spark.executorEnv."):
                varName = k[len("spark.executorEnv."):]
                self.environment[varName] = v

        self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0")

        # Create the Java SparkContext through Py4J
		#通过Py4J创建Java SparkContext
        self._jsc = jsc or self._initialize_context(self._conf._jconf)
        # Reset the SparkConf to the one actually used by the SparkContext in JVM.
		#将SparkConf重置为JVM中SparkContext实际使用的那个。
        self._conf = SparkConf(_jconf=self._jsc.sc().conf())

        # Create a single Accumulator in Java that we'll send all our updates through;
        # they will be passed back to us through a TCP server
		#用Java创建一个单一的Accumulator,以便我们发送所有更新
		#他们将通过TCP服务器传回给我们
        self._accumulatorServer = accumulators._start_update_server()
        (host, port) = self._accumulatorServer.server_address
        self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port)
        self._jsc.sc().register(self._javaAccumulator)

        self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')#python执行器的目录
        self.pythonVer = "%d.%d" % sys.version_info[:2]#python的版本

        if sys.version_info < (2, 7):#如果python版本小于2.7则给出警告:提示spark2.0.0已经不支持python2.6了
            warnings.warn("Support for Python 2.6 is deprecated as of Spark 2.0.0")

        # Broadcast's __reduce__ method stores Broadcast instances here.广播的__reduce__方法在这里存储广播实例。
        # This allows other code to determine which Broadcast instances have
        # been pickled, so it can determine which Java broadcast objects to
        # send.
		#它允许其他代码来确定哪些广播实例已经被序列化了,所以它可以确定发送哪个Java广播对象
        self._pickled_broadcast_vars = BroadcastPickleRegistry()#在Thread-local中注册已被序列化的广播变量

        SparkFiles._sc = self
        root_dir = SparkFiles.getRootDirectory()
        sys.path.insert(1, root_dir)#sys.path是个列表,用sys.path.append在末尾添加目录,当这个append执行完之后,新目录即时起效

        # Deploy any code dependencies specified in the constructor
		#部署构造函数中指定的任何代码依赖关系
        self._python_includes = list()
        for path in (pyFiles or []):#为以后在这个SparkContext上执行的所有任务添加需要的.py或.zip依赖项。						
            self.addPyFile(path)#{path}可以是本地的文件,HDFS(或其他Hadoop支持的文件系统)中的文件,或者HTTP,HTTPS或FTP URI。

        # Deploy code dependencies set by spark-submit; these will already have been added
        # with SparkContext.addFile, so we just need to add them to the PYTHONPATH
		#部署由spark-submit设置的代码依赖项; 
		#这些依赖需要已经被SparkContext.addFile方法添加进去了,所以我们只需要将它们添加到PYTHONPATH中就可以了
        for path in self._conf.get("spark.submit.pyFiles", "").split(","):#通过逗号分割所需要的文件目录
            if path != "":#如果文件不为空,则分离出文件的目录和文件名字
                (dirname, filename) = os.path.split(path)
                if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:#如果文件后缀在('.zip', '.egg', '.jar')中,则将该文件名添加到相关依赖的列表中
                    self._python_includes.append(filename)
                    sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))#把下载后的文件(目录/文件名)添加到第1个位置上

        # Create a temporary directory inside spark.local.dir:
		#在spark.local.dir中创建一个临时目录
        local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf())
        self._temp_dir = \
            self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \
                .getAbsolutePath()

        # profiling stats collected for each PythonRDD
		#为每个PythonRDD解析stats
        if self._conf.get("spark.python.profile", "false") == "true":
            dump_path = self._conf.get("spark.python.profile.dump", None)
            self.profiler_collector = ProfilerCollector(profiler_cls, dump_path)#在每个基础的stage中跟踪不同的解析器。
        else:
            self.profiler_collector = None

        # create a signal handler which would be invoked on receiving SIGINT
		#创建一个信号处理程序,它将在接收到SIGINT时被调用
        def signal_handler(signal, frame):
            self.cancelAllJobs()#取消已安排或正在运行的所有作业
            raise KeyboardInterrupt()#抛出用户中断执行的异常

        # see http://stackoverflow.com/questions/23206787/
        if isinstance(threading.current_thread(), threading._MainThread):
            signal.signal(signal.SIGINT, signal_handler)#预设信号处理函数singnal.signal(signalnum, handler)其中第一个参数是信号量,第二个参数信号处理函数。