def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls): self.environment = environment or {} # java gateway must have been launched at this point. if conf is not None and conf._jconf is not None: # conf has been initialized in JVM properly, so use conf directly. This represent the # scenario that JVM has been launched before SparkConf is created (e.g. SparkContext is # created and then stopped, and we create a new SparkConf and new SparkContext again) self._conf = conf else: self._conf = SparkConf(_jvm=SparkContext._jvm) self._batchSize = batchSize # -1 represents an unlimited batch size self._unbatched_serializer = serializer if batchSize == 0: self.serializer = AutoBatchedSerializer(self._unbatched_serializer) else: self.serializer = BatchedSerializer(self._unbatched_serializer, batchSize) # Set any parameters passed directly to us on the conf if master: self._conf.setMaster(master) if appName: self._conf.setAppName(appName) if sparkHome: self._conf.setSparkHome(sparkHome) if environment: for key, value in environment.items(): self._conf.setExecutorEnv(key, value) for key, value in DEFAULT_CONFIGS.items(): self._conf.setIfMissing(key, value) # Check that we have at least the required parameters if not self._conf.contains("spark.master"): raise Exception("A master URL must be set in your configuration") if not self._conf.contains("spark.app.name"): raise Exception("An application name must be set in your configuration") # Read back our properties from the conf in case we loaded some of them from # the classpath or an external config file self.master = self._conf.get("spark.master") self.appName = self._conf.get("spark.app.name") self.sparkHome = self._conf.get("spark.home", None) for (k, v) in self._conf.getAll(): if k.startswith("spark.executorEnv."): varName = k[len("spark.executorEnv."):] self.environment[varName] = v if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ: # disable randomness of hash of string in worker, if this is not # launched by spark-submit self.environment["PYTHONHASHSEED"] = "0" # Create the Java SparkContext through Py4J self._jsc = jsc or self._initialize_context(self._conf._jconf) # Reset the SparkConf to the one actually used by the SparkContext in JVM. self._conf = SparkConf(_jconf=self._jsc.sc().conf()) # Create a single Accumulator in Java that we'll send all our updates through; # they will be passed back to us through a TCP server self._accumulatorServer = accumulators._start_update_server() (host, port) = self._accumulatorServer.server_address self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port) self._jsc.sc().register(self._javaAccumulator) self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') self.pythonVer = "%d.%d" % sys.version_info[:2] if sys.version_info < (2, 7): warnings.warn("Support for Python 2.6 is deprecated as of Spark 2.0.0") # Broadcast's __reduce__ method stores Broadcast instances here. # This allows other code to determine which Broadcast instances have # been pickled, so it can determine which Java broadcast objects to # send. self._pickled_broadcast_vars = set() SparkFiles._sc = self root_dir = SparkFiles.getRootDirectory() sys.path.insert(1, root_dir) # Deploy any code dependencies specified in the constructor self._python_includes = list() for path in (pyFiles or []): self.addPyFile(path) # Deploy code dependencies set by spark-submit; these will already have been added # with SparkContext.addFile, so we just need to add them to the PYTHONPATH for path in self._conf.get("spark.submit.pyFiles", "").split(","): if path != "": (dirname, filename) = os.path.split(path) if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: self._python_includes.append(filename) sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename)) # Create a temporary directory inside spark.local.dir: local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf()) self._temp_dir = \ self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \ .getAbsolutePath() # profiling stats collected for each PythonRDD if self._conf.get("spark.python.profile", "false") == "true": dump_path = self._conf.get("spark.python.profile.dump", None) self.profiler_collector = ProfilerCollector(profiler_cls, dump_path) else: self.profiler_collector = None # create a signal handler which would be invoked on receiving SIGINT def signal_handler(signal, frame): self.cancelAllJobs() raise KeyboardInterrupt() # see http://stackoverflow.com/questions/23206787/ if isinstance(threading.current_thread(), threading._MainThread): signal.signal(signal.SIGINT, signal_handler)
def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls): self.environment = environment or {} # java gateway must have been launched at this point. if conf is not None and conf._jconf is not None: # conf has been initialized in JVM properly, so use conf directly. This represents the # scenario that JVM has been launched before SparkConf is created (e.g. SparkContext is # created and then stopped, and we create a new SparkConf and new SparkContext again) self._conf = conf else: self._conf = SparkConf(_jvm=SparkContext._jvm) if conf is not None: for k, v in conf.getAll(): self._conf.set(k, v) self._batchSize = batchSize # -1 represents an unlimited batch size self._unbatched_serializer = serializer if batchSize == 0: self.serializer = AutoBatchedSerializer(self._unbatched_serializer) else: self.serializer = BatchedSerializer(self._unbatched_serializer, batchSize) # Set any parameters passed directly to us on the conf if master: self._conf.setMaster(master) if appName: self._conf.setAppName(appName) if sparkHome: self._conf.setSparkHome(sparkHome) if environment: for key, value in environment.items(): self._conf.setExecutorEnv(key, value) for key, value in DEFAULT_CONFIGS.items(): self._conf.setIfMissing(key, value) # Check that we have at least the required parameters if not self._conf.contains("spark.master"): raise Exception("A master URL must be set in your configuration") if not self._conf.contains("spark.app.name"): raise Exception( "An application name must be set in your configuration") # Read back our properties from the conf in case we loaded some of them from # the classpath or an external config file self.master = self._conf.get("spark.master") self.appName = self._conf.get("spark.app.name") self.sparkHome = self._conf.get("spark.home", None) for (k, v) in self._conf.getAll(): if k.startswith("spark.executorEnv."): varName = k[len("spark.executorEnv."):] self.environment[varName] = v self.environment["PYTHONHASHSEED"] = os.environ.get( "PYTHONHASHSEED", "0") # Create the Java SparkContext through Py4J self._jsc = jsc or self._initialize_context(self._conf._jconf) # Reset the SparkConf to the one actually used by the SparkContext in JVM. self._conf = SparkConf(_jconf=self._jsc.sc().conf()) # Create a single Accumulator in Java that we'll send all our updates through; # they will be passed back to us through a TCP server auth_token = self._gateway.gateway_parameters.auth_token self._accumulatorServer = accumulators._start_update_server(auth_token) (host, port) = self._accumulatorServer.server_address self._javaAccumulator = self._jvm.PythonAccumulatorV2( host, port, auth_token) self._jsc.sc().register(self._javaAccumulator) # If encryption is enabled, we need to setup a server in the jvm to read broadcast # data via a socket. # scala's mangled names w/ $ in them require special treatment. self._encryption_enabled = self._jvm.PythonUtils.isEncryptionEnabled( self._jsc) self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') self.pythonVer = "%d.%d" % sys.version_info[:2] if sys.version_info < (3, 0): with warnings.catch_warnings(): warnings.simplefilter("once") warnings.warn( "Support for Python 2 is deprecated as of Spark 3.0. " "See the plan for dropping Python 2 support at " "https://spark.apache.org/news/plan-for-dropping-python-2-support.html.", DeprecationWarning) # Broadcast's __reduce__ method stores Broadcast instances here. # This allows other code to determine which Broadcast instances have # been pickled, so it can determine which Java broadcast objects to # send. self._pickled_broadcast_vars = BroadcastPickleRegistry() SparkFiles._sc = self root_dir = SparkFiles.getRootDirectory() sys.path.insert(1, root_dir) # Deploy any code dependencies specified in the constructor self._python_includes = list() for path in (pyFiles or []): self.addPyFile(path) # Deploy code dependencies set by spark-submit; these will already have been added # with SparkContext.addFile, so we just need to add them to the PYTHONPATH for path in self._conf.get("spark.submit.pyFiles", "").split(","): if path != "": (dirname, filename) = os.path.split(path) try: filepath = os.path.join(SparkFiles.getRootDirectory(), filename) if not os.path.exists(filepath): # In case of YARN with shell mode, 'spark.submit.pyFiles' files are # not added via SparkContext.addFile. Here we check if the file exists, # try to copy and then add it to the path. See SPARK-21945. shutil.copyfile(path, filepath) if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: self._python_includes.append(filename) sys.path.insert(1, filepath) except Exception: warnings.warn( "Failed to add file [%s] speficied in 'spark.submit.pyFiles' to " "Python path:\n %s" % (path, "\n ".join(sys.path)), RuntimeWarning) # Create a temporary directory inside spark.local.dir: local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir( self._jsc.sc().conf()) self._temp_dir = \ self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \ .getAbsolutePath() # profiling stats collected for each PythonRDD if self._conf.get("spark.python.profile", "false") == "true": dump_path = self._conf.get("spark.python.profile.dump", None) self.profiler_collector = ProfilerCollector( profiler_cls, dump_path) else: self.profiler_collector = None # create a signal handler which would be invoked on receiving SIGINT def signal_handler(signal, frame): self.cancelAllJobs() raise KeyboardInterrupt() # see http://stackoverflow.com/questions/23206787/ if isinstance(threading.current_thread(), threading._MainThread): signal.signal(signal.SIGINT, signal_handler)
def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls): self.environment = environment or {} self._conf = conf or SparkConf(_jvm=self._jvm) self._batchSize = batchSize # -1 represents an unlimited batch size self._unbatched_serializer = serializer if batchSize == 0: self.serializer = AutoBatchedSerializer(self._unbatched_serializer) else: self.serializer = BatchedSerializer(self._unbatched_serializer, batchSize) # Set any parameters passed directly to us on the conf if master: self._conf.setMaster(master) if appName: self._conf.setAppName(appName) if sparkHome: self._conf.setSparkHome(sparkHome) if environment: for key, value in environment.items(): self._conf.setExecutorEnv(key, value) for key, value in DEFAULT_CONFIGS.items(): self._conf.setIfMissing(key, value) # Check that we have at least the required parameters if not self._conf.contains("spark.master"): raise Exception("A master URL must be set in your configuration") if not self._conf.contains("spark.app.name"): raise Exception("An application name must be set in your configuration") # Read back our properties from the conf in case we loaded some of them from # the classpath or an external config file self.master = self._conf.get("spark.master") self.appName = self._conf.get("spark.app.name") self.sparkHome = self._conf.get("spark.home", None) for (k, v) in self._conf.getAll(): if k.startswith("spark.executorEnv."): varName = k[len("spark.executorEnv."):] self.environment[varName] = v if sys.version >= '3.3' and 'PYTHONHASHSEED' not in os.environ: # disable randomness of hash of string in worker, if this is not # launched by spark-submit self.environment["PYTHONHASHSEED"] = "0" # Create the Java SparkContext through Py4J self._jsc = jsc or self._initialize_context(self._conf._jconf) # Create a single Accumulator in Java that we'll send all our updates through; # they will be passed back to us through a TCP server self._accumulatorServer = accumulators._start_update_server() (host, port) = self._accumulatorServer.server_address self._javaAccumulator = self._jsc.accumulator( self._jvm.java.util.ArrayList(), self._jvm.PythonAccumulatorParam(host, port)) self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python') # Broadcast's __reduce__ method stores Broadcast instances here. # This allows other code to determine which Broadcast instances have # been pickled, so it can determine which Java broadcast objects to # send. self._pickled_broadcast_vars = set() SparkFiles._sc = self root_dir = SparkFiles.getRootDirectory() sys.path.insert(1, root_dir) # Deploy any code dependencies specified in the constructor self._python_includes = list() for path in (pyFiles or []): self.addPyFile(path) # Deploy code dependencies set by spark-submit; these will already have been added # with SparkContext.addFile, so we just need to add them to the PYTHONPATH for path in self._conf.get("spark.submit.pyFiles", "").split(","): if path != "": (dirname, filename) = os.path.split(path) if filename[-4:].lower() in self.PACKAGE_EXTENSIONS: self._python_includes.append(filename) sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename)) # Create a temporary directory inside spark.local.dir: local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf()) self._temp_dir = \ self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \ .getAbsolutePath() # profiling stats collected for each PythonRDD if self._conf.get("spark.python.profile", "false") == "true": dump_path = self._conf.get("spark.python.profile.dump", None) self.profiler_collector = ProfilerCollector(profiler_cls, dump_path) else: self.profiler_collector = None
def _do_init(self, master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf, jsc, profiler_cls): self.environment = environment or {}#要在工作节点上设置的环境变量 # java gateway must have been launched at this point. #这个时候java的网关必须已经启动 if conf is not None and conf._jconf is not None: # conf has been initialized in JVM properly, so use conf directly. This represent the # scenario that JVM has been launched before SparkConf is created (e.g. SparkContext is # created and then stopped, and we create a new SparkConf and new SparkContext again) #conf已在JVM上被正确初始化,所以直接使用的conf。 #这说明在创建SparkConf之前已经启动了JVM(比如SparkContext被创建了,然后又被停止了,那么我们再创建一个新的SparkConf和新的SparkContext) self._conf = conf else: self._conf = SparkConf(_jvm=SparkContext._jvm) if conf is not None: for k, v in conf.getAll(): self._conf.set(k, v) self._batchSize = batchSize # -1 represents an unlimited batch size :-1表示没有限制块大小 self._unbatched_serializer = serializer #没有分块的序列化对象 if batchSize == 0:#如果块大小为0(不知道分块大小)则将序列化对象自动分块,否则为已分块的序列化对象 self.serializer = AutoBatchedSerializer(self._unbatched_serializer) else: self.serializer = BatchedSerializer(self._unbatched_serializer, batchSize) # Set any parameters passed directly to us on the conf 设置在conf上直接传给我们的参数 if master: self._conf.setMaster(master)#配置master属性 if appName: self._conf.setAppName(appName)#配置appName属性 if sparkHome: self._conf.setSparkHome(sparkHome)#配置sparkHome属性 if environment:#配置环境 for key, value in environment.items(): self._conf.setExecutorEnv(key, value) for key, value in DEFAULT_CONFIGS.items(): self._conf.setIfMissing(key, value) # Check that we have at least the required parameters #检查我们是否有必须具备的参数,从这里可以看出master和appName属性是最小配置,是必须要配置的 if not self._conf.contains("spark.master"):#检查是否配置了master属性,如果没有就抛出异常提示 raise Exception("A master URL must be set in your configuration") if not self._conf.contains("spark.app.name"):#检查是否配置了appName属性,如果没有就抛出异常提示 raise Exception("An application name must be set in your configuration") # Read back our properties from the conf in case we loaded some of them from # the classpath or an external config file #从conf中读回我们需要的属性,以防我们从其他的一些classpath或者外部的配置文件中加载这些属性并被覆盖 self.master = self._conf.get("spark.master") self.appName = self._conf.get("spark.app.name") self.sparkHome = self._conf.get("spark.home", None) for (k, v) in self._conf.getAll(): if k.startswith("spark.executorEnv."): varName = k[len("spark.executorEnv."):] self.environment[varName] = v self.environment["PYTHONHASHSEED"] = os.environ.get("PYTHONHASHSEED", "0") # Create the Java SparkContext through Py4J #通过Py4J创建Java SparkContext self._jsc = jsc or self._initialize_context(self._conf._jconf) # Reset the SparkConf to the one actually used by the SparkContext in JVM. #将SparkConf重置为JVM中SparkContext实际使用的那个。 self._conf = SparkConf(_jconf=self._jsc.sc().conf()) # Create a single Accumulator in Java that we'll send all our updates through; # they will be passed back to us through a TCP server #用Java创建一个单一的Accumulator,以便我们发送所有更新 #他们将通过TCP服务器传回给我们 self._accumulatorServer = accumulators._start_update_server() (host, port) = self._accumulatorServer.server_address self._javaAccumulator = self._jvm.PythonAccumulatorV2(host, port) self._jsc.sc().register(self._javaAccumulator) self.pythonExec = os.environ.get("PYSPARK_PYTHON", 'python')#python执行器的目录 self.pythonVer = "%d.%d" % sys.version_info[:2]#python的版本 if sys.version_info < (2, 7):#如果python版本小于2.7则给出警告:提示spark2.0.0已经不支持python2.6了 warnings.warn("Support for Python 2.6 is deprecated as of Spark 2.0.0") # Broadcast's __reduce__ method stores Broadcast instances here.广播的__reduce__方法在这里存储广播实例。 # This allows other code to determine which Broadcast instances have # been pickled, so it can determine which Java broadcast objects to # send. #它允许其他代码来确定哪些广播实例已经被序列化了,所以它可以确定发送哪个Java广播对象 self._pickled_broadcast_vars = BroadcastPickleRegistry()#在Thread-local中注册已被序列化的广播变量 SparkFiles._sc = self root_dir = SparkFiles.getRootDirectory() sys.path.insert(1, root_dir)#sys.path是个列表,用sys.path.append在末尾添加目录,当这个append执行完之后,新目录即时起效 # Deploy any code dependencies specified in the constructor #部署构造函数中指定的任何代码依赖关系 self._python_includes = list() for path in (pyFiles or []):#为以后在这个SparkContext上执行的所有任务添加需要的.py或.zip依赖项。 self.addPyFile(path)#{path}可以是本地的文件,HDFS(或其他Hadoop支持的文件系统)中的文件,或者HTTP,HTTPS或FTP URI。 # Deploy code dependencies set by spark-submit; these will already have been added # with SparkContext.addFile, so we just need to add them to the PYTHONPATH #部署由spark-submit设置的代码依赖项; #这些依赖需要已经被SparkContext.addFile方法添加进去了,所以我们只需要将它们添加到PYTHONPATH中就可以了 for path in self._conf.get("spark.submit.pyFiles", "").split(","):#通过逗号分割所需要的文件目录 if path != "":#如果文件不为空,则分离出文件的目录和文件名字 (dirname, filename) = os.path.split(path) if filename[-4:].lower() in self.PACKAGE_EXTENSIONS:#如果文件后缀在('.zip', '.egg', '.jar')中,则将该文件名添加到相关依赖的列表中 self._python_includes.append(filename) sys.path.insert(1, os.path.join(SparkFiles.getRootDirectory(), filename))#把下载后的文件(目录/文件名)添加到第1个位置上 # Create a temporary directory inside spark.local.dir: #在spark.local.dir中创建一个临时目录 local_dir = self._jvm.org.apache.spark.util.Utils.getLocalDir(self._jsc.sc().conf()) self._temp_dir = \ self._jvm.org.apache.spark.util.Utils.createTempDir(local_dir, "pyspark") \ .getAbsolutePath() # profiling stats collected for each PythonRDD #为每个PythonRDD解析stats if self._conf.get("spark.python.profile", "false") == "true": dump_path = self._conf.get("spark.python.profile.dump", None) self.profiler_collector = ProfilerCollector(profiler_cls, dump_path)#在每个基础的stage中跟踪不同的解析器。 else: self.profiler_collector = None # create a signal handler which would be invoked on receiving SIGINT #创建一个信号处理程序,它将在接收到SIGINT时被调用 def signal_handler(signal, frame): self.cancelAllJobs()#取消已安排或正在运行的所有作业 raise KeyboardInterrupt()#抛出用户中断执行的异常 # see http://stackoverflow.com/questions/23206787/ if isinstance(threading.current_thread(), threading._MainThread): signal.signal(signal.SIGINT, signal_handler)#预设信号处理函数singnal.signal(signalnum, handler)其中第一个参数是信号量,第二个参数信号处理函数。