def add_file(self, file_path, resource_path=None, executable=False):
    """
    Add a single file to be packed with the job at runtime.

    Args:
        file_path (str): path of file to add
        resource_path (str): path of file at runtime
        executable (bool): if True, the file can be executed

    Raises:
        ValueError: If invalid arguments are given
    """
    if resource_path is None:
        resource_path = os.path.basename(file_path)
    if not isinstance(file_path, str) or not isinstance(resource_path, str):
        raise ValueError("Invalid input path: must be str")
    # TODO(wangcong09): warn users if they specified different file names
    if resource_path in self.__file_targets:
        logger.warn("add [%s] duplicated" % resource_path)
        return
    self.__files.append((file_path, resource_path, executable))
    self.__file_targets.add(resource_path)
def _set_sys_defaultencoding(self, will_pass_encoding=None):
    """ Pass the sys default encoding to the remote side. """
    import sys
    if will_pass_encoding is None:
        # By default, if the user has reloaded module "sys", we should pass the
        # defaultencoding to the remote side, because "reload(sys)" and
        # "sys.setdefaultencoding(encoding)" are most probably related.
        # TODO(zhangyuncong): We should have a further discussion about whether
        # we should pass the encoding by default or not.
        will_pass_encoding = hasattr(sys, 'setdefaultencoding')
    if will_pass_encoding:
        default_encoding = sys.getdefaultencoding()
        logger.warn("pass defaultencoding %s to the remote side" % default_encoding)

        def set_default_encoding():
            import sys
            reload(sys)
            sys.setdefaultencoding(default_encoding)

        # The '\0' prefix makes sure this hook runs before the user's hooks.
        self.set_init_hook('\0set_sys_defaultencoding', set_default_encoding)
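# Illustration only, not part of the module: a sketch of registering a custom init
# hook through the same set_init_hook(name, callable) interface used above. The hook
# name and body here are assumptions; only the call shape is taken from this method.
# The '\0' prefix on the built-in hook's name suggests hooks run in name order, so a
# plain name like the one below would run after the built-in ones.
#
#     def _print_remote_cwd():
#         import os
#         print(os.getcwd())
#
#     pipeline.set_init_hook('print_remote_cwd', _print_remote_cwd)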
def wrapper(test_class_obj):
    """ inner """
    _first_run = True
    if test_class_obj.pipeline_type in modes:
        for filesystem in expect_filesystems:
            if filesystem in test_class_obj.support_file_system:
                if _skip_filesystem_test(filesystem):
                    continue
                test_class_obj.root_path = test_class_obj.root_path_dict[filesystem]
                test_class_obj.running_on_filesystem = filesystem
                logger.info("running case [%s.%s] root_path=[%s], filesystem=[%s]"
                            % (type(test_class_obj).__name__,
                               fn.func_name,
                               test_class_obj.root_path,
                               test_class_obj.running_on_filesystem))
                if not _first_run:
                    # re-run tearDown/setUp between filesystems so each run starts clean
                    test_class_obj.tearDown()
                    test_class_obj.setUp()
                fn(test_class_obj)
                _first_run = False
            else:
                logger.warn('\033[01;31mWarning!!! %s not executed,'
                            ' because filesystem is %s.\033[00m'
                            % (fn.__name__, filesystem))
def add_dynamic_library(self, file_path):
    """
    Add a dynamic library file (.so) to be packed with the job and add it to
    LD_LIBRARY_PATH at runtime.

    Args:
        file_path (str): path of the library file
    """
    path = os.path.abspath(file_path)
    file_name = os.path.basename(path)
    if file_name in self.__library_targets:
        logger.warn("add [%s] duplicated" % file_path)
        return
    self.__libraries.append((file_name, path))
    self.__library_targets.add(file_name)
def add_file(self, file_path, resource_path=None, executable=False):
    """
    Add a single file to the Pipeline so that it can be accessed at runtime.

    Args:
        file_path (str): path of the file to add; both local and HDFS paths are supported
        resource_path (str): local path used to access the file at runtime on the
            remote side; it should be a relative path. In other words, file_path is
            mapped to resource_path on the remote side, and user code can read the
            file behind file_path directly through that path.
        executable (bool): if True, the file is marked executable at runtime
    """
    if path_util.is_hdfs_path(file_path.lower()):
        if executable:
            logger.warn("Setting executable for a cache file is not supported yet, "
                        "ignoring the executable property")
        self.__append_cache_file(file_path, resource_path, executable)
    else:
        self._resource.add_file(file_path, resource_path, executable)
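# Usage sketch for add_file (illustrative paths, assuming `pipeline` is a Pipeline
# instance): a local file goes through the resource machinery, while an HDFS path is
# registered as a cache file and the executable flag is ignored with a warning.
#
#     pipeline.add_file("./conf/dict.txt", "dict.txt", executable=False)
#     pipeline.add_file("hdfs:///app/example/dict.txt", "dict.txt")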
def add_file_from_bytes(self, source_bytes, resource_path=None):
    """
    Add a single file, given as an in-memory byte string, to be packed with the
    job at runtime.

    Args:
        source_bytes (str): the source binary content
        resource_path (str): path of the file at runtime

    Raises:
        ValueError: If invalid arguments are given
    """
    if not isinstance(source_bytes, str) or not isinstance(resource_path, str):
        raise ValueError("Invalid arguments: source_bytes and resource_path must be str")
    if resource_path in self.__file_targets:
        logger.warn("add [%s] duplicated" % resource_path)
        return
    self.__binary_files.append((resource_path, source_bytes))
    self.__file_targets.add(resource_path)
def add_egg_file(self, file_path):
    """
    Add an .egg file to be packed with the job and add its path to PYTHONPATH
    at runtime.

    Args:
        file_path (str): path of the .egg file

    Raises:
        ValueError: If invalid arguments are given
    """
    if not isinstance(file_path, str):
        raise ValueError("Invalid input path: must be str")
    path = os.path.abspath(file_path)
    file_name = os.path.basename(path)
    if file_name in self.__egg_file_targets:
        logger.warn("add [%s] duplicated" % file_name)
        return
    self.__egg_files.append((file_name, path))
    self.__egg_file_targets.add(file_name)
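# Usage sketch for the resource-level helpers above (file names are illustrative and
# `resource` stands for an instance of this class; these calls only stage the files
# to be packed with the job):
#
#     resource.add_file("./tools/run.sh", "run.sh", executable=True)
#     resource.add_file_from_bytes(open("model.bin", "rb").read(), "model.bin")
#     resource.add_egg_file("./deps/mylib-0.1-py2.7.egg")    # added to PYTHONPATH
#     resource.add_dynamic_library("./deps/libfoo.so")       # added to LD_LIBRARY_PATH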
def __init__(self, **pipeline_options):
    super(SparkPipeline, self).__init__(**pipeline_options)
    self._type_str = "SPARK"
    self._local_uri_infos = []
    self._default_job_name = self._get_default_job_name()

    if "hadoop_config_path" in pipeline_options:
        pipeline_options["hadoop_config_path"] = \
                os.path.abspath(pipeline_options["hadoop_config_path"])
    if "hadoop_client_path" in pipeline_options:
        pipeline_options["hadoop_client_path"] = \
                os.path.abspath(pipeline_options["hadoop_client_path"])
    if "spark_home_path" in pipeline_options:
        pipeline_options["spark_home_path"] = \
                os.path.abspath(pipeline_options["spark_home_path"])

    class _DelayParam(object):
        """ Wraps a callable whose value is resolved lazily via get(). """

        def __init__(self, fn):
            """ inner """
            self.__fn = fn

        def get(self):
            """ inner """
            return self.__fn()

    # config as pb message
    self._job_config = config_pb2.PbSparkConfig()

    def _get_reprepare_cache_archive():
        reprepare = os.getenv('BIGFLOW_REPREPARE_CACHE_ARCHIVE')
        return reprepare is not None and reprepare.lower() == 'true'

    from bigflow import serde

    # config as dict
    self._default_spark_conf = {
        "spark.app.name": self._default_job_name,
        "spark.master": "yarn",
        "spark.local.dir": ".bigflow.on.spark",
        "spark.executor.extraClassPath": "spark_launcher.jar",
        "spark.executorEnv.PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION": "python",
        "spark.hadoop.fs.file.impl": "org.apache.hadoop.fs.LocalFileSystem",
        #"spark.hadoop.fs.hdfs.impl": "org.apache.hadoop.fs.DFileSystem",
    }

    default_merging_spark_conf = {
        # (key, value, separator, prepend)
        ("spark.executor.extraLibraryPath",
         ".:__bigflow_on_spark__/flume:__bigflow_on_spark__/python_runtime/lib",
         ":",
         True)
    }

    self._config = {
        'hadoop_config_path': _DelayParam(requests.default_hadoop_config_path),
        'hadoop_client_path': _DelayParam(requests.default_hadoop_client_path),
        'spark_home_path': _DelayParam(requests.default_spark_home_path),
        'default_serde': serde.DefaultSerde(),
        'spark_conf': {},
        'reprepare_cache_archive': _DelayParam(_get_reprepare_cache_archive),
        'bigflow_version': bigflow_version.bigflow_version,
        # only cpu profiling is supported for now
        'cpu_profile': False,
        'heap_profile': False,
    }

    # update config by pipeline options
    self._config.update(pipeline_options)
    self._default_spark_conf.update(pipeline_options.get("spark_conf", {}))

    # merge spark configs which should not simply be replaced.
    for (k, v, sep, prepend) in default_merging_spark_conf:
        original_v = self._default_spark_conf.get(k)
        if original_v is None:
            self._default_spark_conf[k] = v
        else:
            merged_v = v + sep + original_v if prepend else original_v + sep + v
            self._default_spark_conf[k] = merged_v

    # accept job_name as the spark application name
    if self._config.get("job_name"):
        self._default_spark_conf["spark.app.name"] = self._config["job_name"]
    self._config["spark_conf"] = self._default_spark_conf

    # resolve delayed parameters
    for key in self._config.keys():
        if isinstance(self._config[key], _DelayParam):
            self._config[key] = self._config[key].get()

    # check that spark_home is set and valid
    spark_home_path = self._config["spark_home_path"]
    assert spark_home_path, "Spark home is not set, please specify spark home through " \
                            "Pipeline.create or by setting the SPARK_HOME environment variable"
    assert os.path.isdir(spark_home_path), \
            "Specified spark_home: %s is not a valid path" % spark_home_path

    # Insert spark's core-site.xml over the default hadoop client's config path unless
    # it is explicitly specified.
    # TODO: whether this is the right default is still debatable.
    if not ("hadoop_config_path" in pipeline_options or
            "HADOOP_CONF_PATH" in os.environ):
        self._config["hadoop_config_path"] = os.path.join(
                spark_home_path, "conf/core-site.xml")

    for (k, v) in self._config['spark_conf'].items():
        kv = self._job_config.kv_config.add()
        kv.key = k
        kv.value = v

    # set cpu and heap profiling switches.
    self._job_config.cpu_profile = self._config.get("cpu_profile", False)
    self._job_config.heap_profile = self._config.get("heap_profile", False)

    self._job_config.hadoop_config_path = self._config['hadoop_config_path']
    self._job_config.hadoop_client_path = self._config['hadoop_client_path']
    self._job_config.spark_home_path = self._config['spark_home_path']

    if 'tmp_data_path' not in self._config:
        err_msg = "Please set tmp_data_path to a writable HDFS dir " \
                  "when you use a hadoop/dagmr pipeline to run Bigflow."
        logger.warn(err_msg)
        raise error.InvalidConfException(err_msg)
    if not self._config['tmp_data_path'].startswith('hdfs://'):
        self._config['tmp_data_path'] = "hdfs://" + self._config['tmp_data_path']
        err_msg = "!!!!! Your tmp_data_path does not start with hdfs://, " \
                  "so Bigflow prepends `hdfs://` by default. !!!!!"
        logger.warn(err_msg)
    self._config['tmp_data_path'] = os.path.join(
            self._config['tmp_data_path'], self._config['bigflow_version'])
    self._job_config.tmp_data_path = self._config['tmp_data_path']

    self.prepared_archive_path = self._config['tmp_data_path'] \
            + "/" + SparkPipeline.cache_archive_file_name
    self._job_config.prepared_archive_path = self.prepared_archive_path

    if 'default_concurrency' in self._config:
        self._job_config.default_concurrency = self._config['default_concurrency']

    pb = pipeline_pb2.PbPipeline()
    pb.type = pipeline_pb2.PbPipeline.SPARK
    pb.spark_config.CopyFrom(self._job_config)

    requests.register_pipeline(pb, self.id())
    logger.debug("Register Pipeline %s OK" % self.id())

    self._pipeline_tmp_dir = os.path.join(
            self._job_config.tmp_data_path, 'pipeline', self.id())
    self._local_exception_path = os.path.join('.tmp', self.id(), 'exception')
    self._exception_path = os.path.join(
            self._pipeline_tmp_dir, 'exception_dir', 'exception')

    self._set_python_path_in_init_hooks()
    self._is_first_run = True
    self._client = hadoop_client.HadoopClient(self._job_config.hadoop_client_path,
                                              self._job_config.hadoop_config_path)
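# Usage sketch for constructing this pipeline (paths and conf values are illustrative
# assumptions, not defaults; the "spark" type token and Pipeline.create entry point
# follow the assertion message above and may differ in your Bigflow version):
#
#     from bigflow import base
#
#     pipeline = base.Pipeline.create(
#         "spark",
#         tmp_data_path="hdfs:///app/example/bigflow_tmp",  # required, see the check above
#         spark_home_path="/opt/spark",                     # or export SPARK_HOME
#         spark_conf={"spark.executor.memory": "4g"},       # merged into the defaults
#     )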