def increase(name, increment=1):
    """
    Increase the counter identified by ``name`` by ``increment``.

    Args:
        name (str): counter name; only str is accepted
        increment (int): value to add; only positive numbers are accepted

    Raises:
        error.BigflowPlanningException: this function may only be called
            inside user-defined functions (UDFs) of
            :mod:`Bigflow transforms<bigflow.transforms>`; otherwise this
            exception is raised

    Note:
        1. Counters are grouped. If ``name`` has the form "group1|name1",
           the counter belongs to group ``group1``; otherwise the default
           group is 'Flume'.

        2. A counter belongs to a Bigflow Pipeline and keeps accumulating
           across multiple runs of that Pipeline. To reset counters, use
           the Pipeline's
           :meth:`reset_counter<bigflow.pipeline.pipeline_base.PipelineBase.reset_counter>`
           or
           :meth:`reset_all_counters<bigflow.pipeline.pipeline_base.PipelineBase.reset_all_counters>`
           methods. In the current implementation, reset_counter is a
           global operation: it resets the counters defined in all
           pipelines. If you need to reset counters per pipeline, set a
           distinct counter identifier for each pipeline.

    >>> from bigflow import base, counter
    >>> _pipeline = base.Pipeline.create("LOCAL")
    >>> _p = _pipeline.parallelize([3, 7, 1])
    >>> def all_num_counting(record):
    ...     counter.increase("all_num")
    ...     return record
    ...
    >>> _p = _p.map(all_num_counting)
    >>> _p.get()
    """
    if os.getenv("__PYTHON_IN_REMOTE_SIDE", None) is None:
        raise error.BigflowPlanningException(
                "counter.increase should only be called at runtime")
    if type(name) is not types.StringType:
        name = str(name)
    counter_dict[name] = counter_dict.get(name, 0) + increment
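# A minimal usage sketch for the "group|name" form (illustrative; the group
# name 'stat' and counter name 'odd_num' are made up). Assumes a LOCAL
# backend, as in the doctest above.
#
# from bigflow import base, counter
# _pipeline = base.Pipeline.create("LOCAL")
# _p = _pipeline.parallelize([3, 7, 1])
#
# def count_odds(record):
#     if record % 2 == 1:
#         counter.increase("stat|odd_num")  # group 'stat', counter 'odd_num'
#     return record
#
# _p.map(count_odds).get()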
def add_cache_id(self, cache_id):
    """ Save the cache node id of this ptype for later use. """
    if not isinstance(cache_id, str):
        raise error.BigflowPlanningException(
                "cache id to be added should be str")
    self._cache_node_ids.append(cache_id)
def _broadcast(self, side_input_tuple):
    from bigflow.util import broadcast
    broadcasted = []
    for p in side_input_tuple:
        if isinstance(p, PTable):
            raise error.BigflowPlanningException(
                    "PTable cannot be broadcasted.")
        if not broadcast.is_same_working_scope(p, self):
            raise error.BigflowPlanningException(
                    "Broadcasted values are not in the correct working scope")
        broadcasted.append(
                broadcast.broadcast_to(p, broadcast.working_scope(self._value())))
    return tuple(broadcasted)
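# Illustrative sketch of what _broadcast enables (assumption: side inputs
# are passed as extra arguments to a transform and arrive as iterables in
# the UDF; PTables are rejected by the check above).
#
# from bigflow import base
# _pipeline = base.Pipeline.create("LOCAL")
# _main = _pipeline.parallelize([1, 2, 3])
# _side = _pipeline.parallelize([10])
# # Each record of _main sees the full content of _side at runtime.
# _main.map(lambda x, s: x + sum(s), _side).get()  # [11, 12, 13]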
def set_size(self, size=None, scale_factor=1.0):
    """ Set the data size of this node """
    if size is None:
        raise error.BigflowPlanningException("Empty input size for loader")
    return super(LogicalPlan.LoadNode, self).set_size(size, scale_factor)
def _prepare_cache_archive(self):
    logger.info("Checking PreparedArchive for Spark Pipeline...")
    existed = self._client.fs_test(self.prepared_archive_path, self._hadoop_config)
    tmp_path = self.prepared_archive_path + '-' + str(uuid.uuid4())
    self._job_config.prepared_archive_path = self.prepared_archive_path
    self._job_config.tmp_data_path = tmp_path
    if self._config['reprepare_cache_archive'] or not existed:
        if self._config['reprepare_cache_archive']:
            if not existed:
                logger.info("Bigflow PreparedArchive does not exist")
            else:
                logger.info("Re-preparing Bigflow PreparedArchive")
                self._client.fs_rmr(self.prepared_archive_path, self._hadoop_config)
        import subprocess
        bigflow_home = self._get_bigflow_python_home()
        local_cache_archive = "bigflow_python_%s.tar.gz" % str(uuid.uuid4())
        cmd = "tar czf %s -C %s --exclude=flume/worker python_runtime flume" % (
                local_cache_archive, bigflow_home)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            raise error.BigflowPlanningException(
                    "Cannot make PreparedArchive file")
        try:
            self._client.fs_put(local_cache_archive, tmp_path, self._hadoop_config)
            self._client.fs_mv(tmp_path, self.prepared_archive_path,
                               self._hadoop_config)
        except error.BigflowHDFSException:
            # The tmp path only needs to be cleaned up when an exception occurs.
            self._remote_temp_files.append(tmp_path)
            if not self._client.fs_test(self.prepared_archive_path,
                                        self._hadoop_config):
                msg = "Unable to upload Bigflow PreparedArchive, please " \
                      "make sure you have write permission to " \
                      "tmp_data_path['%s']" % self._config['tmp_data_path']
                raise error.BigflowHDFSException(msg)
        finally:
            subprocess.call("rm %s" % local_cache_archive, shell=True)
            self._client.fs_rmr(tmp_path, self._hadoop_config)
    else:
        logger.info("Bigflow PreparedArchive exists already")
def parallelize(self, dataset, **options):
    """
    Map an in-memory variable to a PType instance.

    Args:
        dataset (object): an in-memory variable of any type
        options:
            serde: the serde object used to serialize ``dataset``

    Returns:
        PType: a PType instance representing the in-memory variable
    """
    objector = options.get("serde", self.default_objector())
    local_input_path = "./.local_input"
    if os.path.isfile(local_input_path):
        raise error.BigflowPlanningException(
                "file ./.local_input exists, cannot use it as a temp directory")
    if not os.path.exists(local_input_path):
        os.makedirs(local_input_path)
    file_name = os.path.abspath(local_input_path + "/" + str(uuid.uuid4()))
    requests.write_record(file_name, utils.flatten_runtime_value(dataset),
                          objector)
    self._local_temp_files.append(file_name)
    node = self.read(input.SequenceFile(file_name, **options)).node()
    nested_level, ptype = utils.detect_ptype(dataset)
    if nested_level < 0:
        return utils.construct(self, node, ptype)
    else:
        from bigflow.transform_impls import group_by
        for i in range(0, nested_level + 1):
            node = group_by.node_group_by(
                    node,
                    lambda x: x[0],
                    lambda x: x[1] if len(x) == 2 else x[1:len(x)],
                    key_serde=self.default_objector(),
                    value_serde=self.default_objector())
        return utils.construct(self, node, ptable.PTable, nested_level, ptype)
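# Usage sketch (illustrative): a flat list maps to a PCollection
# (nested_level < 0 above), while a nested dict maps to a PTable via the
# group-by path.
#
# from bigflow import base
# _pipeline = base.Pipeline.create("LOCAL")
# _numbers = _pipeline.parallelize([1, 2, 3])           # PCollection
# _table = _pipeline.parallelize({"a": [1], "b": [2]})  # PTable
# print(_numbers.get())  # [1, 2, 3]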
def _set_after_run_hook(self, name, callback):
    """
    Register a hook that is executed after pipeline.run().

    Hooks are executed in the sorted order of their registered names.

    todo: deal with callbacks that take parameters. Users can always use a
    closure to convert a callback with parameters into a zero-parameter
    callback.

    :param name: hook name
    :param callback: a callback taking no parameters
    :return: None

    ..Note: This function is provided for advanced usage, please make sure
            you know what you are doing.
    """
    if callable(callback):
        self._after_run_hooks[name] = (callback, )
    else:
        raise error.BigflowPlanningException(
                "Cannot register a non-callable object: %s" % str(callback))
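# Usage sketch (illustrative): the todo above suggests closures for hooks
# that need parameters. `job_name` is a made-up parameter.
#
# def make_notify_hook(job_name):
#     def hook():
#         print("pipeline run finished: %s" % job_name)
#     return hook
#
# # Names are sorted before execution, so "00_..." runs before "10_...".
# _pipeline._set_after_run_hook("00_notify", make_notify_hook("wordcount"))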
def node(self):
    """
    Return the Node corresponding to this PTable.

    Returns:
        LogicalPlan.Node: node

    Raises:
        BigflowPlanningException: if the Node cannot be obtained

    .. note:: Users should not call this method.
    """
    if self._node is None:
        raise error.BigflowPlanningException(
                "No node in PTable (whose value is %s), "
                "such transform(s) is not supported." % str(self._value()))
    return self._node
def with_compression(self, compression_type):
    """
    Compress the output files.

    Args:
        compression_type (str): compression format; currently only "gzip"
            is supported

    Returns:
        TextFile: self
    """
    if compression_type in TextFile.compression_types:
        self.output_format.compression_type = \
                TextFile.compression_types[compression_type]
    else:
        raise error.BigflowPlanningException(
                "Unsupported compression type, must be one of: %s"
                % TextFile.compression_types.keys())
    return self
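# Usage sketch (illustrative; assumes TextFile is exposed as
# bigflow.output.TextFile and the output path is writable):
#
# from bigflow import base, output
# _pipeline = base.Pipeline.create("LOCAL")
# _p = _pipeline.parallelize(["a", "b"])
# _pipeline.write(_p, output.TextFile("./out").with_compression("gzip"))
# _pipeline.run()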
def _get(name, group=None):
    """
    Get the value of the counter identified by ``name``.

    If name is canonical, a.k.a. of the form 'group|name', the second
    parameter ``group`` is ignored. Otherwise ``group`` is prepended to
    the name to generate the counter name. The default group name is
    'Flume' if no group is supplied.

    ... Note: To get a counter that has `|` in its name, 'g1|a|b' for
    example, use _get('g1|a|b') rather than _get('a|b', 'g1')
    """
    if os.getenv("__PYTHON_IN_REMOTE_SIDE", None) is not None:
        raise error.BigflowPlanningException(
                "counter.get should not be called at runtime")
    from bigflow.rpc import requests
    result_counters = requests.get_counters()
    group_name = "Flume" if group is None else str(group)
    counter_key = name if "|" in name else group_name + "|" + name
    for index, counter_name in enumerate(result_counters.name):
        if counter_name == counter_key:
            return result_counters.value[index]
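# Illustrative calls showing how the counter key is resolved (counter
# names are made up):
#
# _get("stat|odd_num")     # canonical form; the group argument is ignored
# _get("odd_num", "stat")  # resolves to "stat|odd_num"
# _get("all_num")          # bare name; resolves to "Flume|all_num"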
def _get_file_size(uri):
    cmd = list()
    if uri.startswith("hdfs://"):
        fs_name_from_path = hadoop_client.extract_fs_name_from_path(uri)
        replace_explicit_fs_name = False
        config = pipeline.config()
        cmd.append(config.hadoop_client_path)
        cmd.append("fs")
        for kv in config.hadoop_job_conf:
            if kv.key == "fs.defaultFS" and fs_name_from_path is not None:
                # Override the configured fs.defaultFS with the one
                # extracted from the uri.
                cmd.extend(["-D", kv.key + "=" + fs_name_from_path])
                replace_explicit_fs_name = True
            else:
                cmd.extend(["-D", kv.key + "=" + kv.value])
        if not replace_explicit_fs_name and fs_name_from_path is not None:
            cmd.extend(["-D fs.defaultFS=" + fs_name_from_path])
        cmd.append("-conf %s" % config.hadoop_config_path)
        cmd.append("-dus %s | cut -f 2" % uri)
    else:
        cmd.append("du -s -b %s | cut -f 1" % uri)
    process = subprocess.Popen(" ".join(cmd), stdout=subprocess.PIPE,
                               shell=True)
    ret = process.wait()
    if ret != 0:
        raise error.BigflowRPCException(
                "Error getting file size for uri: %s" % uri)
    size = 0
    try:
        for line in process.stdout.readlines():
            size += int(line.strip())
    except Exception as e:
        raise error.BigflowPlanningException("Cannot get input size", e)
    return size
def _get_all(grouped=False):
    """
    Get a dict that includes all counters.

    The group name is prepended to each key of the returned dict if
    ``grouped`` is False.

    :param grouped: boolean, whether the returned dict should be grouped
                    by group name
    :return: dict
    """
    if os.getenv("__PYTHON_IN_REMOTE_SIDE", None) is not None:
        raise error.BigflowPlanningException(
                "counter.get_all should not be called at runtime")
    from bigflow.rpc import requests
    result_counters = requests.get_counters()
    c_dict = {}
    if not grouped:
        for index, counter_name in enumerate(result_counters.name):
            c_dict[counter_name] = result_counters.value[index]
    else:
        for index, counter_name in enumerate(result_counters.name):
            group, name = "Flume", counter_name
            if "|" in counter_name:
                group, name = counter_name.split("|", 1)
            c_dict.setdefault(group, {})[name] = result_counters.value[index]
    return c_dict
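# Illustrative return shapes (counter names and values are made up):
#
# _get_all()             # {"Flume|all_num": 3, "stat|odd_num": 2}
# _get_all(grouped=True) # {"Flume": {"all_num": 3}, "stat": {"odd_num": 2}}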
def _get_bigflow_python_home(self):
    bigflow_home = os.getenv("BIGFLOW_PYTHON_HOME")
    if bigflow_home is None:
        raise error.BigflowPlanningException("BIGFLOW_PYTHON_HOME is not set!")
    return bigflow_home
def __init_server(self, path=None, params=[], port=None):
    cmd = []
    if path is None:
        root = os.path.dirname(bigflow.__file__)
        path = os.getenv("BIGFLOW_SERVER_PATH", "%s/../flume/worker" % root)
    cmd.append(path)
    bigflow_home = os.getenv("BIGFLOW_PYTHON_HOME")
    if bigflow_home is None:
        raise error.BigflowPlanningException("BIGFLOW_PYTHON_HOME is not set!")
    bigflow_home = os.path.abspath(bigflow_home)
    cmd.append("--flume_planner_max_steps=20000")
    keep_resource = os.getenv('BIGFLOW_PYTHON_KEEP_RESOURCE', "false")
    cmd.append("--bigflow_python_keep_resource=" + keep_resource)
    if port is not None:
        cmd.append("--service_port=%d" % port)
    for param in params:
        cmd.append(param)
    self.server = process_util.Subprocess(cmd)
    log_file = os.getenv("BIGFLOW_LOG_FILE_BACKEND")
    if log_file:
        log_file = os.path.abspath(log_file + ".log")
        log_dir = os.path.dirname(log_file)
        if not os.path.isdir(log_dir):
            try:
                os.makedirs(log_dir)
            except Exception as e:
                raise error.BigflowPlanningException(
                        "Cannot create log file directory [%s]" % log_dir, e)

        def _log_printer(line):
            self.__backend_log_file.write(line)

        #log.logger.info(cmd)
        log.logger.info("Bigflow Backend log is written to [%s]" % log_file)
        self.__backend_log_file = open(log_file, "w")

        def _stdout_printer(line):
            sys.stderr.write(line)
            sys.stderr.flush()

        self.server.add_stderr_listener(_log_printer)
        self.server.add_stdout_listener(_stdout_printer)
        self.server.add_stderr_listener(backend_parser.backend_parser)
    else:
        def _stderr_printer(line):
            sys.stderr.write(line)
            sys.stderr.flush()

        def _stdout_printer(line):
            sys.stderr.write(line)
            sys.stderr.flush()

        log.logger.info("Bigflow Backend log is written to STDERR")
        self.server.add_stderr_listener(_stderr_printer)
        self.server.add_stdout_listener(_stdout_printer)
    self.server.add_stderr_listener(self._get_service_port)
    return self.server.open()