Example #1
def increase(name, increment=1):
    """
    Add `increment` to the counter identified by `name`.

    Args:
      name (str):  counter name; only str is accepted
      increment (int): the value to add; only positive numbers are accepted

    Raises:
      error.BigflowPlanningException:  this function may only be called inside user-defined
          functions (UDFs) of :mod:`Bigflow transforms<bigflow.transforms>`; otherwise this
          exception is raised

    Note:
      1. Counters have the notion of a group. If `name` has the form "group1|name1", then
         "group1" is the counter's group; if no group is given, the default group is 'Flume'.

      2. A counter belongs to a Bigflow Pipeline and accumulates across multiple runs of that
         Pipeline. To reset counters to zero, use the Pipeline's
         :meth:`reset_counter<bigflow.pipeline.pipeline_base.PipelineBase.reset_counter>` or
         :meth:`reset_all_counters<bigflow.pipeline.pipeline_base.PipelineBase.reset_all_counters>`
         method. In the current implementation, reset_counter is a global operation: it resets
         the counters defined in all pipelines. If counters must be reset per pipeline, give
         each pipeline a distinct counter identifier.

    >>> from bigflow import base, counter
    >>> _pipeline = base.Pipeline.create("LOCAL")
    >>> _p = _pipeline.parallelize([3, 7, 1])
    >>> def all_num_counting(record):
    ...     counter.increase("all_num")
    ...     return record
    ...
    >>> _p = _p.map(all_num_counting)
    >>> _p.get()
    """
    if os.getenv("__PYTHON_IN_REMOTE_SIDE", None) is None:
        raise error.BigflowPlanningException(
            "counter.increase should only be called at runtime")
    if not isinstance(name, str):
        name = str(name)
    if name not in counter_dict:
        counter_dict[name] = increment
    else:
        counter_dict[name] += increment
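
A hedged usage sketch of grouped counter names, mirroring the doctest above (the group "MyStats" and the odd-number logic are illustrative, not part of the original):

from bigflow import base, counter

_pipeline = base.Pipeline.create("LOCAL")
_p = _pipeline.parallelize([3, 7, 1])

def count_records(record):
    counter.increase("all_num")              # default group: counts as "Flume|all_num"
    if record % 2 == 1:
        counter.increase("MyStats|odd_num")  # explicit group "MyStats"
    return record

_p.map(count_records).get()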
Example #2
    def add_cache_id(self, cache_id):
        """
        Save the ptype cache node id for later use.
        """
        if not isinstance(cache_id, str):
            raise error.BigflowPlanningException(
                "the cache id to be added must be a str")
        self._cache_node_ids.append(cache_id)
Example #3
    def _broadcast(self, side_input_tuple):
        from bigflow.util import broadcast

        broadcasted = []
        for p in side_input_tuple:
            if isinstance(p, PTable):
                raise error.BigflowPlanningException(
                    "A PTable cannot be broadcasted.")
            if not broadcast.is_same_working_scope(p, self):
                raise error.BigflowPlanningException(
                    "Broadcasted values not in "
                    "correct working scope")
            broadcasted.append(
                broadcast.broadcast_to(p,
                                       broadcast.working_scope(self._value())))

        return tuple(broadcasted)
Example #4
    def set_size(self, size=None, scale_factor=1.0):
        """
        Set the data size of this node.
        """
        if size is None:
            raise error.BigflowPlanningException(
                "Empty input size for loader")
        return super(LogicalPlan.LoadNode,
                     self).set_size(size, scale_factor)
Example #5
    def _prepare_cache_archive(self):
        logger.info("Checking PreparedArchive for Spark Pipeline...")
        existed = self._client.fs_test(self.prepared_archive_path,
                                       self._hadoop_config)
        tmp_path = self.prepared_archive_path + '-' + str(uuid.uuid4())
        self._job_config.prepared_archive_path = self.prepared_archive_path
        self._job_config.tmp_data_path = tmp_path

        if self._config['reprepare_cache_archive'] or not existed:
            if self._config['reprepare_cache_archive']:
                if not existed:
                    logger.info("Bigflow PreparedArchive does not exist")
                else:
                    logger.info("Re-prepare Bigflow PreparedArchive")
                    self._client.fs_rmr(self.prepared_archive_path,
                                        self._hadoop_config)
            import subprocess

            bigflow_home = self._get_bigflow_python_home()
            local_cache_archive = "bigflow_python_%s.tar.gz" % (str(
                uuid.uuid4()))
            cmd = "tar czf %s -C %s --exclude=flume/worker python_runtime flume" % (
                local_cache_archive, bigflow_home)
            ret = subprocess.call(cmd, shell=True)
            if ret != 0:
                raise error.BigflowPlanningException(
                    "Cannot make PreparedArchive file")
            try:
                self._client.fs_put(local_cache_archive, tmp_path,
                                    self._hadoop_config)
                self._client.fs_mv(tmp_path, self.prepared_archive_path,
                                   self._hadoop_config)
            except error.BigflowHDFSException:
                # the tmp archive path only needs to be deleted when an exception occurs.
                self._remote_temp_files.append(tmp_path)
                if not self._client.fs_test(self.prepared_archive_path,
                                            self._hadoop_config):
                    msg = "Unable to upload Bigflow PreparedArchive, please " \
                          "make sure you have write permission to " \
                          "tmp_data_path['%s']" % self._config['tmp_data_path']
                    raise error.BigflowHDFSException(msg)
            finally:
                ret = subprocess.call("rm %s" % local_cache_archive,
                                      shell=True)
                self._client.fs_rmr(tmp_path, self._hadoop_config)
        else:
            logger.info("Bigflow PreparedArchive exists already")
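
The upload above follows a stage-then-rename idiom so that concurrent pipelines never see a half-written archive. A minimal sketch of that idiom, reusing the fs_put/fs_mv client calls shown above (the publish_atomically helper name is hypothetical):

import uuid

def publish_atomically(client, local_file, final_path, hadoop_config):
    # Upload to a unique staging path first, then rename into place.
    tmp_path = final_path + '-' + str(uuid.uuid4())
    client.fs_put(local_file, tmp_path, hadoop_config)
    client.fs_mv(tmp_path, final_path, hadoop_config)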
Example #6
    def parallelize(self, dataset, **options):
        """
        Map an in-memory variable to a PType instance.

        Args:
          dataset (object):  an in-memory variable of any type
          options:
                serde: the serde object used to serialize the dataset

        Returns:
          PType:  a PType representing the in-memory variable
        """
        objector = options.get("serde", self.default_objector())

        local_input_path = "./.local_input"
        if os.path.isfile(local_input_path):
            raise error.BigflowPlanningException(
                "file ./.local_input exists, "
                "cannot use it as temp directory")
        if not os.path.exists(local_input_path):
            os.makedirs(local_input_path)

        file_name = os.path.abspath(local_input_path + "/" + str(uuid.uuid4()))
        requests.write_record(file_name, utils.flatten_runtime_value(dataset),
                              objector)

        self._local_temp_files.append(file_name)

        node = self.read(input.SequenceFile(file_name, **options)).node()

        nested_level, ptype = utils.detect_ptype(dataset)

        if nested_level < 0:
            return utils.construct(self, node, ptype)
        else:
            from bigflow.transform_impls import group_by

            for i in range(0, nested_level + 1):
                node = group_by.node_group_by(
                    node,
                    lambda x: x[0],
                    lambda x: x[1] if len(x) == 2 else x[1:len(x)],
                    key_serde=self.default_objector(),
                    value_serde=self.default_objector())

            return utils.construct(self, node, ptable.PTable, nested_level,
                                   ptype)
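
A minimal usage sketch with a LOCAL pipeline, as in the counter doctest above (that a nested dict round-trips as a PTable follows from the nested-level detection here and is stated as an assumption):

from bigflow import base

_pipeline = base.Pipeline.create("LOCAL")

_p1 = _pipeline.parallelize([1, 2, 3])                # flat list   -> PCollection
_p2 = _pipeline.parallelize({"a": [1, 2], "b": [3]})  # nested dict -> PTable

print(_p1.get())   # [1, 2, 3]
print(_p2.get())   # {"a": [1, 2], "b": [3]}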
Example #7
    def _set_after_run_hook(self, name, callback):
        """ Register a hook to be executed after pipeline.run().
            Hooks are executed in sorted order of their registered names.

        todo: deal with callback with parameters. Users can always use closure to convert a callback
              with parameters to a zero-parameter callback

        :param name: hook name
        :param callback: a zero-argument callback
        :return: None

        .. note:: This function is provided for advanced usage, please make sure you know what
                  you are doing.
        """
        if callable(callback):
            self._after_run_hooks[name] = (callback, )
        else:
            raise error.BigflowPlanningException(
                "Cannot register a non-callable object: %s" % str(callback))
Example #8
    def node(self):
        """
        Return the Node corresponding to this PTable.

        Returns:
          LogicalPlan.Node:  node

        Raises:
          BigflowPlanningException:  if the Node cannot be obtained

        .. note:: Users should not call this method directly.

        """
        if self._node is None:
            raise error.BigflowPlanningException(
                "No node in PTable (whose value is %s), "
                "such transform(s) is not supported." % (str(self._value())))
        return self._node
Example #9
    def with_compression(self, compression_type):
        """
        Compress the output files.

        Args:
          compression_type (str):  compression format; currently only "gzip" is supported

        Returns:
          TextFile:  self
        """
        if compression_type in TextFile.compression_types:
            self.output_format.compression_type = TextFile.compression_types[
                compression_type]
        else:
            raise error.BigflowPlanningException(
                "Unsupported compression type,"
                " must be one of: %s" % TextFile.compression_types.keys())

        return self
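
A hedged usage sketch, assuming the TextFile sink is exposed as bigflow.output.TextFile and that the result is written via pipeline.write():

from bigflow import output

target = output.TextFile("./compressed_out").with_compression("gzip")
_pipeline.write(_p, target)   # output part files are written gzip-compressed
_pipeline.run()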
Example #10
def _get(name, group=None):
    """ Get the value of the counter whose counter_name is `name`.

    If `name` is canonical, i.e. of the form 'group|name', the second `group` parameter is
    ignored. Otherwise `group` is prepended to `name` to form the counter_name. The default
    group name is 'Flume' if no group is supplied.

    .. note:: To get a counter that has `|` in its name, 'g1|a|b' for example, use _get('g1|a|b')
              rather than _get('a|b', 'g1')
    """
    if os.getenv("__PYTHON_IN_REMOTE_SIDE", None) is not None:
        raise error.BigflowPlanningException(
            "counter.get should not be called at runtime")
    from bigflow.rpc import requests
    result_counters = requests.get_counters()

    group_name = "Flume" if group is None else str(group)
    for index, counter_name in enumerate(result_counters.name):
        counter_key = name if "|" in name else group_name + "|" + name
        if counter_name == counter_key:
            return result_counters.value[index]
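
Illustrative lookups mirroring the note above (driver side only, after a run has populated the counters; the counter names are examples):

from bigflow import counter

counter._get("all_num")             # reads "Flume|all_num" (default group)
counter._get("odd_num", "MyStats")  # reads "MyStats|odd_num"
counter._get("g1|a|b")              # name is already canonical; the group argument is ignored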
Example #11
        def _get_file_size(uri):
            cmd = list()
            if uri.startswith("hdfs://"):
                fs_name_from_path = hadoop_client.extract_fs_name_from_path(
                    uri)
                replace_explicit_fs_name = False

                config = pipeline.config()
                cmd.append(config.hadoop_client_path)
                cmd.append("fs")
                for kv in config.hadoop_job_conf:
                    if kv.key == "fs.defaultFS" and fs_name_from_path is not None:
                        cmd.extend(["-D", kv.key + "=" + fs_name_from_path])
                        replace_explicit_fs_name = True
                    else:
                        cmd.extend(["-D", kv.key + "=" + kv.value])
                if not replace_explicit_fs_name and fs_name_from_path is not None:
                    cmd.extend(["-D fs.defaultFS=" + fs_name_from_path])

                cmd.append("-conf %s" % config.hadoop_config_path)
                cmd.append("-dus %s | cut -f 2" % uri)
            else:
                cmd.append("du -s -b %s | cut -f 1" % uri)

            process = subprocess.Popen(" ".join(cmd),
                                       stdout=subprocess.PIPE,
                                       shell=True)
            ret = process.wait()
            if ret != 0:
                raise error.BigflowRPCException(
                    "Error getting file size for uri: %s" % uri)

            size = 0
            try:
                for line in process.stdout.readlines():
                    size += int(line.strip())
            except Exception as e:
                raise error.BigflowPlanningException("Cannot get input size",
                                                     e)
            return size
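
For reference, the local-filesystem branch boils down to the standalone sketch below; the hdfs branch builds the analogous "hadoop fs ... -dus <uri> | cut -f 2" command line:

import subprocess

def _local_size(path):
    # `du -s -b` prints "<bytes>\t<path>"; `cut -f 1` keeps only the byte count.
    out = subprocess.check_output("du -s -b %s | cut -f 1" % path, shell=True).decode()
    return sum(int(line.strip()) for line in out.splitlines() if line.strip())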
Example #12
def _get_all(grouped=False):
    """ Get a dict that includes all counters. The group name is prepended to each key in the
        returned dict if `grouped` is False.

    :param grouped: boolean, whether the returned dict should be grouped by group name
    :return: dict
    """
    if os.getenv("__PYTHON_IN_REMOTE_SIDE", None) is not None:
        raise error.BigflowPlanningException(
            "counter.get_all should not be called at runtime")
    from bigflow.rpc import requests
    result_counters = requests.get_counters()

    c_dict = {}
    if not grouped:
        for index, counter_name in enumerate(result_counters.name):
            c_dict[counter_name] = result_counters.value[index]
    else:
        for index, counter_name in enumerate(result_counters.name):
            group, name = "Flume", counter_name
            if "|" in counter_name:
                group, name = counter_name.split("|", 1)
            c_dict.setdefault(group, {})[name] = result_counters.value[index]
    return c_dict
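
Illustrative return shapes, assuming a single counter "Flume|all_num" with value 3:

from bigflow import counter

counter._get_all()              # {"Flume|all_num": 3}
counter._get_all(grouped=True)  # {"Flume": {"all_num": 3}}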
Example #13
    def _get_bigflow_python_home(self):
        bigflow_home = os.getenv("BIGFLOW_PYTHON_HOME")
        if bigflow_home is None:
            raise error.BigflowPlanningException(
                "BIGFLOW_PYTHON_HOME is not set!")
        return bigflow_home
Example #14
    def __init_server(self, path=None, params=[], port=None):
        cmd = []
        if path is None:
            root = os.path.dirname(bigflow.__file__)
            path = os.getenv("BIGFLOW_SERVER_PATH",
                             "%s/../flume/worker" % root)

        cmd.append(path)

        bigflow_home = os.getenv("BIGFLOW_PYTHON_HOME")

        if bigflow_home is None:
            raise error.BigflowPlanningException(
                "BIGFLOW_PYTHON_HOME is not set!")

        bigflow_home = os.path.abspath(bigflow_home)

        cmd.append("--flume_planner_max_steps=20000")
        keep_resource = os.getenv('BIGFLOW_PYTHON_KEEP_RESOURCE', "false")
        cmd.append("--bigflow_python_keep_resource=" + keep_resource)

        if port is not None:
            cmd.append("--service_port=%d" % port)

        for param in params:
            cmd.append(param)
        self.server = process_util.Subprocess(cmd)

        log_file = os.getenv("BIGFLOW_LOG_FILE_BACKEND")
        if log_file:
            log_file = os.path.abspath(log_file + ".log")
            log_dir = os.path.dirname(log_file)
            if not os.path.isdir(log_dir):
                try:
                    os.makedirs(log_dir)
                except Exception as e:
                    raise error.BigflowPlanningException(
                        "Cannot create log file directory [%s]" % log_dir, e)

            def _log_printer(line):
                self.__backend_log_file.write(line)

            #log.logger.info(cmd)
            log.logger.info("Bigflow Backend log is written to [%s]" %
                            log_file)
            self.__backend_log_file = open(log_file, "w")

            def _stdout_printer(line):
                sys.stderr.write(line)
                sys.stderr.flush()

            self.server.add_stderr_listener(_log_printer)
            self.server.add_stdout_listener(_stdout_printer)
            self.server.add_stderr_listener(backend_parser.backend_parser)

        else:

            def _stderr_printer(line):
                sys.stderr.write(line)
                sys.stderr.flush()

            def _stdout_printer(line):
                sys.stderr.write(line)
                sys.stderr.flush()

            log.logger.info("Bigflow Backend log is written to STDERR")

            self.server.add_stderr_listener(_stderr_printer)
            self.server.add_stdout_listener(_stdout_printer)

        self.server.add_stderr_listener(self._get_service_port)
        return self.server.open()
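
A hedged sketch of the environment variables this method reads; they must be set before the Pipeline is created (all values below are illustrative):

import os

os.environ["BIGFLOW_PYTHON_HOME"] = "/opt/bigflow/python"          # required by the backend
os.environ["BIGFLOW_SERVER_PATH"] = "/opt/bigflow/flume/worker"    # optional: explicit worker binary path
os.environ["BIGFLOW_PYTHON_KEEP_RESOURCE"] = "true"                # optional: keep temporary resources
os.environ["BIGFLOW_LOG_FILE_BACKEND"] = "/tmp/bigflow_backend"    # optional: backend log prefix (".log" is appended)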