Example #1
    def allocate(self, size):
        logger.debug("allocating {0[0]:.2f} {0[1]} in cache '{1}'".format(
            human_bytes(size), self))

        with self._lock_global():
            # determine stats and current cache size
            file_stats = []
            for elem in os.listdir(self.base):
                if elem.endswith(self.lock_postfix):
                    continue
                cpath = os.path.join(self.base, elem)
                file_stats.append((cpath, os.stat(cpath)))
            current_size = sum(stat.st_size for _, stat in file_stats)

            # get the available space of the disk that contains the cache
            fs_stat = os.statvfs(self.base)
            full_size = fs_stat.f_frsize * fs_stat.f_blocks
            free_size = fs_stat.f_frsize * fs_stat.f_bavail

            # leave 10% total free space
            free_size -= 0.1 * full_size
            full_size *= 0.9

            # make sure max_size is always smaller than what is actually possible
            if self.max_size < 0:
                max_size = current_size + free_size
            else:
                max_size = min(self.max_size, current_size + free_size)

            # determine the size of files that need to be deleted
            delete_size = current_size + size - max_size
            if delete_size <= 0:
                logger.debug(
                    "cache space sufficient, {0[0]:.2f} {0[1]} remaining"
                    .format(human_bytes(-delete_size)))
                return

            logger.debug("need to delete {0[0]:.2f} {0[1]}".format(
                human_bytes(delete_size)))

            # delete files, ordered by their access time, skip locked ones
            for cpath, cstat in sorted(file_stats,
                                       key=lambda tpl: tpl[1].st_atime):
                if self._is_locked(cpath):
                    continue
                self._remove(cpath)
                delete_size -= cstat.st_size
                if delete_size <= 0:
                    break
            else:
                # no break above: even deleting all unlocked files did not free enough space
                logger.warning(
                    "could not allocate remaining {0[0]:.2f} {0[1]}".format(
                        human_bytes(delete_size)))
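
Every snippet on this page unpacks the return value of human_bytes through the "{0[0]:.2f} {0[1]}" pattern, so the helper is assumed to yield a (value, unit) 2-tuple. A minimal illustrative sketch of that contract (the real implementation lives in law.util; the unit and fmt handling below is inferred from the usages in Examples #3 to #5):

import math

def human_bytes(size, unit=None, fmt=False):
    # illustrative only: convert a byte count into a (value, unit) pair,
    # e.g. human_bytes(2048) -> (2.0, "kB"); a fixed target *unit* skips the
    # automatic scaling, and a truthy *fmt* returns a formatted string instead
    units = ["bytes", "kB", "MB", "GB", "TB", "PB", "EB"]
    if unit:
        idx = units.index(unit)
    else:
        idx = min(int(math.log(size, 1024)) if size > 0 else 0, len(units) - 1)
    value = size / 1024.0**idx
    if fmt:
        fmt = "{:.1f} {}" if fmt is True else fmt
        return fmt.format(value, units[idx])
    return value, units[idx]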
Example #2
    def allocate(self, size):
        logger.debug("allocating {0[0]:.2f} {0[1]} in cache '{1}'".format(
            human_bytes(size), self))

        # determine stats and current cache size
        file_stats = []
        for elem in os.listdir(self.base):
            if elem.endswith(self.lock_postfix):
                continue
            cpath = os.path.join(self.base, elem)
            file_stats.append((cpath, os.stat(cpath)))
        current_size = sum(stat.st_size for _, stat in file_stats)

        # get the available space of the disk that contains the cache in bytes, keeping 10% free
        fs_stat = os.statvfs(self.base)
        free_size = fs_stat.f_frsize * fs_stat.f_bavail * 0.9

        # determine the maximum size of the cache
        # make sure it is always smaller than what is available
        if self.max_size <= 0:
            max_size = current_size + free_size
        else:
            max_size = min(self.max_size * 1024**2, current_size + free_size)

        # determine the size of files that need to be deleted
        delete_size = current_size + size - max_size
        if delete_size <= 0:
            logger.debug(
                "cache space sufficient, {0[0]:.2f} {0[1]} remaining".format(
                    human_bytes(-delete_size)))
            return True

        logger.info("need to delete {0[0]:.2f} {0[1]} from cache".format(
            human_bytes(delete_size)))

        # delete files, ordered by their access time, skip locked ones
        for cpath, cstat in sorted(file_stats,
                                   key=lambda tpl: tpl[1].st_atime):
            if self._is_locked(cpath):
                continue
            self._remove(cpath)
            delete_size -= cstat.st_size
            if delete_size <= 0:
                return True

        logger.warning(
            "could not allocate remaining {0[0]:.2f} {0[1]} in cache".format(
                human_bytes(delete_size)))

        return False
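
The boolean return value of this later revision lets callers guard the actual write. A hedged sketch of such a caller (the cache object and its base attribute are taken from the snippet above; the helper itself is hypothetical):

import os
import shutil

def add_to_cache(cache, src_path):
    # hypothetical caller: copy a file into the cache directory only if
    # allocate() freed or confirmed enough space for it
    size = os.stat(src_path).st_size
    if not cache.allocate(size):
        raise RuntimeError("could not allocate {} bytes in cache".format(size))
    shutil.copy2(src_path, os.path.join(cache.base, os.path.basename(src_path)))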
Example #3
    def serialize(self, value):
        """Serialize *value*, interpreted in this parameter's unit, into a
        compact string such as ``"5MB"``; :py:func:`try_int` drops a trailing ``.0``.
        """
        if not value:
            value = 0

        value_bytes = parse_bytes(value, input_unit=self.unit, unit="bytes")
        v, u = human_bytes(value_bytes, unit=self.unit)

        return "{}{}".format(try_int(v), u)
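
An illustrative round trip through this method, assuming a parameter instance with unit "MB" and law's parse_bytes semantics (the exact accepted input forms are an assumption):

# hypothetical parameter with unit="MB"
# param.serialize(5)       -> "5MB"   (try_int drops the trailing ".0")
# param.serialize("0.5GB") -> "512MB" (assuming parse_bytes accepts unit-suffixed strings)
# param.serialize(None)    -> "0MB"   (falsy values are coerced to 0 first)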
Example #4
File: file.py Project: riga/law
    def _repr_pairs(self, color=True):
        pairs = super(FileSystemTarget, self)._repr_pairs()

        # add the path
        cfg = Config.instance()
        expand = cfg.get_expanded_boolean("target", "expand_path_repr")
        pairs.append(("path", self.path if expand else self.unexpanded_path))

        # optionally add the file size
        if cfg.get_expanded_boolean("target", "filesize_repr"):
            stat = self.exists(stat=True)
            pairs.append(("size", human_bytes(stat.st_size, fmt="{:.1f}{}") if stat else "-"))

        return pairs
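
Both options are read from the [target] section of the law config; a minimal law.cfg sketch enabling the behavior shown above:

[target]
expand_path_repr = True
filesize_repr = True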
Example #5
    def merge(inputs, output):
        with task.publish_step("merging {} parquet files ...".format(len(inputs)),
                               runtime=True):
            # clear the output if necessary
            if output.exists() and force:
                output.remove()

            if len(inputs) == 1:
                output.copy_from_local(inputs[0])
            else:
                # merge
                merge_parquet_files([inp.path for inp in inputs],
                                    output.path,
                                    writer_opts=writer_opts)

        # print the size
        output_size = human_bytes(output.stat().st_size, fmt=True)
        task.publish_message(f"merged file size: {output_size}")
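
merge closes over task, force, and writer_opts from an enclosing scope that this excerpt omits; a hedged sketch of that context (the writer option shown is an assumption, not confirmed law API):

# hypothetical enclosing scope providing the names the closure above relies on
force = True                             # remove an existing output before merging
writer_opts = {"compression": "snappy"}  # forwarded to merge_parquet_files (option name assumed)
# `task` is the law.Task whose publish_step/publish_message calls render the progress output
merge(inputs, output)                    # inputs and output are law file targets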
Example #6
File: util.py Project: meliache/law
def hadd_task(task, inputs, output, cwd=None, local=False, force=True):
    """
    This method is intended to be used by tasks that are supposed to merge ROOT files, e.g. when
    inheriting from :py:class:`law.contrib.tasks.MergeCascade`. *inputs* should be a sequence of
    local targets that represent the files to merge into *output*. *cwd* is the working directory
    in which hadd is invoked. When empty, a temporary directory is used. The *task* itself is
    used to print and publish messages via its :py:meth:`law.Task.publish_message` and
    :py:meth:`law.Task.publish_step` methods.

    When *local* is *True*, the input and output targets are assumed to be local and the merging is
    based on their local paths. Otherwise, the targets are fetched first and the output target is
    localized.

    When *force* is *True*, any existing output file is overwritten (by adding the ``-f`` flag to
    ``hadd``).
    """
    # ensure inputs are targets
    inputs = [
        LocalFileTarget(inp) if isinstance(inp, six.string_types) else inp
        for inp in inputs
    ]

    # ensure output is a target
    if isinstance(output, six.string_types):
        output = LocalFileTarget(output)

    # default cwd
    if not cwd:
        cwd = LocalDirectoryTarget(is_tmp=True)
    elif isinstance(cwd, six.string_types):
        cwd = LocalDirectoryTarget(cwd)
    cwd.touch()

    # helper to create the hadd cmd
    def hadd_cmd(input_paths, output_path):
        cmd = ["hadd", "-n", "0"]
        if force:
            cmd.append("-f")
        cmd.extend(["-d", cwd.path])
        cmd.append(output_path)
        cmd.extend(input_paths)
        return quote_cmd(cmd)

    if local:
        # when local, there is no need to download inputs
        input_paths = [inp.path for inp in inputs]

        with task.publish_step("merging ...", runtime=True):
            if len(inputs) == 1:
                output.copy_from_local(inputs[0])
            else:
                # merge using hadd
                cmd = hadd_cmd(input_paths, output.path)
                code = interruptable_popen(cmd, shell=True, executable="/bin/bash")[0]
                if code != 0:
                    raise Exception("hadd failed")

        task.publish_message("merged file size: {}".format(human_bytes(
            output.stat.st_size, fmt=True)))

    else:
        # when not local, we need to fetch files first into the cwd
        with task.publish_step("fetching inputs ...", runtime=True):
            def fetch(inp):
                inp.copy_to_local(cwd.child(inp.unique_basename, type="f"), cache=False)
                return inp.unique_basename

            def callback(i):
                task.publish_message("fetch file {} / {}".format(i + 1, len(inputs)))

            bases = map_verbose(fetch, inputs, every=5, callback=callback)

        # start merging into the localized output
        with output.localize("w", cache=False) as tmp_out:
            with task.publish_step("merging ...", runtime=True):
                if len(bases) == 1:
                    tmp_out.path = cwd.child(bases[0]).path
                else:
                    # merge using hadd
                    cmd = hadd_cmd(bases, tmp_out.path)
                    code = interruptable_popen(cmd, shell=True, executable="/bin/bash",
                        cwd=cwd.path)[0]
                    if code != 0:
                        raise Exception("hadd failed")

            task.publish_message("merged file size: {}".format(human_bytes(
                tmp_out.stat.st_size, fmt=True)))
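
A hedged usage sketch following the docstring above; the file names are hypothetical, and string inputs are promoted to LocalFileTarget automatically:

# merge two local ROOT files into merged.root, overwriting a previous result via -f
hadd_task(task, ["part1.root", "part2.root"], "merged.root", local=True, force=True)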
Example #7
def fetch_task_output(task,
                      max_depth=0,
                      mode=None,
                      target_dir=".",
                      include_external=False):
    from law.task.base import ExternalTask
    from law.workflow.base import BaseWorkflow

    max_depth = int(max_depth)
    print("fetch task output with max_depth {}".format(max_depth))

    target_dir = os.path.normpath(os.path.abspath(target_dir))
    print("target directory is {}".format(target_dir))
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    include_external = flag_to_bool(include_external)
    if include_external:
        print("include external tasks")

    # determine the mode, i.e., all, dry, interactive
    modes = ["i", "a", "d"]
    mode_names = ["interactive", "all", "dry"]
    if mode is None:
        mode = query_choice("fetch mode?",
                            modes,
                            default="i",
                            descriptions=mode_names)
    elif isinstance(mode, int):
        mode = modes[mode]
    else:
        mode = mode[0].lower()
    if mode not in modes:
        raise Exception("unknown fetch mode '{}'".format(mode))
    mode_name = mode_names[modes.index(mode)]
    print("selected " + colored(mode_name + " mode", "blue", style="bright"))

    done = []
    ind = "   "  # indentation unit, defined at module level in the original source
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        offset = depth * ("|" + ind)
        print(offset)

        # when the dep is a workflow, preload its branch map which updates branch parameters
        if isinstance(dep, BaseWorkflow):
            dep.get_branch_map()

        print("{}> fetch output of {}".format(offset, dep.repr(color=True)))
        offset += "|" + ind

        if not include_external and isinstance(dep, ExternalTask):
            print(offset + colored(" task is external", "yellow"))
            continue

        if dep in done:
            print(offset + colored(" outputs already fetched", "yellow"))
            continue

        if mode == "i":
            task_mode = query_choice(offset + " fetch outputs?",
                                     ("y", "n", "a"),
                                     default="y",
                                     descriptions=["yes", "no", "all"])
            if task_mode == "n":
                print(offset + colored(" skipped", "yellow"))
                continue

        done.append(dep)

        # start traversing the output structure with a lookup pattern
        for output, odepth, oprefix, ooffset, lookup in _iter_output(
                dep.output(), offset):
            try:
                stat = output.stat
            except Exception:
                stat = None

            target_line = "{} {}{}".format(ooffset, oprefix,
                                           output.repr(color=True))
            if stat:
                target_line += " ({:.2f} {})".format(
                    *human_bytes(stat.st_size))
            print(target_line)

            if not isinstance(output, TargetCollection) and stat is None:
                print(ooffset + ind + colored(" not existing, skip", "yellow"))
                continue

            is_copyable = callable(getattr(output, "copy_to_local", None))
            if not isinstance(output, TargetCollection) and not is_copyable:
                print(ooffset + ind +
                      colored(" not a file target, skip", "yellow"))
                continue

            if mode == "d":
                print(ooffset + ind + colored(" dry fetched", "yellow"))
                continue

            to_fetch = [output]

            if mode == "i" and task_mode != "a":
                if isinstance(output, TargetCollection):
                    coll_choice = query_choice(
                        ooffset + ind + "fetch?", ("y", "n", "i"),
                        default="y",
                        descriptions=["yes", "no", "interactive"])
                    if coll_choice == "i":
                        lookup[:0] = _flatten_output(output.targets,
                                                     odepth + 1)
                        continue
                    else:
                        target_choice = coll_choice
                    to_fetch = list(output._flat_target_list)
                else:
                    target_choice = query_choice(ooffset + ind + "fetch?",
                                                 ("y", "n"),
                                                 default="y",
                                                 descriptions=["yes", "no"])
                if target_choice == "n":
                    print(ooffset + ind + colored(" skipped", "yellow"))
                    continue

            for outp in to_fetch:
                if not callable(getattr(outp, "copy_to_local", None)):
                    continue

                basename = "{}__{}".format(dep.live_task_id, outp.basename)
                outp.copy_to_local(os.path.join(target_dir, basename))

                print("{}{} {} ({})".format(
                    ooffset, ind, colored("fetched", "green", style="bright"),
                    basename))
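
Both variants of this helper (a second, later revision follows) expose the same entry point; a sketch of a non-interactive invocation, assuming task is a law.Task instance with registered outputs:

# fetch the outputs of `task` and its direct requirements into ./outputs,
# non-interactively ("a" selects the "all" mode defined above)
fetch_task_output(task, max_depth=1, mode="a", target_dir="outputs")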
Example #8
def fetch_task_output(task, max_depth=0, mode=None, target_dir=".", include_external=False):
    from law.task.base import ExternalTask
    from law.workflow.base import BaseWorkflow

    max_depth = int(max_depth)
    print("fetch task output with max_depth {}".format(max_depth))

    target_dir = os.path.normpath(os.path.abspath(target_dir))
    print("target directory is {}".format(target_dir))
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    include_external = check_bool_flag(include_external)
    if include_external:
        print("include external tasks")

    # determine the mode, i.e., all, dry, interactive
    modes = ["i", "a", "d"]
    mode_names = ["interactive", "all", "dry"]
    if mode is None:
        mode = query_choice("fetch mode?", modes, default="i", descriptions=mode_names)
    elif isinstance(mode, int):
        mode = modes[mode]
    else:
        mode = mode[0].lower()
    if mode not in modes:
        raise Exception("unknown fetch mode '{}'".format(mode))
    mode_name = mode_names[modes.index(mode)]
    print("selected " + colored(mode_name + " mode", "blue", style="bright"))

    done = []
    ind = "|   "
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        offset = depth * ind
        print(offset)

        # when the dep is a workflow, preload its branch map which updates branch parameters
        if isinstance(dep, BaseWorkflow):
            dep.get_branch_map()

        print("{}> fetch output of {}".format(offset, dep.repr(color=True)))
        offset += ind

        if not include_external and isinstance(dep, ExternalTask):
            print(offset + "- " + colored("task is external, skip", "yellow"))
            continue

        if dep in done:
            print(offset + "- " + colored("outputs already fetched", "yellow"))
            continue

        if mode == "i":
            task_mode = query_choice(offset + "  walk through outputs?", ("y", "n"),
                default="y")
            if task_mode == "n":
                continue

        done.append(dep)

        outputs = flatten(
            (outp._flat_target_list if isinstance(outp, TargetCollection) else outp)
            for outp in flatten(dep.output())
        )
        for outp in outputs:
            try:
                stat = outp.stat
            except Exception:
                stat = None

            target_line = "{}- {}".format(offset, outp.repr(color=True))
            if stat:
                target_line += " ({:.2f} {})".format(*human_bytes(stat.st_size))
            print(target_line)

            def print_skip(reason):
                text = reason + ", skip"
                print(offset + "  " + colored(text, color="yellow", style="bright"))

            if stat is None:
                print_skip("not existing")
                continue

            if not callable(getattr(outp, "copy_to_local", None)):
                print_skip("not a file target")
                continue

            if mode == "d":
                print("{}  {}".format(offset, colored("dry fetched", "yellow")))
                continue

            elif mode == "i":
                q = offset + "  fetch?"
                if query_choice(q, ("y", "n"), default="y") == "n":
                    print(offset + "  " + colored("skipped", "yellow"))
                    continue

            basename = "{}__{}".format(dep.live_task_id, outp.basename)
            outp.copy_to_local(os.path.join(target_dir, basename))

            print("{}  {} ({})".format(offset, colored("fetched", "green", style="bright"),
                basename))