def allocate(self, size):
    logger.debug("allocating {0[0]:.2f} {0[1]} in cache '{1}'".format(
        human_bytes(size), self))

    with self._lock_global():
        # determine stats and current cache size
        file_stats = []
        for elem in os.listdir(self.base):
            if elem.endswith(self.lock_postfix):
                continue
            cpath = os.path.join(self.base, elem)
            file_stats.append((cpath, os.stat(cpath)))
        current_size = sum(stat.st_size for _, stat in file_stats)

        # get the available space of the disk that contains the cache
        fs_stat = os.statvfs(self.base)
        full_size = fs_stat.f_frsize * fs_stat.f_blocks
        free_size = fs_stat.f_frsize * fs_stat.f_bavail

        # leave 10% total free space
        free_size -= 0.1 * full_size
        full_size *= 0.9

        # make sure max_size is always smaller than what is actually possible
        if self.max_size < 0:
            max_size = current_size + free_size
        else:
            max_size = min(self.max_size, current_size + free_size)

        # determine the size of files that need to be deleted
        delete_size = current_size + size - max_size
        if delete_size <= 0:
            logger.debug("cache space sufficient, {0[0]:.2f} {0[1]} remaining".format(
                human_bytes(-delete_size)))
            return

        logger.debug("need to delete {0[0]:.2f} {0[1]}".format(
            human_bytes(delete_size)))

        # delete files, ordered by their access time, skip locked ones
        for cpath, cstat in sorted(file_stats, key=lambda tpl: tpl[1].st_atime):
            if self._is_locked(cpath):
                continue
            self._remove(cpath)
            delete_size -= cstat.st_size
            if delete_size <= 0:
                break
        else:
            logger.warning("could not allocate remaining {0[0]:.2f} {0[1]}".format(
                human_bytes(delete_size)))
def allocate(self, size):
    logger.debug("allocating {0[0]:.2f} {0[1]} in cache '{1}'".format(
        human_bytes(size), self))

    # determine stats and current cache size
    file_stats = []
    for elem in os.listdir(self.base):
        if elem.endswith(self.lock_postfix):
            continue
        cpath = os.path.join(self.base, elem)
        file_stats.append((cpath, os.stat(cpath)))
    current_size = sum(stat.st_size for _, stat in file_stats)

    # get the available space of the disk that contains the cache in bytes, leave 10%
    fs_stat = os.statvfs(self.base)
    free_size = fs_stat.f_frsize * fs_stat.f_bavail * 0.9

    # determine the maximum size of the cache,
    # making sure it is always smaller than what is available
    if self.max_size <= 0:
        max_size = current_size + free_size
    else:
        max_size = min(self.max_size * 1024**2, current_size + free_size)

    # determine the size of files that need to be deleted
    delete_size = current_size + size - max_size
    if delete_size <= 0:
        logger.debug("cache space sufficient, {0[0]:.2f} {0[1]} remaining".format(
            human_bytes(-delete_size)))
        return True

    logger.info("need to delete {0[0]:.2f} {0[1]} from cache".format(
        human_bytes(delete_size)))

    # delete files, ordered by their access time, skip locked ones
    for cpath, cstat in sorted(file_stats, key=lambda tpl: tpl[1].st_atime):
        if self._is_locked(cpath):
            continue
        self._remove(cpath)
        delete_size -= cstat.st_size
        if delete_size <= 0:
            return True

    logger.warning("could not allocate remaining {0[0]:.2f} {0[1]} in cache".format(
        human_bytes(delete_size)))
    return False
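# Both allocate() variants above implement the same LRU-style eviction: files are removed in
# order of least recent access (st_atime) until the requested size fits under the budget. A
# minimal, self-contained sketch of that core loop follows; evict_lru and its arguments are
# hypothetical and not part of the original code, and base is assumed to contain only regular,
# unlocked files.
import os

def evict_lru(base, needed, budget):
    """Delete least-recently accessed files in *base* until *needed* more bytes fit under *budget*."""
    stats = [(os.path.join(base, name), os.stat(os.path.join(base, name)))
             for name in os.listdir(base)]
    # bytes that must be freed for the new entry to fit
    delete_size = sum(stat.st_size for _, stat in stats) + needed - budget
    for path, stat in sorted(stats, key=lambda tpl: tpl[1].st_atime):
        if delete_size <= 0:
            break
        os.remove(path)
        delete_size -= stat.st_size
    return delete_size <= 0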
def serialize(self, value):
    """
    Serializes *value*, interpreted in units of *self.unit*, into a compact string in that
    same unit, e.g. ``"5MB"``.
    """
    if not value:
        value = 0

    # convert to bytes first, then back to a human-readable value in self.unit
    value_bytes = parse_bytes(value, input_unit=self.unit, unit="bytes")
    v, u = human_bytes(value_bytes, unit=self.unit)
    return "{}{}".format(try_int(v), u)
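# serialize() above round-trips through parse_bytes() and human_bytes() from law.util. A rough,
# standalone sketch of that round trip, assuming base-1024 units; to_bytes and to_human are
# simplified stand-ins for the law.util helpers, not their actual implementations.
units = ["bytes", "kB", "MB", "GB", "TB", "PB", "EB"]

def to_bytes(value, input_unit="bytes"):
    # scale a value given in input_unit down to plain bytes
    return value * 1024 ** units.index(input_unit)

def to_human(n_bytes, unit="MB"):
    # scale a byte count up to the requested unit, as human_bytes(..., unit=...) does above
    return n_bytes / 1024 ** units.index(unit), unit

v, u = to_human(to_bytes(5, input_unit="MB"), unit="MB")
print("{:g}{}".format(v, u))  # -> 5MB, matching the "{}{}".format(try_int(v), u) output above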
def _repr_pairs(self, color=True):
    pairs = super(FileSystemTarget, self)._repr_pairs()

    cfg = Config.instance()

    # add the path
    expand = cfg.get_expanded_boolean("target", "expand_path_repr")
    pairs.append(("path", self.path if expand else self.unexpanded_path))

    # optionally add the file size
    if cfg.get_expanded_boolean("target", "filesize_repr"):
        stat = self.exists(stat=True)
        pairs.append(("size", human_bytes(stat.st_size, fmt="{:.1f}{}") if stat else "-"))

    return pairs
def merge(inputs, output):
    # note: this is a closure; task, force, writer_opts and merge_parquet_files are
    # expected to be defined in the enclosing scope
    with task.publish_step("merging {} parquet files ...".format(len(inputs)), runtime=True):
        # clear the output if necessary
        if output.exists() and force:
            output.remove()

        if len(inputs) == 1:
            output.copy_from_local(inputs[0])
        else:
            # merge
            merge_parquet_files([inp.path for inp in inputs], output.path,
                writer_opts=writer_opts)

    # print the size
    output_size = human_bytes(output.stat().st_size, fmt=True)
    task.publish_message(f"merged file size: {output_size}")
def hadd_task(task, inputs, output, cwd=None, local=False, force=True):
    """
    This function is intended to be used by tasks that are supposed to merge ROOT files, e.g.
    when inheriting from :py:class:`law.contrib.tasks.MergeCascade`. *inputs* should be a
    sequence of local targets that represent the files to merge into *output*. *cwd* is the
    working directory in which hadd is invoked. When empty, a temporary directory is used. The
    *task* itself is used to print and publish messages via its
    :py:meth:`law.Task.publish_message` and :py:meth:`law.Task.publish_step` methods.

    When *local* is *True*, the input and output targets are assumed to be local and the
    merging is based on their local paths. Otherwise, the targets are fetched first and the
    output target is localized. When *force* is *True*, any existing output file is overwritten
    (by adding the ``-f`` flag to ``hadd``).
    """
    # ensure inputs are targets
    inputs = [
        LocalFileTarget(inp) if isinstance(inp, six.string_types) else inp
        for inp in inputs
    ]

    # ensure output is a target
    if isinstance(output, six.string_types):
        output = LocalFileTarget(output)

    # default cwd
    if not cwd:
        cwd = LocalDirectoryTarget(is_tmp=True)
    elif isinstance(cwd, six.string_types):
        cwd = LocalDirectoryTarget(cwd)
    cwd.touch()

    # helper to create the hadd cmd
    def hadd_cmd(input_paths, output_path):
        cmd = ["hadd", "-n", "0"]
        if force:
            cmd.append("-f")
        cmd.extend(["-d", cwd.path])
        cmd.append(output_path)
        cmd.extend(input_paths)
        return quote_cmd(cmd)

    if local:
        # when local, there is no need to download inputs
        input_paths = [inp.path for inp in inputs]

        with task.publish_step("merging ...", runtime=True):
            if len(inputs) == 1:
                output.copy_from_local(inputs[0])
            else:
                # merge using hadd
                cmd = hadd_cmd(input_paths, output.path)
                code = interruptable_popen(cmd, shell=True, executable="/bin/bash")[0]
                if code != 0:
                    raise Exception("hadd failed")

        task.publish_message("merged file size: {}".format(human_bytes(
            output.stat.st_size, fmt=True)))

    else:
        # when not local, we need to fetch files first into the cwd
        with task.publish_step("fetching inputs ...", runtime=True):
            def fetch(inp):
                inp.copy_to_local(cwd.child(inp.unique_basename, type="f"), cache=False)
                return inp.unique_basename

            def callback(i):
                task.publish_message("fetch file {} / {}".format(i + 1, len(inputs)))

            bases = map_verbose(fetch, inputs, every=5, callback=callback)

        # start merging into the localized output
        with output.localize("w", cache=False) as tmp_out:
            with task.publish_step("merging ...", runtime=True):
                if len(bases) == 1:
                    tmp_out.path = cwd.child(bases[0]).path
                else:
                    # merge using hadd
                    cmd = hadd_cmd(bases, tmp_out.path)
                    code = interruptable_popen(cmd, shell=True, executable="/bin/bash",
                        cwd=cwd.path)[0]
                    if code != 0:
                        raise Exception("hadd failed")

            task.publish_message("merged file size: {}".format(human_bytes(
                tmp_out.stat.st_size, fmt=True)))
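# Hedged usage sketch for hadd_task(); the task instance and all paths below are hypothetical.
# With local=True, targets are used via their local paths and hadd is invoked roughly as
# "hadd -n 0 -f -d <cwd> <output> <inputs...>":
#
#     inputs = ["/data/run_0.root", "/data/run_1.root"]  # plain strings become LocalFileTargets
#     output = "/data/merged.root"
#     hadd_task(task, inputs, output, local=True)
#
# With local=False, inputs are first fetched into cwd and the merge writes into a localized
# representation of output, so remote (e.g. grid) targets work as well.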
def fetch_task_output(task, max_depth=0, mode=None, target_dir=".", include_external=False):
    from law.task.base import ExternalTask
    from law.workflow.base import BaseWorkflow

    max_depth = int(max_depth)
    print("fetch task output with max_depth {}".format(max_depth))

    target_dir = os.path.normpath(os.path.abspath(target_dir))
    print("target directory is {}".format(target_dir))
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    include_external = flag_to_bool(include_external)
    if include_external:
        print("include external tasks")

    # determine the mode, i.e., all, dry, interactive
    modes = ["i", "a", "d"]
    mode_names = ["interactive", "all", "dry"]
    if mode is None:
        mode = query_choice("fetch mode?", modes, default="i", descriptions=mode_names)
    elif isinstance(mode, int):
        mode = modes[mode]
    else:
        mode = mode[0].lower()
    if mode not in modes:
        raise Exception("unknown fetch mode '{}'".format(mode))
    mode_name = mode_names[modes.index(mode)]
    print("selected " + colored(mode_name + " mode", "blue", style="bright"))

    done = []
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        offset = depth * ("|" + ind)
        print(offset)

        # when the dep is a workflow, preload its branch map which updates branch parameters
        if isinstance(dep, BaseWorkflow):
            dep.get_branch_map()

        print("{}> fetch output of {}".format(offset, dep.repr(color=True)))
        offset += "|" + ind

        if not include_external and isinstance(dep, ExternalTask):
            print(offset + colored(" task is external", "yellow"))
            continue

        if dep in done:
            print(offset + colored(" outputs already fetched", "yellow"))
            continue

        if mode == "i":
            task_mode = query_choice(offset + " fetch outputs?", ("y", "n", "a"),
                default="y", descriptions=["yes", "no", "all"])
            if task_mode == "n":
                print(offset + colored(" skipped", "yellow"))
                continue

        done.append(dep)

        # start traversing the output structure with a lookup pattern
        for output, odepth, oprefix, ooffset, lookup in _iter_output(dep.output(), offset):
            try:
                stat = output.stat
            except:
                stat = None

            target_line = "{} {}{}".format(ooffset, oprefix, output.repr(color=True))
            if stat:
                target_line += " ({:.2f} {})".format(*human_bytes(stat.st_size))
            print(target_line)

            if not isinstance(output, TargetCollection) and stat is None:
                print(ooffset + ind + colored(" not existing, skip", "yellow"))
                continue

            is_copyable = callable(getattr(output, "copy_to_local", None))
            if not isinstance(output, TargetCollection) and not is_copyable:
                print(ooffset + ind + colored(" not a file target, skip", "yellow"))
                continue

            if mode == "d":
                print(ooffset + ind + colored(" dry fetched", "yellow"))
                continue

            to_fetch = [output]

            if mode == "i" and task_mode != "a":
                if isinstance(output, TargetCollection):
                    coll_choice = query_choice(ooffset + ind + "fetch?", ("y", "n", "i"),
                        default="y", descriptions=["yes", "no", "interactive"])
                    if coll_choice == "i":
                        lookup[:0] = _flatten_output(output.targets, odepth + 1)
                        continue
                    else:
                        target_choice = coll_choice
                    to_fetch = list(output._flat_target_list)
                else:
                    target_choice = query_choice(ooffset + ind + "fetch?", ("y", "n"),
                        default="y", descriptions=["yes", "no"])
                if target_choice == "n":
                    print(ooffset + ind + colored(" skipped", "yellow"))
                    continue

            for outp in to_fetch:
                if not callable(getattr(outp, "copy_to_local", None)):
                    continue
                basename = "{}__{}".format(dep.live_task_id, outp.basename)
                outp.copy_to_local(os.path.join(target_dir, basename))
                print("{}{} {} ({})".format(ooffset, ind,
                    colored("fetched", "green", style="bright"), basename))
def fetch_task_output(task, max_depth=0, mode=None, target_dir=".", include_external=False):
    from law.task.base import ExternalTask
    from law.workflow.base import BaseWorkflow

    max_depth = int(max_depth)
    print("fetch task output with max_depth {}".format(max_depth))

    target_dir = os.path.normpath(os.path.abspath(target_dir))
    print("target directory is {}".format(target_dir))
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    include_external = check_bool_flag(include_external)
    if include_external:
        print("include external tasks")

    # determine the mode, i.e., all, dry, interactive
    modes = ["i", "a", "d"]
    mode_names = ["interactive", "all", "dry"]
    if mode is None:
        mode = query_choice("fetch mode?", modes, default="i", descriptions=mode_names)
    elif isinstance(mode, int):
        mode = modes[mode]
    else:
        mode = mode[0].lower()
    if mode not in modes:
        raise Exception("unknown fetch mode '{}'".format(mode))
    mode_name = mode_names[modes.index(mode)]
    print("selected " + colored(mode_name + " mode", "blue", style="bright"))

    done = []
    ind = "|   "
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        offset = depth * ind
        print(offset)

        # when the dep is a workflow, preload its branch map which updates branch parameters
        if isinstance(dep, BaseWorkflow):
            dep.get_branch_map()

        print("{}> fetch output of {}".format(offset, dep.repr(color=True)))
        offset += ind

        if not include_external and isinstance(dep, ExternalTask):
            print(offset + "- " + colored("task is external, skip", "yellow"))
            continue

        if dep in done:
            print(offset + "- " + colored("outputs already fetched", "yellow"))
            continue

        if mode == "i":
            task_mode = query_choice(offset + " walk through outputs?", ("y", "n"), default="y")
            if task_mode == "n":
                continue

        done.append(dep)

        # collect all outputs, flattening target collections
        outputs = flatten(
            (outp._flat_target_list if isinstance(outp, TargetCollection) else outp)
            for outp in flatten(dep.output())
        )
        for outp in outputs:
            try:
                stat = outp.stat
            except:
                stat = None

            target_line = "{}- {}".format(offset, outp.repr(color=True))
            if stat:
                target_line += " ({:.2f} {})".format(*human_bytes(stat.st_size))
            print(target_line)

            def print_skip(reason):
                text = reason + ", skip"
                print(offset + " " + colored(text, color="yellow", style="bright"))

            if stat is None:
                print_skip("not existing")
                continue

            if not callable(getattr(outp, "copy_to_local", None)):
                print_skip("not a file target")
                continue

            if mode == "d":
                print("{} {}".format(offset, colored("dry fetched", "yellow")))
                continue
            elif mode == "i":
                q = offset + " fetch?"
                if query_choice(q, ("y", "n"), default="y") == "n":
                    print(offset + " " + colored("skipped", "yellow"))
                    continue

            basename = "{}__{}".format(dep.live_task_id, outp.basename)
            outp.copy_to_local(os.path.join(target_dir, basename))
            print("{} {} ({})".format(offset,
                colored("fetched", "green", style="bright"), basename))
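# fetch_task_output() backs law's interactive "--fetch-output" parameter. Assuming it follows
# the same CSV-style argument convention as law's other interactive parameters (an assumption,
# and MyTask is a hypothetical task name), an invocation could look like:
#
#     law run MyTask --fetch-output 1,i
#
# where the first value maps to max_depth and the second to mode (i/a/d).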