def prepare_stageout(self, tmp_dir):
    """
    Prepares the stage-out of task outputs into a directory below *tmp_dir*.

    Returns *None* when the task defines no stage-out mask or when no outputs remain after
    applying the mask. Otherwise, a :py:class:`StageInfo` is returned that holds the masked
    outputs, the created stage-out directory and a lookup from each output to its counterpart
    as seen with the sandbox environment.
    """
    mask = self.task.sandbox_stageout()
    if not mask:
        return None

    # evaluate the outputs twice: once as is, once with os.environ patched to the task env
    host_outputs = self.task.output()
    with patch_object(os, "environ", self.task.env, lock=True):
        inner_outputs = self.task.output()

    # restrict both structures to the masked part
    host_outputs = mask_struct(mask, host_outputs)
    inner_outputs = mask_struct(mask, inner_outputs)
    if not host_outputs:
        return None

    # create the stage-out directory, named after the sandbox config
    config = Config.instance()
    dir_name = config.get_expanded(self.sandbox_inst.get_config_section(), "stageout_dir")
    stageout_dir = tmp_dir.child(dir_name, type="d")
    stageout_dir.touch()

    # lookup: output -> sandbox output
    lookup = dict(zip(flatten(host_outputs), flatten(inner_outputs)))

    return StageInfo(host_outputs, stageout_dir, lookup)
def stagein(self, tmp_dir):
    """
    Copies the masked inputs of the task into a stage-in directory below *tmp_dir*.

    Returns *None* when no stage-in directory name is configured, when the task defines no
    stage-in mask, or when no inputs remain after applying the mask. Otherwise, a
    :py:class:`StageInfo` is returned that holds the masked inputs, the stage-in directory and
    the structure of staged targets.
    """
    # check if the stage-in dir is set
    cfg = Config.instance()
    section = self.sandbox_inst.get_config_section()
    stagein_dir_name = cfg.get_expanded(section, "stagein_dir_name")
    if not stagein_dir_name:
        return None

    # get the sandbox stage-in mask
    stagein_mask = self.task.sandbox_stagein()
    if not stagein_mask:
        return None

    # determine inputs as seen from outside and within the sandbox
    inputs = self.task.input()
    with patch_object(os, "environ", self.task.env, lock=True):
        sandbox_inputs = self.task.input()

    # apply the mask to both structs
    inputs = mask_struct(stagein_mask, inputs)
    sandbox_inputs = mask_struct(stagein_mask, sandbox_inputs)
    if not inputs:
        return None

    # create a lookup for input -> sandbox input
    sandbox_targets = dict(zip(flatten(inputs), flatten(sandbox_inputs)))

    # create the stage-in directory
    stagein_dir = tmp_dir.child(stagein_dir_name, type="d")
    stagein_dir.touch()

    def stagein_target(target):
        # copy a single input to its staged location and return the staged target
        sandbox_target = sandbox_targets[target]
        staged_target = make_staged_target(stagein_dir, sandbox_target)
        logger.debug("stage-in {} to {}".format(target.path, staged_target.path))
        target.copy_to_local(staged_target)
        return staged_target

    def map_collection(func, collection, **kwargs):
        # the mapped structure must be returned, otherwise collections would show up as None
        # in the staged inputs (assumes map_struct uses the return value of custom mappings)
        return map_struct(func, collection.targets, **kwargs)

    # create the structure of staged inputs
    staged_inputs = map_struct(
        stagein_target, inputs, custom_mappings={TargetCollection: map_collection})

    logger.info("staged-in {} file(s)".format(len(stagein_dir.listdir())))

    return StageInfo(inputs, stagein_dir, staged_inputs)
def cmd(self, proxy_cmd):
    """
    Builds and returns the quoted bash command that sources the sandbox script, runs the
    environment setup commands and finally executes *proxy_cmd*.

    When local scheduling is forced, ``--local-scheduler True`` is set on *proxy_cmd*.
    """
    env = self._get_env()

    # expose the staging directories to the sandboxed process
    if self.stagein_info:
        env["LAW_SANDBOX_STAGEIN_DIR"] = self.stagein_info.stage_dir.path
    if self.stageout_info:
        env["LAW_SANDBOX_STAGEOUT_DIR"] = self.stageout_info.stage_dir.path

    bash_cmd = self._bash_cmd()
    setup_cmds = self._build_setup_cmds(env)

    # enforce local scheduling within the sandbox if requested
    if self.force_local_scheduler():
        proxy_cmd.add_arg("--local-scheduler", "True", overwrite=True)

    # chain all commands into a single string that is passed to "bash -c"
    source_cmd = "source \"{}\" \"\"".format(self.script)
    chained = "; ".join(flatten(source_cmd, setup_cmds, proxy_cmd.build()))

    return quote_cmd(bash_cmd + ["-c", chained])
def _flatten_output(output, depth): if isinstance(output, (list, tuple, set)) or is_lazy_iterable(output): return [(outp, depth, "{}: ".format(i)) for i, outp in enumerate(output)] elif isinstance(output, dict): return [(outp, depth, "{}: ".format(k)) for k, outp in six.iteritems(output)] else: return [(outp, depth, "") for outp in flatten(output)]
def print_task_status(task, max_depth=0, target_depth=0, flags=None):
    """
    Prints the status of the outputs of *task* and of its dependencies down to *max_depth*.

    *target_depth* and *flags* (a string of dash-separated, case-insensitive flags) are
    forwarded to the ``status_text`` method of each output. Outputs of a dependency that was
    already visited are not checked again.
    """
    max_depth = int(max_depth)
    target_depth = int(target_depth)
    if flags:
        flags = tuple(flags.lower().split("-"))

    print("print task status with max_depth {} and target_depth {}".format(
        max_depth, target_depth))

    indent = "| "
    checked = []

    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        prefix = indent * depth
        print(prefix)
        print("{}> check status of {}".format(prefix, dep.repr(color=True)))
        prefix += indent

        if dep in checked:
            print(prefix + "- " + colored("outputs already checked", "yellow"))
            continue
        checked.append(dep)

        for outp in flatten(dep.output()):
            print("{}- {}".format(prefix, outp.repr(color=True)))

            # continuation lines of a multi-line status are aligned below the first one
            text = outp.status_text(max_depth=target_depth, flags=flags, color=True)
            text = ("\n" + prefix + " ").join(text.split("\n"))
            print("{} -> {}".format(prefix, text))
def load(*packages):
    """
    Imports the contrib *packages* and attaches each of them as an attribute to the law module.
    Example:

    .. code-block:: python

        import law
        law.contrib.load("docker")

        law.docker.DockerSandbox(...)

    Packages that were already loaded are skipped. An exception is raised when a package does
    not exist or when the law module already has a truthy attribute with the same name.
    """
    for pkg in flatten(packages):
        # load each package only once
        if pkg in loaded_packages:
            logger.debug("skip contrib package '{}', already loaded".format(pkg))
            continue

        if not os.path.exists(law_src_path("contrib", pkg, "__init__.py")):
            raise Exception("contrib package '{}' does not exist".format(pkg))

        if getattr(law, pkg, None):
            raise Exception(
                "cannot load contrib package '{}', attribute with that name already "
                "exists in the law module".format(pkg))

        module_name = "law.contrib.{}".format(pkg)
        mod = __import__(module_name, globals(), locals(), [pkg])

        # register the module
        setattr(law, pkg, mod)
        law.__all__.append(pkg)
        loaded_packages.append(pkg)

        logger.debug("loaded contrib package '{}'".format(pkg))
def cmd(self, proxy_cmd):
    """
    Builds and returns the quoted ``bash -l -c`` command that sources the sandbox script, runs
    the environment setup commands and finally executes *proxy_cmd*, a list of command parts.

    When local scheduling is forced, ``--local-scheduler`` is appended to *proxy_cmd* unless
    it is already contained.
    """
    env = self._get_env()

    # expose the staging directories to the sandboxed process
    if self.stagein_info:
        env["LAW_SANDBOX_STAGEIN_DIR"] = self.stagein_info.stage_dir.path
    if self.stageout_info:
        env["LAW_SANDBOX_STAGEOUT_DIR"] = self.stageout_info.stage_dir.path

    setup_cmds = self._build_setup_cmds(env)

    # enforce local scheduling within the sandbox if requested
    ls_flag = "--local-scheduler"
    if self.force_local_scheduler() and ls_flag not in proxy_cmd:
        proxy_cmd.append(ls_flag)

    # chain all commands into a single string
    source_cmd = "source \"{}\"".format(self.script)
    chained = "; ".join(flatten(source_cmd, setup_cmds, " ".join(proxy_cmd)))

    return quote_cmd(["bash", "-l", "-c", chained])
def complete(self):
    """
    Returns *True* when all non-optional outputs exist. When there is no non-optional output
    at all, a warning is logged and *False* is returned.
    """
    required = [t for t in flatten(self.output()) if not t.optional]

    if not required:
        logger.warning(
            "task {!r} has either no non-optional outputs or no custom complete() "
            "method".format(self))
        return False

    # stop at the first missing output
    for target in required:
        if not target.exists():
            return False
    return True
def complete(self):
    """
    Returns *True* when all non-optional outputs exist. When there is no non-optional output
    at all, a warning is issued via :py:func:`warnings.warn` and *False* is returned.
    """
    required = [t for t in flatten(self.output()) if not t.optional]

    if not required:
        msg = "task {!r} has either no non-optional outputs or no custom complete() method"
        # stacklevel 2 attributes the warning to the caller of complete()
        warnings.warn(msg.format(self), stacklevel=2)
        return False

    # stop at the first missing output
    for target in required:
        if not target.exists():
            return False
    return True
def flatten_collections(*targets):
    """
    Flattens *targets* and replaces every :py:class:`TargetCollection` among them by the
    targets of its flat target list, recursively. The order of targets is preserved.

    Returns a flat list that contains no :py:class:`TargetCollection` instances.
    """
    # use a reversed stack instead of popping from / inserting at the front of a list, which
    # would be linear per operation
    stack = list(flatten(targets))
    stack.reverse()

    flat = []
    while stack:
        t = stack.pop()
        if isinstance(t, TargetCollection):
            # push in reverse so that the first target of the collection is handled next
            stack.extend(reversed(list(t._flat_target_list)))
        else:
            flat.append(t)

    return flat
def __init__(self, targets, threshold=1.0, **kwargs):
    """
    Stores *targets* and *threshold*. *targets* must be a list, tuple or dict; a generator is
    accepted as well and converted to a list. A *TypeError* is raised for any other type. All
    *kwargs* are forwarded to the :py:class:`Target` constructor.
    """
    if isinstance(targets, types.GeneratorType):
        targets = list(targets)
    elif not isinstance(targets, (list, tuple, dict)):
        raise TypeError("invalid targets, must be of type: list, tuple, dict")

    Target.__init__(self, **kwargs)

    self.targets = targets
    self.threshold = threshold

    # flat targets per element, stored in a container of the same type as the input structure
    if isinstance(targets, dict):
        flat_items = ((key, flatten(value)) for key, value in six.iteritems(targets))
    else:
        flat_items = (flatten(value) for value in targets)
    self._flat_targets = targets.__class__(flat_items)

    # an entirely flat list of targets for simplified iterations
    self._flat_target_list = flatten(targets)
def print_task_output(task, max_depth=0):
    """
    Prints the uris of all outputs of *task* and of its dependencies down to *max_depth*,
    one uri per line. Outputs of dependencies that occur multiple times are printed each time.
    """
    max_depth = int(max_depth)

    print("print task output with max_depth {}\n".format(max_depth))

    # only the dependency itself is needed from the walked tuples
    for dep, _, _ in task.walk_deps(max_depth=max_depth, order="pre"):
        for outp in flatten(dep.output()):
            for uri in make_list(outp.uri()):
                print(uri)
def env(self):
    """
    Returns the environment variables as seen within a container of the docker image.

    The environment is determined once per image and cached: a temporary file is mounted into
    a container, python dumps ``os.environ`` into it via pickle, and the file is loaded again
    on the host. An exception containing the command output is raised when the command fails.
    """
    if self.image in self._envs:
        return self._envs[self.image]

    # temporary file on the host and the path it is mounted to in the container
    tmp = LocalFileTarget(is_tmp=".env")
    tmp.touch()
    env_file = os.path.join("/tmp", tmp.unique_basename)

    # docker run command with the env file mounted
    docker_run_cmd = self._docker_run_cmd()
    docker_run_cmd.extend(["-v", "{}:{}".format(tmp.path, env_file)])

    setup_cmds = self._build_setup_cmds(self._get_env())

    # python command that dumps the environment
    py_cmd = "import os,pickle;" \
        + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(env_file)

    chained = "; ".join(flatten(setup_cmds, quote_cmd(["python", "-c", py_cmd])))
    cmd = quote_cmd(docker_run_cmd + [self.image, "bash", "-l", "-c", chained])

    # run it, stderr is merged into stdout for the error message
    code, out, _ = interruptable_popen(cmd, shell=True, executable="/bin/bash",
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    if code != 0:
        raise Exception("docker sandbox env loading failed:\n{}".format(out))

    # load and cache
    env = tmp.load(formatter="pickle")
    self._envs[self.image] = env

    return self._envs[self.image]
def print_task_output(task, max_depth=0, scheme=True):
    """
    Prints the uris of all outputs of *task* and of its dependencies down to *max_depth*,
    one uri per line. *scheme* is converted to a bool and forwarded to the ``uri`` method of
    outputs that are file system targets or file collections.
    """
    max_depth = int(max_depth)
    scheme = flag_to_bool(scheme)

    print("print task output with max_depth {}, {} schemes\n".format(
        max_depth, "showing" if scheme else "hiding"))

    # only the dependency itself is needed from the walked tuples
    for dep, _, _ in task.walk_deps(max_depth=max_depth, order="pre"):
        for outp in flatten(dep.output()):
            # only pass the scheme option to targets that are known to support it
            kwargs = {}
            if isinstance(outp, (FileSystemTarget, FileCollection)):
                kwargs = {"scheme": scheme}

            for uri in make_list(outp.uri(**kwargs)):
                print(uri)
def __init__(self, targets, threshold=1.0, **kwargs):
    """
    Stores *targets* and *threshold*. *targets* must be a list, tuple or dict, otherwise a
    *TypeError* is raised. All *kwargs* are forwarded to the super class constructor.
    """
    if not isinstance(targets, (list, tuple, dict)):
        raise TypeError("invalid targets, must be of type: list, tuple, dict")

    super(TargetCollection, self).__init__(**kwargs)

    self.targets = targets
    self.threshold = threshold

    def flat(value):
        # nested collections contribute their own flat targets
        if isinstance(value, TargetCollection):
            value = value.flat_targets
        return flatten(value)

    # flat targets per element, stored in a container of the same type as the input structure
    if isinstance(targets, dict):
        flat_items = ((key, flat(value)) for key, value in targets.items())
    else:
        flat_items = (flat(value) for value in targets)
    self.flat_targets = targets.__class__(flat_items)
def env(self):
    """
    Returns the environment variables as seen after sourcing the sandbox script.

    The environment is determined once per script and cached: in a bash subprocess, the script
    is sourced, the setup commands are run and python dumps ``os.environ`` into a temporary
    file via pickle, which is then loaded into an ordered dict. An exception is raised when
    the subprocess fails.
    """
    script = self.script
    if script in self._envs:
        return self._envs[script]

    with tmp_file() as tmp:
        tmp_path = os.path.realpath(tmp[1])

        bash_cmd = self._bash_cmd()
        setup_cmds = self._build_setup_cmds(self._get_env())

        # python command that dumps the environment
        py_cmd = "import os,pickle;" \
            + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(tmp_path)

        # full command
        source_cmd = "source \"{}\" \"\"".format(self.script)
        dump_cmd = quote_cmd(["python", "-c", py_cmd])
        chained = "; ".join(flatten(source_cmd, setup_cmds, dump_cmd))
        cmd = quote_cmd(bash_cmd + ["-c", chained])

        # run it
        returncode = interruptable_popen(cmd, shell=True, executable="/bin/bash")[0]
        if returncode != 0:
            raise Exception("bash sandbox env loading failed")

        # load the environment while the temporary file is still in place
        pickle_kwargs = {"encoding": "utf-8"} if six.PY3 else {}
        with open(tmp_path, "rb") as f:
            env = collections.OrderedDict(six.moves.cPickle.load(f, **pickle_kwargs))

    # cache it
    self._envs[script] = env

    return self._envs[script]
def env(self):
    """
    Returns the environment variables as seen within a container of the docker image.

    The environment is determined once per image and cached: a temporary file is mounted into
    a container, python dumps ``os.environ`` into it via pickle (protocol 2), and the file is
    loaded again on the host. An exception is raised when the docker command fails.
    """
    # strategy: create a tempfile, forward it to a container, let python dump its full env,
    # close the container and load the env file
    if self.image not in self._envs:
        with tmp_file() as tmp:
            tmp_path = os.path.realpath(tmp[1])
            env_path = os.path.join("/tmp", str(hash(tmp_path))[-8:])

            # build commands to setup the environment
            setup_cmds = self._build_setup_cmds(self._get_env())

            # arguments to configure the environment
            args = ["-v", "{}:{}".format(tmp_path, env_path)] + self.common_args()

            # build the command
            py_cmd = "import os,pickle;" \
                + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(env_path)
            cmd = quote_cmd(["docker", "run"] + args + [
                self.image,
                "bash", "-l", "-c",
                "; ".join(flatten(setup_cmds, quote_cmd(["python", "-c", py_cmd]))),
            ])

            # run it
            returncode = interruptable_popen(cmd, shell=True, executable="/bin/bash")[0]
            if returncode != 0:
                raise Exception("docker sandbox env loading failed")

            # load the environment from the tmp file
            # the dump might stem from python 2 in the container, in which case python 3 would
            # decode byte strings as ascii by default and fail on non-ascii values
            pickle_kwargs = {"encoding": "utf-8"} if six.PY3 else {}
            with open(tmp_path, "rb") as f:
                env = six.moves.cPickle.load(f, **pickle_kwargs)

        # cache
        self._envs[self.image] = env

    return self._envs[self.image]
def walk_deps(self, max_depth=-1, order="level"):
    """
    Generator that walks through the tree of required tasks, starting at this task, and yields
    3-tuples ``(task, dependencies, depth)``. *order* is either ``"level"`` (level-order) or
    ``"pre"`` (pre-order), a *ValueError* is raised otherwise. A non-negative *max_depth*
    limits the depth of yielded tasks.
    """
    # see https://en.wikipedia.org/wiki/Tree_traversal
    if order not in ("level", "pre"):
        raise ValueError("unknown traversal order '{}', use 'level' or 'pre'".format(order))

    queue = [(self, 0)]
    while queue:
        current, level = queue.pop(0)
        if 0 <= max_depth < level:
            continue

        children = flatten(current.requires())

        yield (current, children, level)

        # evaluated after the yield, so the list of children is read when iteration resumes
        pending = [(child, level + 1) for child in children]
        if order == "level":
            # handle children after everything that is already queued
            queue.extend(pending)
        else:
            # pre-order: handle children before everything that is already queued
            queue[:0] = pending
def print_task_output(task, max_depth=0):
    """
    Prints the uris of all outputs of *task* and of its dependencies down to *max_depth*.
    Target collections are expanded into their flat list of targets. For targets that are not
    file system targets, a warning is logged instead.
    """
    max_depth = int(max_depth)

    print("print task output with max_depth {}\n".format(max_depth))

    def print_target(target):
        if isinstance(target, FileSystemTarget):
            print(target.uri())
        else:
            logger.warning(
                "target listing not yet implemented for {}".format(target.__class__))

    # only the dependency itself is needed from the walked tuples
    for dep, _, _ in task.walk_deps(max_depth=max_depth, order="pre"):
        for outp in flatten(dep.output()):
            if isinstance(outp, TargetCollection):
                for t in outp._flat_target_list:
                    print_target(t)
            else:
                print_target(outp)
def print_task_status(task, max_depth=0, target_depth=0, flags=None):
    """
    Prints the status of the outputs of *task* and of its dependencies down to *max_depth*.

    *target_depth* and *flags* (a string of dash-separated, case-insensitive flags) are
    forwarded to the ``status_text`` method of each output. Outputs of a dependency that was
    already visited are not checked again.
    """
    from law.workflow.base import BaseWorkflow

    max_depth = int(max_depth)
    target_depth = int(target_depth)
    if flags:
        flags = tuple(flags.lower().split("-"))

    print("print task status with max_depth {} and target_depth {}".format(
        max_depth, target_depth))

    indent = "| "
    checked = []

    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        prefix = indent * depth
        print(prefix)

        # when the dep is a workflow, preload its branch map which updates branch parameters
        if isinstance(dep, BaseWorkflow):
            dep.get_branch_map()

        print("{}> check status of {}".format(prefix, dep.repr(color=True)))
        prefix += indent

        if dep in checked:
            print(prefix + "- " + colored("outputs already checked", "yellow"))
            continue
        checked.append(dep)

        for outp in flatten(dep.output()):
            print("{}- {}".format(prefix, outp.repr(color=True)))

            # continuation lines of a multi-line status are aligned below the first one
            text = outp.status_text(max_depth=target_depth, flags=flags, color=True)
            text = "\n{} ".format(prefix).join(text.split("\n"))
            print("{} {}".format(prefix, text))
def output(self):
    """
    Returns the output of this node in the merge tree.

    In forest mode, the full merge output is returned. Otherwise, the output of the current
    tree is selected when the merge output is a list, tuple or target collection. The root
    node returns that output as is, while all other nodes return intermediate outputs in the
    same structure, with basenames created from ``node_format``. An exception is raised when
    the directory for intermediate outputs cannot be determined.
    """
    output = self.merge_output()

    if self.is_forest():
        return output

    # pick the output of the current tree
    if isinstance(output, (list, tuple, TargetCollection)):
        output = output[self.tree_index]

    if self.is_root():
        return output

    # directory in which intermediate outputs are stored
    if isinstance(output, SiblingFileCollection):
        intermediate_dir = output.dir
    else:
        first_output = flatten(output)[0]
        if not isinstance(first_output, FileSystemTarget):
            raise Exception(
                "cannot determine directory for intermediate merged outputs from "
                "'{}'".format(output))
        intermediate_dir = first_output.parent

    def get_intermediate_output(leaf_output):
        # derive the intermediate basename from that of the leaf output
        name, ext = os.path.splitext(leaf_output.basename)
        basename = self.node_format.format(
            name=name,
            ext=ext,
            tree=self.tree_index,
            branch=self.branch,
            depth=self.tree_depth,
        )
        return intermediate_dir.child(basename, type="f")

    # same structure as the selected output
    if isinstance(output, TargetCollection):
        return output.map(get_intermediate_output)
    return map_struct(get_intermediate_output, output)
def load(*packages):
    """
    Imports the contrib *packages* and registers the members listed in their ``__all__`` on
    the law main module. Example:

    .. code-block:: python

        import law
        law.contrib.load("numpy")

        print(law.NumpyFormatter)
        # -> <class 'law.contrib.numpy.formatter.NumpyFormatter'>

    Packages that were already loaded are skipped. Members whose name already exists on the
    law module are not overwritten.
    """
    for pkg in flatten(packages):
        # load each package only once
        if pkg in loaded_packages:
            logger.debug("skip contrib package '{}', already loaded".format(pkg))
            continue
        loaded_packages.append(pkg)

        module_name = "law.contrib.{}".format(pkg)
        mod = __import__(module_name, globals(), locals(), [pkg])
        logger.debug("loaded contrib package '{}'".format(pkg))

        for attr in mod.__all__:
            # never overwrite existing attributes
            if hasattr(law, attr):
                logger.info(
                    "cannot register 'law.contrib.{0}.{1}' to 'law.{1}', "
                    "already exists".format(pkg, attr))
                continue

            setattr(law, attr, getattr(mod, attr))
            law.__all__.append(attr)
            logger.debug("registered 'law.contrib.{0}.{1}' to 'law.{1}'".format(pkg, attr))
def load(*packages):
    """
    Imports the contrib *packages* and attaches each of them as an attribute to the law
    module, so that ``contrib`` is not needed in imports or when accessing members. Example:

    .. code-block:: python

        import law
        law.contrib.load("slack")

        print(law.slack.NotifySlackParameter)  # instead of law.contrib.slack.NotifySlackParameter
        # -> <class '...'>

    Packages that were already loaded are skipped. An exception is raised when a package does
    not exist or when the law module already has a truthy attribute with the same name.
    """
    # the contrib mechanism used to add all members of the module to the main law namespace
    # but given the growing number of contrib packages, the chance of collisions is not
    # negligible any longer, so for the moment add dummy objects only for callables to the law
    # module that, when used, raise verbose exceptions
    # (to be removed for v0.1)
    def dummy_factory(pkg, attr, member):
        def _raise():
            raise AttributeError(
                "due to a change in 'law.contrib.load()', the attribute '{0}' "
                "is no longer accessible on the global 'law' namespace, please use "
                "'law.{1}.{0}' instead".format(attr, pkg))

        if isinstance(member, types.FunctionType):
            def dummy(*args, **kwargs):
                """
                Dummy function throwing an *AttributeError* when called.
                """
                _raise()
        else:
            class dummy(member):
                """
                Dummy class throwing an *AttributeError* when instantiated.
                """

                exclude_index = True
                name = str(uuid.uuid4())

                def __new__(cls, *args, **kwargs):
                    _raise()

        return dummy

    for pkg in flatten(packages):
        # load each package only once
        if pkg in loaded_packages:
            logger.debug("skip contrib package '{}', already loaded".format(pkg))
            continue

        if not os.path.exists(law_src_path("contrib", pkg, "__init__.py")):
            raise Exception("contrib package '{}' does not exist".format(pkg))

        if getattr(law, pkg, None):
            raise Exception(
                "cannot load contrib package '{}', attribute with that name already "
                "exists on the law module".format(pkg))

        module_name = "law.contrib.{}".format(pkg)
        mod = __import__(module_name, globals(), locals(), [pkg])

        # register the module
        setattr(law, pkg, mod)
        law.__all__.append(pkg)
        loaded_packages.append(pkg)

        logger.debug("loaded contrib package '{}'".format(pkg))

        # add dummy objects for callable members
        for attr in mod.__all__:
            member = getattr(mod, attr)
            if not callable(member):
                logger.debug("skip creating dummy object for attribute {} of package {}".format(
                    attr, pkg))
                continue
            setattr(law, attr, dummy_factory(pkg, attr, member))
def complete(self):
    """
    In forest mode, returns *True* when all required tasks are complete. Otherwise, the
    completeness check of the super class is used.
    """
    if not self.is_forest():
        return super(CascadeMerge, self).complete()

    return all(task.complete() for task in flatten(self.requires()))
def cmd(self, proxy_cmd):
    """
    Builds and returns the quoted ``docker run`` command that executes *proxy_cmd*, a list of
    command parts, within a container of the image.

    The command mounts staging directories, the python sources of law and its dependencies,
    the law and luigi config files and all configured volumes, and runs the environment setup
    commands before the actual command. Source paths of mounts that do not exist yet are
    created as directories. Depending on the scheduler settings, *proxy_cmd* is extended by
    ``--local-scheduler`` or ``--scheduler-host``.
    """
    cfg = Config.instance()

    # docker run command arguments, starting with those configured on the task
    run_args = []
    args_getter = getattr(self.task, "docker_args", None)
    if callable(args_getter):
        run_args += make_list(args_getter())
    else:
        run_args += make_list(self.default_docker_args)

    # container name
    container_name = "{}_{}".format(self.task.task_id, str(uuid.uuid4())[:8])
    run_args.extend(["--name", container_name])

    # container hostname
    run_args.extend(["-h", "{}".format(socket.gethostname())])

    # directories within the container
    section = self.get_config_section()
    forward_dir = cfg.get_expanded(section, "forward_dir")
    python_dir = cfg.get_expanded(section, "python_dir")
    bin_dir = cfg.get_expanded(section, "bin_dir")
    stagein_dir = cfg.get_expanded(section, "stagein_dir")
    stageout_dir = cfg.get_expanded(section, "stageout_dir")

    def dst(*parts):
        # path of a forwarded object within the container
        return os.path.join(forward_dir, *(str(part) for part in parts))

    mounted_srcs = []

    def mount(*vol):
        # the same source is never mounted twice
        src = vol[0]
        if src in mounted_srcs:
            return
        mounted_srcs.append(src)

        # ensure that source directories exist
        if not os.path.isfile(src) and not os.path.exists(src):
            os.makedirs(src)

        # store the mount point
        run_args.extend(["-v", ":".join(vol)])

    # environment variables to set
    env = self._get_env()

    # staging directories
    if self.stagein_info:
        env["LAW_SANDBOX_STAGEIN_DIR"] = dst(stagein_dir)
        mount(self.stagein_info.stage_dir.path, dst(stagein_dir))
    if self.stageout_info:
        env["LAW_SANDBOX_STAGEOUT_DIR"] = dst(stageout_dir)
        mount(self.stageout_info.stage_dir.path, dst(stageout_dir))

    # prevent python from writing byte code files
    env["PYTHONDONTWRITEBYTECODE"] = "1"

    # path variables
    env["PATH"] = os.pathsep.join(["$PATH", dst("bin")])
    env["PYTHONPATH"] = os.pathsep.join(["$PYTHONPATH", dst(python_dir)])

    # forward python sources of law and its dependencies, packages as whole directories and
    # plain modules as single files
    for mod in law_deps:
        mod_dir = os.path.dirname(mod.__file__)
        mod_name = os.path.splitext(os.path.basename(mod.__file__))[0]
        if mod_name == "__init__":
            vsrc = mod_dir
            vdst = dst(python_dir, os.path.basename(mod_dir))
        else:
            vsrc = os.path.join(mod_dir, mod_name + ".py")
            vdst = dst(python_dir, mod_name + ".py")
        mount(vsrc, vdst)

    # the law cli dir contains a law executable
    env["PATH"] = os.pathsep.join([env["PATH"], dst(python_dir, "law", "cli")])

    # forward the law config file
    if cfg.config_file:
        mount(cfg.config_file, dst("law.cfg"))
        env["LAW_CONFIG_FILE"] = dst("law.cfg")

    # forward the first existing luigi config file, checked in reverse order
    for p in luigi.configuration.LuigiConfigParser._config_paths[::-1]:
        if os.path.exists(p):
            mount(p, dst("luigi.cfg"))
            env["LUIGI_CONFIG_PATH"] = dst("luigi.cfg")
            break

    # forward volumes defined in the config and by the task
    for hdir, cdir in six.iteritems(self._get_volumes()):
        if not cdir:
            mount(hdir)
            continue
        cdir = cdir.replace("${PY}", dst(python_dir)).replace("${BIN}", dst(bin_dir))
        mount(hdir, cdir)

    # arguments needed for both env loading and executing the job
    run_args.extend(self.common_args())

    # commands to setup the environment
    setup_cmds = self._build_setup_cmds(env)

    # scheduling within the container
    ls_flag = "--local-scheduler"
    if self.force_local_scheduler() and ls_flag not in proxy_cmd:
        proxy_cmd.append(ls_flag)
    if ls_flag not in proxy_cmd:
        # when the scheduler runs on the host system, the container uses the host network and
        # the scheduler host is set as seen by the container
        if self.scheduler_on_host():
            run_args.extend(["--network", "host"])
            proxy_cmd.extend(["--scheduler-host", "{}".format(self.get_host_ip())])

    # the final command
    chained = "; ".join(flatten(setup_cmds, " ".join(proxy_cmd)))
    return quote_cmd(["docker", "run"] + run_args + [self.image, "bash", "-l", "-c", chained])
def cleanup_batch(self, job_ids, threads=None, chunk_size=None, callback=None, **kwargs):
    """
    Cleans up a batch of jobs given by *job_ids* via a thread pool of size *threads* which
    defaults to its instance attribute. When chunking is enabled, i.e., when
    :py:attr:`chunk_size_cleanup` is set and *chunk_size*, which defaults to that attribute,
    is positive, *job_ids* are split into chunks of that size which are passed to
    :py:meth:`cleanup`. When *callback* is set, it is invoked after each successful job (or
    job chunk) cleaning with the index of the corresponding job id (starting at 0) and either
    *None* or an exception if any occurred. All other *kwargs* are passed to
    :py:meth:`cleanup`.

    Exceptions that occurred during job cleaning are stored in a list and returned. An empty
    list means that no exceptions occurred.
    """
    # default arguments
    threads = max(threads or self.threads or 1, 1)

    # is chunking allowed?
    if self.chunk_size_cleanup:
        chunk_size = max(chunk_size or self.chunk_size_cleanup, 0)
    else:
        chunk_size = 0
    chunking = chunk_size > 0

    # build chunks (either job ids one by one, or real chunks of job ids)
    job_ids = make_list(job_ids)
    chunks = list(iter_chunks(job_ids, chunk_size)) if chunking else job_ids

    # factory to call the passed callback for each job id even when chunking
    def cb_factory(i):
        if not callable(callback):
            return None
        elif chunking:
            def wrapper(err):
                # index of the first job id of chunk i within the flat list of job ids
                offset = sum(len(chunk) for chunk in chunks[:i])
                for j in range(len(chunks[i])):
                    callback(offset + j, err)
            return wrapper
        else:
            def wrapper(err):
                callback(i, err)
            return wrapper

    # threaded processing
    pool = ThreadPool(threads)
    results = [
        pool.apply_async(self.cleanup, (v,), kwargs, callback=cb_factory(i))
        for i, v in enumerate(chunks)
    ]
    pool.close()
    pool.join()

    # store errors in an actual list, a lazy filter object as returned by python 3 would
    # always be truthy, even when no error occurred
    errors = [
        err
        for err in flatten(get_async_result_silent(res) for res in results)
        if err
    ]

    return errors
def submit_batch(self, job_files, threads=None, chunk_size=None, callback=None, **kwargs):
    """
    Submits a batch of jobs given by *job_files* via a thread pool of size *threads* which
    defaults to its instance attribute. When chunking is enabled, i.e., when
    :py:attr:`chunk_size_submit` is set and *chunk_size*, which defaults to that attribute, is
    positive, *job_files* are split into chunks of that size which are passed to
    :py:meth:`submit`. When *callback* is set, it is invoked after each successful job
    submission with the index of the corresponding job file (starting at 0) and either the
    assigned job id or an exception if any occurred. All other *kwargs* are passed to
    :py:meth:`submit`.

    The return value is a list containing the return values of the particular
    :py:meth:`submit` calls, in an order that corresponds to *job_files*. When an exception
    was raised during a submission, this exception is added to the returned list.
    """
    # default arguments
    threads = max(threads or self.threads or 1, 1)

    # chunking is only used when enabled on the instance
    chunk_size = max(chunk_size or self.chunk_size_submit, 0) if self.chunk_size_submit else 0
    chunking = chunk_size > 0

    # either real chunks of job files, or job files one by one
    job_files = make_list(job_files)
    if chunking:
        chunks = list(iter_chunks(job_files, chunk_size))
    else:
        chunks = job_files

    def cb_factory(i):
        # creates a function that invokes the callback for each job file, even when chunking
        if not callable(callback):
            return None

        if chunking:
            def wrapper(job_ids):
                # index of the first job file of chunk i within the flat list of job files
                offset = sum(len(chunk) for chunk in chunks[:i])
                for j in range(len(chunks[i])):
                    # an exception accounts for all job files of the chunk
                    job_id = job_ids if isinstance(job_ids, Exception) else job_ids[j]
                    callback(offset + j, job_id)
        else:
            def wrapper(job_id):
                callback(i, job_id)

        return wrapper

    # threaded processing
    pool = ThreadPool(threads)
    results = [
        pool.apply_async(self.submit, (v,), kwargs, callback=cb_factory(i))
        for i, v in enumerate(chunks)
    ]
    pool.close()
    pool.join()

    # return values or errors, same length as job files, independent of chunking
    if not chunking:
        return flatten(get_async_result_silent(res) for res in results)

    outputs = []
    for chunk, res in six.moves.zip(chunks, results):
        job_ids = get_async_result_silent(res)
        if isinstance(job_ids, Exception):
            job_ids = len(chunk) * [job_ids]
        outputs.extend(job_ids)

    return outputs
def cmd(self, proxy_cmd):
    """
    Builds and returns the quoted ``singularity exec`` command that executes *proxy_cmd*
    within a container of the image.

    Whether volume binding and the forwarding of the law software are allowed is taken from
    the task methods ``singularity_allow_binds`` and ``singularity_forward_law`` if they
    exist, and from the config otherwise. An exception is raised when staging directories or
    volumes would have to be bound while binds are not allowed. When local scheduling is
    forced, ``--local-scheduler True`` is set on *proxy_cmd*.
    """
    # singularity exec command arguments
    # -e clears the environment
    exec_args = ["-e"]

    # directories within the container
    cfg = Config.instance()
    cfg_section = self.get_config_section()
    forward_dir = cfg.get_expanded(cfg_section, "forward_dir")
    python_dir = cfg.get_expanded(cfg_section, "python_dir")
    bin_dir = cfg.get_expanded(cfg_section, "bin_dir")
    stagein_dir_name = cfg.get_expanded(cfg_section, "stagein_dir_name")
    stageout_dir_name = cfg.get_expanded(cfg_section, "stageout_dir_name")

    def dst(*parts):
        # path of a forwarded object within the container
        return os.path.join(forward_dir, *(str(part) for part in parts))

    mounted_srcs = []

    def mount(*vol):
        # the same source is never mounted twice
        src = vol[0]
        if src in mounted_srcs:
            return
        mounted_srcs.append(src)

        # ensure that source directories exist
        if not os.path.isfile(src):
            makedirs(src)

        # store the mount point
        exec_args.extend(["-B", ":".join(vol)])

    # determine whether volume binding is allowed
    allow_binds_cb = getattr(self.task, "singularity_allow_binds", None)
    if callable(allow_binds_cb):
        allow_binds = allow_binds_cb()
    else:
        allow_binds = cfg.get_expanded(cfg_section, "allow_binds")

    # determine whether law software forwarding is allowed
    forward_law_cb = getattr(self.task, "singularity_forward_law", None)
    if callable(forward_law_cb):
        forward_law = forward_law_cb()
    else:
        forward_law = cfg.get_expanded(cfg_section, "forward_law")

    # environment variables to set
    env = self._get_env()

    # prevent python from writing byte code files
    env["PYTHONDONTWRITEBYTECODE"] = "1"

    if forward_law:
        # path variables, without binds the paths on the host are used directly
        if allow_binds:
            env["PATH"] = os.pathsep.join([dst("bin"), "$PATH"])
            env["PYTHONPATH"] = os.pathsep.join([dst(python_dir), "$PYTHONPATH"])
        else:
            env["PATH"] = "$PATH"
            env["PYTHONPATH"] = "$PYTHONPATH"

        # forward python sources of law and its dependencies, packages as whole directories
        # and plain modules as single files
        for mod in law_deps:
            mod_dir = os.path.dirname(mod.__file__)
            mod_name = os.path.splitext(os.path.basename(mod.__file__))[0]
            if mod_name == "__init__":
                vsrc = mod_dir
                vdst = dst(python_dir, os.path.basename(mod_dir))
            else:
                vsrc = os.path.join(mod_dir, mod_name + ".py")
                vdst = dst(python_dir, mod_name + ".py")

            if allow_binds:
                mount(vsrc, vdst)
            else:
                # prepend the directory that contains the dependency, once
                dep_path = os.path.dirname(vsrc)
                if dep_path not in env["PYTHONPATH"].split(os.pathsep):
                    env["PYTHONPATH"] = os.pathsep.join([dep_path, env["PYTHONPATH"]])

        # the law cli dir contains a law executable
        if allow_binds:
            cli_dir = dst(python_dir, "law", "cli")
        else:
            cli_dir = law_src_path("cli")
        env["PATH"] = os.pathsep.join([cli_dir, env["PATH"]])

        # forward the law config file
        if cfg.config_file:
            if allow_binds:
                mount(cfg.config_file, dst("law.cfg"))
                env["LAW_CONFIG_FILE"] = dst("law.cfg")
            else:
                env["LAW_CONFIG_FILE"] = cfg.config_file

        # forward the first existing luigi config file, checked in reverse order
        for p in luigi.configuration.LuigiConfigParser._config_paths[::-1]:
            if os.path.exists(p):
                if allow_binds:
                    mount(p, dst("luigi.cfg"))
                    env["LUIGI_CONFIG_PATH"] = dst("luigi.cfg")
                else:
                    env["LUIGI_CONFIG_PATH"] = p
                break

    # staging directories require binds
    if (self.stagein_info or self.stageout_info) and not allow_binds:
        raise Exception("cannot use stage-in or -out if binds are not allowed")

    if self.stagein_info:
        env["LAW_SANDBOX_STAGEIN_DIR"] = dst(stagein_dir_name)
        mount(self.stagein_info.stage_dir.path, dst(stagein_dir_name))
    if self.stageout_info:
        env["LAW_SANDBOX_STAGEOUT_DIR"] = dst(stageout_dir_name)
        mount(self.stageout_info.stage_dir.path, dst(stageout_dir_name))

    # forward volumes defined in the config and by the task, which requires binds as well
    vols = self._get_volumes()
    if vols and not allow_binds:
        raise Exception("cannot forward volumes to sandbox if binds are not allowed")

    for hdir, cdir in six.iteritems(vols):
        if not cdir:
            mount(hdir)
            continue
        cdir = self._expand_volume(cdir, bin_dir=dst(bin_dir), python_dir=dst(python_dir))
        mount(hdir, cdir)

    # handle local scheduling within the container
    if self.force_local_scheduler():
        proxy_cmd.add_arg("--local-scheduler", "True", overwrite=True)

    # the singularity exec command with the arguments from above
    singularity_exec_cmd = self._singularity_exec_cmd() + exec_args

    # commands to set up the environment
    setup_cmds = self._build_setup_cmds(env)

    # the final command
    chained = "; ".join(flatten(setup_cmds, proxy_cmd.build()))
    return quote_cmd(singularity_exec_cmd + [self.image, "bash", "-l", "-c", chained])
def remove(self, silent=True):
    """
    Removes all targets of this collection by calling their ``remove`` method, forwarding
    *silent*.
    """
    all_targets = flatten(self.flat_targets)
    for t in all_targets:
        t.remove(silent=silent)
def env(self):
    """
    Returns the environment variables as seen within a container of the singularity image.

    The environment is determined once per image and cached: python dumps ``os.environ`` via
    pickle into a file in a temporary directory on the host, which is then loaded again. When
    binds are allowed, that directory is bound to ``/tmp`` in the container. An exception
    containing the command output is raised when the command fails.
    """
    # strategy: unlike docker, singularity might not allow binding of paths that do not exist
    # in the container, so create a tmp directory on the host system and bind it as /tmp, let
    # python dump its full env into a file, and read the file again on the host system
    if self.image in self._envs:
        return self._envs[self.image]

    tmp_dir = LocalDirectoryTarget(is_tmp=True)
    tmp_dir.touch()

    tmp = tmp_dir.child("env", type="f")
    tmp.touch()

    # determine whether volume binding is allowed
    allow_binds_cb = getattr(self.task, "singularity_allow_binds", None)
    if callable(allow_binds_cb):
        allow_binds = allow_binds_cb()
    else:
        cfg = Config.instance()
        allow_binds = cfg.get_expanded(self.get_config_section(), "allow_binds")

    # arguments to configure the environment and the env file path as used by the command
    exec_args = ["-e"]
    if allow_binds:
        exec_args.extend(["-B", "{}:/tmp".format(tmp_dir.path)])
        env_file = "/tmp/{}".format(tmp.basename)
    else:
        env_file = tmp.path

    # the singularity exec command
    singularity_exec_cmd = self._singularity_exec_cmd() + exec_args

    setup_cmds = self._build_setup_cmds(self._get_env())

    # python command that dumps the environment
    py_cmd = "import os,pickle;" \
        + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(env_file)

    chained = "; ".join(flatten(setup_cmds, quote_cmd(["python", "-c", py_cmd])))
    cmd = quote_cmd(singularity_exec_cmd + [self.image, "bash", "-l", "-c", chained])

    # run it, stderr is merged into stdout for the error message
    code, out, _ = interruptable_popen(cmd, shell=True, executable="/bin/bash",
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    if code != 0:
        raise Exception("singularity sandbox env loading failed:\n{}".format(out))

    # load and cache
    env = tmp.load(formatter="pickle")
    self._envs[self.image] = env

    return self._envs[self.image]