Example #1
    def prepare_stageout(self, tmp_dir):
        # get the sandbox stage-out mask
        stageout_mask = self.task.sandbox_stageout()
        if not stageout_mask:
            return None

        # determine outputs as seen from outside and within the sandbox
        outputs = self.task.output()
        with patch_object(os, "environ", self.task.env, lock=True):
            sandbox_outputs = self.task.output()

        # apply the mask to both structs
        outputs = mask_struct(stageout_mask, outputs)
        sandbox_outputs = mask_struct(stageout_mask, sandbox_outputs)
        if not outputs:
            return None

        # define the stage-out directory
        cfg = Config.instance()
        section = self.sandbox_inst.get_config_section()
        stageout_dir = tmp_dir.child(cfg.get_expanded(section, "stageout_dir"),
                                     type="d")
        stageout_dir.touch()

        # create a lookup for input -> sandbox input
        sandbox_targets = dict(zip(flatten(outputs), flatten(sandbox_outputs)))

        return StageInfo(outputs, stageout_dir, sandbox_targets)
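The pairing above works because flatten() returns leaves in a deterministic
order, so zipping the flattened host-side and sandbox-side structures aligns
each output with its sandbox counterpart. A minimal sketch of that idea, with
a simplified stand-in for law.util.flatten (the real helper covers more types):

def flatten(struct):
    # reduce nested dicts/lists to a plain list of leaves, in stable order
    if isinstance(struct, dict):
        struct = list(struct.values())
    if isinstance(struct, (list, tuple)):
        leaves = []
        for value in struct:
            leaves.extend(flatten(value))
        return leaves
    return [struct]

outputs = {"a": ["/host/x.txt", "/host/y.txt"], "b": "/host/z.txt"}
sandbox_outputs = {"a": ["/box/x.txt", "/box/y.txt"], "b": "/box/z.txt"}

# same leaf order on both sides, so zip pairs host and sandbox views
sandbox_targets = dict(zip(flatten(outputs), flatten(sandbox_outputs)))
assert sandbox_targets["/host/y.txt"] == "/box/y.txt"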
Example #2
    def stagein(self, tmp_dir):
        # check if the stage-in dir is set
        cfg = Config.instance()
        section = self.sandbox_inst.get_config_section()
        stagein_dir_name = cfg.get_expanded(section, "stagein_dir_name")
        if not stagein_dir_name:
            return None

        # get the sandbox stage-in mask
        stagein_mask = self.task.sandbox_stagein()
        if not stagein_mask:
            return None

        # determine inputs as seen from outside and within the sandbox
        inputs = self.task.input()
        with patch_object(os, "environ", self.task.env, lock=True):
            sandbox_inputs = self.task.input()

        # apply the mask to both structs
        inputs = mask_struct(stagein_mask, inputs)
        sandbox_inputs = mask_struct(stagein_mask, sandbox_inputs)
        if not inputs:
            return None

        # create a lookup for input -> sandbox input
        sandbox_targets = dict(zip(flatten(inputs), flatten(sandbox_inputs)))

        # create the stage-in directory
        stagein_dir = tmp_dir.child(stagein_dir_name, type="d")
        stagein_dir.touch()

        # create the structure of staged inputs
        def stagein_target(target):
            sandbox_target = sandbox_targets[target]
            staged_target = make_staged_target(stagein_dir, sandbox_target)
            logger.debug("stage-in {} to {}".format(target.path,
                                                    staged_target.path))
            target.copy_to_local(staged_target)
            return staged_target

        def map_collection(func, collection, **kwargs):
            # return the mapped struct so that staged collections are preserved
            return map_struct(func, collection.targets, **kwargs)

        staged_inputs = map_struct(
            stagein_target,
            inputs,
            custom_mappings={TargetCollection: map_collection})

        logger.info("staged-in {} file(s)".format(len(stagein_dir.listdir())))

        return StageInfo(inputs, stagein_dir, staged_inputs)
Example #3
File: bash.py Project: riga/law
    def cmd(self, proxy_cmd):
        # environment variables to set
        env = self._get_env()

        # add staging directories
        if self.stagein_info:
            env["LAW_SANDBOX_STAGEIN_DIR"] = self.stagein_info.stage_dir.path
        if self.stageout_info:
            env["LAW_SANDBOX_STAGEOUT_DIR"] = self.stageout_info.stage_dir.path

        # get the bash command
        bash_cmd = self._bash_cmd()

        # build commands to setup the environment
        setup_cmds = self._build_setup_cmds(env)

        # handle local scheduling within the container
        if self.force_local_scheduler():
            proxy_cmd.add_arg("--local-scheduler", "True", overwrite=True)

        # build the final command
        cmd = quote_cmd(bash_cmd + [
            "-c",
            "; ".join(
                flatten("source \"{}\" \"\"".format(self.script), setup_cmds,
                        proxy_cmd.build())),
        ])

        return cmd
Example #4
def _flatten_output(output, depth):
    if isinstance(output, (list, tuple, set)) or is_lazy_iterable(output):
        return [(outp, depth, "{}: ".format(i)) for i, outp in enumerate(output)]
    elif isinstance(output, dict):
        return [(outp, depth, "{}: ".format(k)) for k, outp in six.iteritems(output)]
    else:
        return [(outp, depth, "") for outp in flatten(output)]
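For reference, this is what the three branches yield, assuming law.util.flatten
wraps a scalar into a one-element list (expected outputs shown as comments):

print(_flatten_output(["a", "b"], 1))  # [('a', 1, '0: '), ('b', 1, '1: ')]
print(_flatten_output({"x": "a"}, 2))  # [('a', 2, 'x: ')]
print(_flatten_output("a", 0))         # [('a', 0, '')]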
Example #5
def print_task_status(task, max_depth=0, target_depth=0, flags=None):
    max_depth = int(max_depth)
    target_depth = int(target_depth)
    if flags:
        flags = tuple(flags.lower().split("-"))

    print("print task status with max_depth {} and target_depth {}".format(
        max_depth, target_depth))

    done = []
    ind = "|   "
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        offset = depth * ind
        print(offset)
        print("{}> check status of {}".format(offset, dep.repr(color=True)))
        offset += ind

        if dep in done:
            print(offset + "- " + colored("outputs already checked", "yellow"))
            continue

        done.append(dep)

        for outp in flatten(dep.output()):
            print("{}- {}".format(offset, outp.repr(color=True)))

            status_text = outp.status_text(max_depth=target_depth,
                                           flags=flags,
                                           color=True)
            status_lines = status_text.split("\n")
            status_text = status_lines[0]
            for line in status_lines[1:]:
                status_text += "\n" + offset + "     " + line
            print("{}  -> {}".format(offset, status_text))
Example #6
def load(*packages):
    """
    Loads contrib *packages* and adds them to the law namespace. Example:

    .. code-block:: python

        import law
        law.contrib.load("docker")

        law.docker.DockerSandbox(...)

    It is ensured that packages are loaded only once.
    """
    for pkg in flatten(packages):
        if pkg in loaded_packages:
            logger.debug(
                "skip contrib package '{}', already loaded".format(pkg))
            continue
        elif not os.path.exists(law_src_path("contrib", pkg, "__init__.py")):
            raise Exception("contrib package '{}' does not exist".format(pkg))
        elif getattr(law, pkg, None):
            raise Exception(
                "cannot load contrib package '{}', attribute with that name already "
                "exists in the law module".format(pkg))

        mod = __import__("law.contrib.{}".format(pkg), globals(), locals(),
                         [pkg])
        setattr(law, pkg, mod)
        law.__all__.append(pkg)
        loaded_packages.append(pkg)

        logger.debug("loaded contrib package '{}'".format(pkg))
Example #7
    def cmd(self, proxy_cmd):
        # environment variables to set
        env = self._get_env()

        # add staging directories
        if self.stagein_info:
            env["LAW_SANDBOX_STAGEIN_DIR"] = self.stagein_info.stage_dir.path
        if self.stageout_info:
            env["LAW_SANDBOX_STAGEOUT_DIR"] = self.stageout_info.stage_dir.path

        # build commands to setup the environment
        setup_cmds = self._build_setup_cmds(env)

        # handle scheduling within the container
        ls_flag = "--local-scheduler"
        if self.force_local_scheduler() and ls_flag not in proxy_cmd:
            proxy_cmd.append(ls_flag)

        # build the final command
        cmd = quote_cmd([
            "bash", "-l", "-c", "; ".join(
                flatten("source \"{}\"".format(self.script), setup_cmds,
                        " ".join(proxy_cmd)))
        ])

        return cmd
Example #8
    def complete(self):
        outputs = [t for t in flatten(self.output()) if not t.optional]

        if len(outputs) == 0:
            logger.warning("task {!r} has either no non-optional outputs or no custom complete() "
                "method".format(self))
            return False

        return all(t.exists() for t in outputs)
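A sketch of the completeness rule with hypothetical stub targets, illustrating
that optional targets are excluded before the existence check:

class StubTarget:
    def __init__(self, exists, optional=False):
        self._exists = exists
        self.optional = optional

    def exists(self):
        return self._exists

targets = [StubTarget(True), StubTarget(False, optional=True)]
# the missing target is optional and filtered out, so the task counts as complete
print(all(t.exists() for t in targets if not t.optional))  # True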
Example #9
    def complete(self):
        outputs = [t for t in flatten(self.output()) if not t.optional]

        if len(outputs) == 0:
            msg = "task {!r} has either no non-optional outputs or no custom complete() method"
            warnings.warn(msg.format(self), stacklevel=2)
            return False

        return all(t.exists() for t in outputs)
Example #10
def flatten_collections(*targets):
    lookup = flatten(targets)
    targets = []

    while lookup:
        t = lookup.pop(0)
        if isinstance(t, TargetCollection):
            lookup[:0] = t._flat_target_list
        else:
            targets.append(t)

    return targets
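The while loop expands nested collections iteratively instead of recursing. A
self-contained sketch with a hypothetical stand-in for TargetCollection:

class FakeCollection:
    def __init__(self, targets):
        self._flat_target_list = targets

def flatten_collections_sketch(targets):
    lookup = list(targets)
    flat = []
    while lookup:
        t = lookup.pop(0)
        if isinstance(t, FakeCollection):
            # splice the collection's targets back into the front of the queue
            lookup[:0] = t._flat_target_list
        else:
            flat.append(t)
    return flat

nested = ["a", FakeCollection(["b", FakeCollection(["c"]), "d"]), "e"]
print(flatten_collections_sketch(nested))  # ['a', 'b', 'c', 'd', 'e']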
Example #11
    def __init__(self, targets, threshold=1.0, **kwargs):
        if isinstance(targets, types.GeneratorType):
            targets = list(targets)
        elif not isinstance(targets, (list, tuple, dict)):
            raise TypeError("invalid targets, must be of type: list, tuple, dict")

        Target.__init__(self, **kwargs)

        # store targets and threshold
        self.targets = targets
        self.threshold = threshold

        # store flat targets per element in the input structure of targets
        if isinstance(targets, (list, tuple)):
            gen = (flatten(v) for v in targets)
        else:  # dict
            gen = ((k, flatten(v)) for k, v in six.iteritems(targets))
        self._flat_targets = targets.__class__(gen)

        # also store an entirely flat list of targets for simplified iterations
        self._flat_target_list = flatten(targets)
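For a dict of targets, the two derived structures look as follows. The sketch
uses plain strings as stand-in targets and a tiny local flatten that mirrors
law.util.flatten for lists (an assumption, kept minimal on purpose):

def _flat(v):
    values = v if isinstance(v, list) else [v]
    return [x for e in values for x in (_flat(e) if isinstance(e, list) else [e])]

targets = {"x": ["t1", ["t2"]], "y": "t3"}
flat_targets = {k: _flat(v) for k, v in targets.items()}
print(flat_targets)  # {'x': ['t1', 't2'], 'y': ['t3']}
print([t for v in flat_targets.values() for t in v])  # ['t1', 't2', 't3']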
Example #12
def print_task_output(task, max_depth=0):
    max_depth = int(max_depth)

    print("print task output with max_depth {}\n".format(max_depth))

    done = []
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        done.append(dep)

        for outp in flatten(dep.output()):
            for uri in make_list(outp.uri()):
                print(uri)
Example #13
    def env(self):
        # strategy: create a tempfile, forward it to a container, let python dump its full env,
        # close the container and load the env file
        if self.image not in self._envs:
            tmp = LocalFileTarget(is_tmp=".env")
            tmp.touch()

            env_file = os.path.join("/tmp", tmp.unique_basename)

            # get the docker run command
            docker_run_cmd = self._docker_run_cmd()

            # mount the env file
            docker_run_cmd.extend(["-v", "{}:{}".format(tmp.path, env_file)])

            # build commands to setup the environment
            setup_cmds = self._build_setup_cmds(self._get_env())

            # build the python command that dumps the environment
            py_cmd = "import os,pickle;" \
                + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(env_file)

            # build the full command
            cmd = quote_cmd(docker_run_cmd + [
                self.image,
                "bash",
                "-l",
                "-c",
                "; ".join(
                    flatten(setup_cmds, quote_cmd(["python", "-c", py_cmd]))),
            ])

            # run it
            code, out, _ = interruptable_popen(cmd,
                                               shell=True,
                                               executable="/bin/bash",
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.STDOUT)
            if code != 0:
                raise Exception(
                    "docker sandbox env loading failed:\n{}".format(out))

            # load the environment from the tmp file
            env = tmp.load(formatter="pickle")

            # cache
            self._envs[self.image] = env

        return self._envs[self.image]
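The core trick, dumping the environment via pickle in a subprocess and loading
it back, also works without a container. A local sketch (sys.executable and
the temp file handling are illustrative, not part of the law API):

import os
import pickle
import subprocess
import sys
import tempfile

with tempfile.NamedTemporaryFile(suffix=".env", delete=False) as tmp:
    tmp_path = tmp.name

py_cmd = "import os,pickle;" \
    + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(tmp_path)
subprocess.check_call([sys.executable, "-c", py_cmd])

with open(tmp_path, "rb") as f:
    env = pickle.load(f)
print(env.get("HOME"))
os.remove(tmp_path)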
Example #14
def print_task_output(task, max_depth=0, scheme=True):
    max_depth = int(max_depth)
    scheme = flag_to_bool(scheme)

    print("print task output with max_depth {}, {} schemes\n".format(
        max_depth, "showing" if scheme else "hiding"))

    done = []
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        done.append(dep)

        for outp in flatten(dep.output()):
            kwargs = {}
            if isinstance(outp, (FileSystemTarget, FileCollection)):
                kwargs = {"scheme": scheme}
            for uri in make_list(outp.uri(**kwargs)):
                print(uri)
Example #15
    def __init__(self, targets, threshold=1.0, **kwargs):
        if not isinstance(targets, (list, tuple, dict)):
            raise TypeError(
                "invalid targets, must be of type: list, tuple, dict")

        super(TargetCollection, self).__init__(**kwargs)

        self.targets = targets
        self.threshold = threshold

        _flatten = lambda v: flatten(v.flat_targets
                                     if isinstance(v, TargetCollection) else v)
        if isinstance(targets, (list, tuple)):
            gen = (_flatten(v) for v in targets)
        else:  # dict
            gen = ((k, _flatten(v)) for k, v in targets.items())
        self.flat_targets = targets.__class__(gen)
Example #16
File: bash.py Project: riga/law
    def env(self):
        # strategy: create a tempfile, let python dump its full env in a subprocess and load the
        # env file again afterwards
        script = self.script
        if script not in self._envs:
            with tmp_file() as tmp:
                tmp_path = os.path.realpath(tmp[1])

                # get the bash command
                bash_cmd = self._bash_cmd()

                # build commands to setup the environment
                setup_cmds = self._build_setup_cmds(self._get_env())

                # build the python command that dumps the environment
                py_cmd = "import os,pickle;" \
                    + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(tmp_path)

                # build the full command
                cmd = quote_cmd(bash_cmd + [
                    "-c",
                    "; ".join(
                        flatten("source \"{}\" \"\"".format(
                            self.script), setup_cmds,
                                quote_cmd(["python", "-c", py_cmd]))),
                ])

                # run it
                returncode = interruptable_popen(cmd,
                                                 shell=True,
                                                 executable="/bin/bash")[0]
                if returncode != 0:
                    raise Exception("bash sandbox env loading failed")

                # load the environment from the tmp file
                pickle_kwargs = {"encoding": "utf-8"} if six.PY3 else {}
                with open(tmp_path, "rb") as f:
                    env = collections.OrderedDict(
                        six.moves.cPickle.load(f, **pickle_kwargs))

            # cache it
            self._envs[script] = env

        return self._envs[script]
Example #17
    def env(self):
        # strategy: create a tempfile, forward it to a container, let python dump its full env,
        # close the container and load the env file
        if self.image not in self._envs:
            with tmp_file() as tmp:
                tmp_path = os.path.realpath(tmp[1])
                env_path = os.path.join("/tmp", str(hash(tmp_path))[-8:])

                # build commands to setup the environment
                setup_cmds = self._build_setup_cmds(self._get_env())

                # arguments to configure the environment
                args = ["-v", "{}:{}".format(tmp_path, env_path)
                        ] + self.common_args()

                # build the command
                py_cmd = "import os,pickle;" \
                    + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(env_path)
                cmd = quote_cmd(["docker", "run"] + args + [
                    self.image,
                    "bash",
                    "-l",
                    "-c",
                    "; ".join(
                        flatten(setup_cmds,
                                quote_cmd(["python", "-c", py_cmd]))),
                ])

                # run it
                returncode = interruptable_popen(cmd,
                                                 shell=True,
                                                 executable="/bin/bash")[0]
                if returncode != 0:
                    raise Exception("docker sandbox env loading failed")

                # load the environment from the tmp file
                with open(tmp_path, "rb") as f:
                    env = six.moves.cPickle.load(f)

            # cache
            self._envs[self.image] = env

        return self._envs[self.image]
Example #18
    def walk_deps(self, max_depth=-1, order="level"):
        # see https://en.wikipedia.org/wiki/Tree_traversal
        if order not in ("level", "pre"):
            raise ValueError("unknown traversal order '{}', use 'level' or 'pre'".format(order))

        tasks = [(self, 0)]
        while len(tasks):
            task, depth = tasks.pop(0)
            if max_depth >= 0 and depth > max_depth:
                continue
            deps = flatten(task.requires())

            yield (task, deps, depth)

            deps = ((d, depth + 1) for d in deps)
            if order == "level":
                tasks[len(tasks):] = deps
            elif order == "pre":
                tasks[:0] = deps
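The two orders differ only in where new dependencies are queued: appending
yields breadth-first ("level"), prepending yields pre-order depth-first. A
self-contained sketch with a hypothetical static task tree:

class T:
    def __init__(self, name, deps=()):
        self.name, self.deps = name, list(deps)

    def requires(self):
        return self.deps

    def walk_deps(self, max_depth=-1, order="level"):
        tasks = [(self, 0)]
        while tasks:
            task, depth = tasks.pop(0)
            if max_depth >= 0 and depth > max_depth:
                continue
            deps = task.requires()
            yield task, deps, depth
            deps = [(d, depth + 1) for d in deps]
            if order == "level":
                tasks.extend(deps)  # breadth-first: append at the end
            else:  # "pre"
                tasks[:0] = deps  # depth-first pre-order: prepend

root = T("a", [T("b", [T("c")]), T("d")])
print([t.name for t, _, _ in root.walk_deps(order="level")])  # a, b, d, c
print([t.name for t, _, _ in root.walk_deps(order="pre")])    # a, b, c, d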
Example #19
def print_task_output(task, max_depth=0):
    max_depth = int(max_depth)

    print("print task output with max_depth {}\n".format(max_depth))

    def print_target(target):
        if isinstance(target, FileSystemTarget):
            print(target.uri())
        else:
            logger.warning("target listing not yet implemented for {}".format(target.__class__))

    done = []
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        done.append(dep)

        for outp in flatten(dep.output()):
            if isinstance(outp, TargetCollection):
                for t in outp._flat_target_list:
                    print_target(t)
            else:
                print_target(outp)
Example #20
def print_task_status(task, max_depth=0, target_depth=0, flags=None):
    from law.workflow.base import BaseWorkflow

    max_depth = int(max_depth)
    target_depth = int(target_depth)
    if flags:
        flags = tuple(flags.lower().split("-"))

    print("print task status with max_depth {} and target_depth {}".format(
        max_depth, target_depth))

    done = []
    ind = "|   "
    for dep, _, depth in task.walk_deps(max_depth=max_depth, order="pre"):
        offset = depth * ind
        print(offset)

        # when the dep is a workflow, preload its branch map which updates branch parameters
        if isinstance(dep, BaseWorkflow):
            dep.get_branch_map()

        print("{}> check status of {}".format(offset, dep.repr(color=True)))
        offset += ind

        if dep in done:
            print(offset + "- " + colored("outputs already checked", "yellow"))
            continue

        done.append(dep)

        for outp in flatten(dep.output()):
            print("{}- {}".format(offset, outp.repr(color=True)))

            status_text = outp.status_text(max_depth=target_depth, flags=flags, color=True)
            status_lines = status_text.split("\n")
            status_text = status_lines[0]
            for line in status_lines[1:]:
                status_text += "\n{}  {}".format(offset, line)
            print("{}  {}".format(offset, status_text))
Example #21
    def output(self):
        output = self.merge_output()

        if self.is_forest():
            return output

        if isinstance(output, (list, tuple, TargetCollection)):
            output = output[self.tree_index]

        if self.is_root():
            return output

        # get the directory in which intermediate outputs are stored
        if isinstance(output, SiblingFileCollection):
            intermediate_dir = output.dir
        else:
            first_output = flatten(output)[0]
            if not isinstance(first_output, FileSystemTarget):
                raise Exception(
                    "cannot determine directory for intermediate merged outputs from "
                    "'{}'".format(output))
            intermediate_dir = first_output.parent

        # helper to create an intermediate output
        def get_intermediate_output(leaf_output):
            name, ext = os.path.splitext(leaf_output.basename)
            basename = self.node_format.format(name=name,
                                               ext=ext,
                                               tree=self.tree_index,
                                               branch=self.branch,
                                               depth=self.tree_depth)
            return intermediate_dir.child(basename, type="f")

        # return intermediate outputs in the same structure
        if isinstance(output, TargetCollection):
            return output.map(get_intermediate_output)
        return map_struct(get_intermediate_output, output)
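How the intermediate basename is assembled, assuming a node_format such as
"{name}.tree{tree}.branch{branch}.d{depth}{ext}" (an illustrative value, not
necessarily the task's default):

import os

node_format = "{name}.tree{tree}.branch{branch}.d{depth}{ext}"
name, ext = os.path.splitext("merged.root")
print(node_format.format(name=name, ext=ext, tree=0, branch=3, depth=1))
# -> merged.tree0.branch3.d1.root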
Example #22
def load(*packages):
    """
    Loads contrib *packages* and adds members exposed in ``__all__`` to the law main module.
    Example:

    .. code-block:: python

        import law
        law.contrib.load("numpy")

        print(law.NumpyFormatter)
        # -> <class 'law.contrib.numpy.formatter.NumpyFormatter'>

    It is ensured that packages are loaded only once.
    """
    for pkg in flatten(packages):
        if pkg in loaded_packages:
            logger.debug(
                "skip contrib package '{}', already loaded".format(pkg))
            continue
        loaded_packages.append(pkg)

        mod = __import__("law.contrib.{}".format(pkg), globals(), locals(),
                         [pkg])
        logger.debug("loaded contrib package '{}'".format(pkg))

        for attr in mod.__all__:
            if hasattr(law, attr):
                logger.info(
                    "cannot register 'law.contrib.{0}.{1}' to 'law.{1}', "
                    "already exists".format(pkg, attr))
            else:
                setattr(law, attr, getattr(mod, attr))
                law.__all__.append(attr)
                logger.debug(
                    "registered 'law.contrib.{0}.{1}' to 'law.{1}'".format(
                        pkg, attr))
Example #23
def load(*packages):
    """
    Loads contrib *packages* and adds them to the law namespace. Effectively, this removes the
    necessity of having ``contrib`` module in imports or when accessing members. Example:

    .. code-block:: python

        import law
        law.contrib.load("slack")

        print(law.slack.NotifySlackParameter)  # instead of law.contrib.slack.NotifySlackParameter
        # -> <class '...'>

    It is ensured that packages are loaded only once.
    """
    for pkg in flatten(packages):
        if pkg in loaded_packages:
            logger.debug("skip contrib package '{}', already loaded".format(pkg))
            continue
        elif not os.path.exists(law_src_path("contrib", pkg, "__init__.py")):
            raise Exception("contrib package '{}' does not exist".format(pkg))
        elif getattr(law, pkg, None):
            raise Exception("cannot load contrib package '{}', attribute with that name already "
                "exists on the law module".format(pkg))

        mod = __import__("law.contrib.{}".format(pkg), globals(), locals(), [pkg])
        setattr(law, pkg, mod)
        law.__all__.append(pkg)
        loaded_packages.append(pkg)

        logger.debug("loaded contrib package '{}'".format(pkg))

        # the contrib mechanism used to add all members of the module to the main law namespace
        # but given the growing number of contrib packages, the chance of collisions is not
        # negligible any longer, so for the moment add dummy objects only for callables to the law
        # module that, when used, raise verbose exceptions
        # (to be removed for v0.1)
        def dummy_factory(pkg, attr, member):
            def _raise():
                raise AttributeError("due to a change in 'law.contrib.load()', the attribute '{0}' "
                    "is no longer accessible on the global 'law' namespace, please use "
                    "'law.{1}.{0}' instead".format(attr, pkg))

            if isinstance(member, types.FunctionType):
                def dummy(*args, **kwargs):
                    """
                    Dummy function throwing an *AttributeError* when called.
                    """
                    _raise()
            else:
                class dummy(member):
                    """
                    Dummy class throwing an *AttributeError* when instantiated.
                    """
                    exclude_index = True
                    name = str(uuid.uuid4())
                    def __new__(cls, *args, **kwargs):
                        _raise()

            return dummy

        for attr in mod.__all__:
            member = getattr(mod, attr)
            if callable(member):
                setattr(law, attr, dummy_factory(pkg, attr, member))
            else:
                logger.debug("skip creating dummy object for attribute {} of package {}".format(
                    attr, pkg))
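A reduced sketch of the dummy mechanism for the function case (the class case
is analogous; the names and message below are illustrative):

def dummy_factory(pkg, attr):
    def dummy(*args, **kwargs):
        # calling the relocated attribute raises a verbose error
        raise AttributeError(
            "'{0}' is no longer accessible on the global 'law' namespace, "
            "please use 'law.{1}.{0}' instead".format(attr, pkg))
    return dummy

notify = dummy_factory("slack", "notify_slack")
try:
    notify("message")
except AttributeError as e:
    print(e)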
Example #24
    def complete(self):
        if self.is_forest():
            return all(task.complete() for task in flatten(self.requires()))
        else:
            return super(CascadeMerge, self).complete()
Example #25
    def cmd(self, proxy_cmd):
        cfg = Config.instance()

        # docker run command arguments
        args = []

        # add args configured on the task
        args_getter = getattr(self.task, "docker_args", None)
        args += make_list(
            args_getter() if callable(args_getter) else self.default_docker_args)

        # container name
        args.extend([
            "--name", "{}_{}".format(self.task.task_id,
                                     str(uuid.uuid4())[:8])
        ])

        # container hostname
        args.extend(["-h", "{}".format(socket.gethostname())])

        # helper to build forwarded paths
        section = self.get_config_section()
        forward_dir = cfg.get_expanded(section, "forward_dir")
        python_dir = cfg.get_expanded(section, "python_dir")
        bin_dir = cfg.get_expanded(section, "bin_dir")
        stagein_dir = cfg.get_expanded(section, "stagein_dir")
        stageout_dir = cfg.get_expanded(section, "stageout_dir")

        def dst(*args):
            return os.path.join(forward_dir, *(str(arg) for arg in args))

        # helper for mounting a volume
        volume_srcs = []

        def mount(*vol):
            src = vol[0]

            # make sure the same source directory is not mounted twice
            if src in volume_srcs:
                return
            volume_srcs.append(src)

            # ensure that source directories exist
            if not os.path.isfile(src) and not os.path.exists(src):
                os.makedirs(src)

            # store the mount point
            args.extend(["-v", ":".join(vol)])

        # environment variables to set
        env = self._get_env()

        # add staging directories
        if self.stagein_info:
            env["LAW_SANDBOX_STAGEIN_DIR"] = dst(stagein_dir)
            mount(self.stagein_info.stage_dir.path, dst(stagein_dir))
        if self.stageout_info:
            env["LAW_SANDBOX_STAGEOUT_DIR"] = dst(stageout_dir)
            mount(self.stageout_info.stage_dir.path, dst(stageout_dir))

        # prevent python from writing byte code files
        env["PYTHONDONTWRITEBYTECODE"] = "1"

        # adjust path variables
        env["PATH"] = os.pathsep.join(["$PATH", dst("bin")])
        env["PYTHONPATH"] = os.pathsep.join(["$PYTHONPATH", dst(python_dir)])

        # forward python directories of law and dependencies
        for mod in law_deps:
            path = os.path.dirname(mod.__file__)
            name, ext = os.path.splitext(os.path.basename(mod.__file__))
            if name == "__init__":
                vsrc = path
                vdst = dst(python_dir, os.path.basename(path))
            else:
                vsrc = os.path.join(path, name + ".py")
                vdst = dst(python_dir, name + ".py")
            mount(vsrc, vdst)

        # forward the law cli dir to bin as it contains a law executable
        env["PATH"] = os.pathsep.join(
            [env["PATH"], dst(python_dir, "law", "cli")])

        # forward the law config file
        if cfg.config_file:
            mount(cfg.config_file, dst("law.cfg"))
            env["LAW_CONFIG_FILE"] = dst("law.cfg")

        # forward the luigi config file
        for p in luigi.configuration.LuigiConfigParser._config_paths[::-1]:
            if os.path.exists(p):
                mount(p, dst("luigi.cfg"))
                env["LUIGI_CONFIG_PATH"] = dst("luigi.cfg")
                break

        # forward volumes defined in the config and by the task
        vols = self._get_volumes()
        for hdir, cdir in six.iteritems(vols):
            if not cdir:
                mount(hdir)
            else:
                cdir = cdir.replace("${PY}", dst(python_dir)).replace(
                    "${BIN}", dst(bin_dir))
                mount(hdir, cdir)

        # extend by arguments needed for both env loading and executing the job
        args.extend(self.common_args())

        # build commands to setup the environment
        setup_cmds = self._build_setup_cmds(env)

        # handle scheduling within the container
        ls_flag = "--local-scheduler"
        if self.force_local_scheduler() and ls_flag not in proxy_cmd:
            proxy_cmd.append(ls_flag)
        if ls_flag not in proxy_cmd:
            # when the scheduler runs on the host system, we need to use the host network
            # interface and set the correct luigi scheduler host as seen by the container
            if self.scheduler_on_host():
                args.extend(["--network", "host"])
                proxy_cmd.extend(
                    ["--scheduler-host", "{}".format(self.get_host_ip())])

        # build the final command
        cmd = quote_cmd(["docker", "run"] + args + [
            self.image, "bash", "-l", "-c", "; ".join(
                flatten(setup_cmds, " ".join(proxy_cmd)))
        ])

        return cmd
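The mount() helper above deduplicates bind mounts by source path. The same
logic, detached from docker (args and volume_srcs are plain lists here):

args, volume_srcs = [], []

def mount(*vol):
    src = vol[0]
    # skip sources that are already mounted
    if src in volume_srcs:
        return
    volume_srcs.append(src)
    args.extend(["-v", ":".join(vol)])

mount("/data", "/forward/data")
mount("/data", "/elsewhere")  # ignored, same source directory
print(args)  # ['-v', '/data:/forward/data']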
Example #26
    def cleanup_batch(self,
                      job_ids,
                      threads=None,
                      chunk_size=None,
                      callback=None,
                      **kwargs):
        """
        Cleans up a batch of jobs given by *job_ids* via a thread pool of size *threads* which
        defaults to its instance attribute. When *chunk_size*, which defaults to
        :py:attr:`chunk_size_cleanup`, is not negative, *job_ids* are split into chunks of that size
        which are passed to :py:meth:`cleanup`. When *callback* is set, it is invoked after each
        successful job (or job chunk) cleaning with the index of the corresponding job id (starting
        at 0) and either *None* or an exception if any occurred. All other *kwargs* are passed to
        :py:meth:`cleanup`.

        Exceptions that occurred during job cleaning are stored in a list and returned. An empty
        list means that no exceptions occurred.
        """
        # default arguments
        threads = max(threads or self.threads or 1, 1)

        # is chunking allowed?
        if self.chunk_size_cleanup:
            chunk_size = max(chunk_size or self.chunk_size_cleanup, 0)
        else:
            chunk_size = 0
        chunking = chunk_size > 0

        # build chunks (either job ids one by one, or real chunks of job ids)
        job_ids = make_list(job_ids)
        chunks = list(iter_chunks(job_ids, chunk_size)) if chunking else job_ids

        # factory to call the passed callback for each job id even when chunking
        def cb_factory(i):
            if not callable(callback):
                return None
            elif chunking:

                def wrapper(err):
                    offset = sum(len(chunk) for chunk in chunks[:i])
                    for j in range(len(chunks[i])):
                        callback(offset + j, err)

                return wrapper
            else:

                def wrapper(err):
                    callback(i, err)

                return wrapper

        # threaded processing
        pool = ThreadPool(threads)
        results = [
            pool.apply_async(self.cleanup, (v, ),
                             kwargs,
                             callback=cb_factory(i))
            for i, v in enumerate(chunks)
        ]
        pool.close()
        pool.join()

        # store errors
        errors = list(filter(
            bool, flatten(get_async_result_silent(res) for res in results)))

        return errors
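The chunked-callback bookkeeping in cb_factory, in isolation: the wrapper for
chunk i reports every contained job under its global index (values below are
illustrative):

chunks = [["j0", "j1", "j2"], ["j3", "j4"]]

def cb_factory(i, callback):
    def wrapper(err):
        # global index of the first job in chunk i
        offset = sum(len(chunk) for chunk in chunks[:i])
        for j in range(len(chunks[i])):
            callback(offset + j, err)
    return wrapper

cb_factory(1, lambda idx, err: print(idx, err))(None)
# -> 3 None
# -> 4 None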
Example #27
    def submit_batch(self,
                     job_files,
                     threads=None,
                     chunk_size=None,
                     callback=None,
                     **kwargs):
        """
        Submits a batch of jobs given by *job_files* via a thread pool of size *threads* which
        defaults to its instance attribute. When *chunk_size*, which defaults to
        :py:attr:`chunk_size_submit`, is not negative, *job_files* are split into chunks of that
        size which are passed to :py:meth:`submit`. When *callback* is set, it is invoked after each
        successful job submission with the index of the corresponding job file (starting at 0) and
        either the assigned job id or an exception if any occurred. All other *kwargs* are passed to
        :py:meth:`submit`.

        The return value is a list containing the return values of the particular :py:meth:`submit`
        calls, in an order that corresponds to *job_files*. When an exception was raised during a
        submission, this exception is added to the returned list.
        """
        # default arguments
        threads = max(threads or self.threads or 1, 1)

        # is chunking allowed?
        if self.chunk_size_submit:
            chunk_size = max(chunk_size or self.chunk_size_submit, 0)
        else:
            chunk_size = 0
        chunking = chunk_size > 0

        # build chunks (either job files one by one, or real chunks of job files)
        job_files = make_list(job_files)
        chunks = list(iter_chunks(job_files, chunk_size)) if chunking else job_files

        # factory to call the passed callback for each job file even when chunking
        def cb_factory(i):
            if not callable(callback):
                return None
            elif chunking:

                def wrapper(job_ids):
                    offset = sum(len(chunk) for chunk in chunks[:i])
                    for j in range(len(chunks[i])):
                        job_id = job_ids if isinstance(
                            job_ids, Exception) else job_ids[j]
                        callback(offset + j, job_id)

                return wrapper
            else:

                def wrapper(job_id):
                    callback(i, job_id)

                return wrapper

        # threaded processing
        pool = ThreadPool(threads)
        results = [
            pool.apply_async(self.submit, (v, ),
                             kwargs,
                             callback=cb_factory(i))
            for i, v in enumerate(chunks)
        ]
        pool.close()
        pool.join()

        # store return values or errors, same length as job files, independent of chunking
        if chunking:
            outputs = []
            for i, (chunk, res) in enumerate(six.moves.zip(chunks, results)):
                job_ids = get_async_result_silent(res)
                if isinstance(job_ids, Exception):
                    job_ids = len(chunk) * [job_ids]
                outputs.extend(job_ids)
        else:
            outputs = flatten(get_async_result_silent(res) for res in results)

        return outputs
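A sketch of the output normalization at the end: when a whole chunk submission
fails, the exception is broadcast to one entry per job file so that outputs
stay aligned with job_files (values are illustrative):

chunks = [["f0", "f1"], ["f2"]]
results = [["id0", "id1"], Exception("submission failed")]

outputs = []
for chunk, job_ids in zip(chunks, results):
    if isinstance(job_ids, Exception):
        job_ids = len(chunk) * [job_ids]
    outputs.extend(job_ids)
print(outputs)  # ['id0', 'id1', Exception('submission failed')]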
Example #28
    def cmd(self, proxy_cmd):
        # singularity exec command arguments
        # -e clears the environment
        args = ["-e"]

        # helper to build forwarded paths
        cfg = Config.instance()
        cfg_section = self.get_config_section()
        forward_dir = cfg.get_expanded(cfg_section, "forward_dir")
        python_dir = cfg.get_expanded(cfg_section, "python_dir")
        bin_dir = cfg.get_expanded(cfg_section, "bin_dir")
        stagein_dir_name = cfg.get_expanded(cfg_section, "stagein_dir_name")
        stageout_dir_name = cfg.get_expanded(cfg_section, "stageout_dir_name")

        def dst(*args):
            return os.path.join(forward_dir, *(str(arg) for arg in args))

        # helper for mounting a volume
        volume_srcs = []

        def mount(*vol):
            src = vol[0]

            # make sure the same source directory is not mounted twice
            if src in volume_srcs:
                return
            volume_srcs.append(src)

            # ensure that source directories exist
            if not os.path.isfile(src):
                makedirs(src)

            # store the mount point
            args.extend(["-B", ":".join(vol)])

        # determine whether volume binding is allowed
        allow_binds_cb = getattr(self.task, "singularity_allow_binds", None)
        if callable(allow_binds_cb):
            allow_binds = allow_binds_cb()
        else:
            allow_binds = cfg.get_expanded(cfg_section, "allow_binds")

        # determine whether law software forwarding is allowed
        forward_law_cb = getattr(self.task, "singularity_forward_law", None)
        if callable(forward_law_cb):
            forward_law = forward_law_cb()
        else:
            forward_law = cfg.get_expanded(cfg_section, "forward_law")

        # environment variables to set
        env = self._get_env()

        # prevent python from writing byte code files
        env["PYTHONDONTWRITEBYTECODE"] = "1"

        if forward_law:
            # adjust path variables
            if allow_binds:
                env["PATH"] = os.pathsep.join([dst("bin"), "$PATH"])
                env["PYTHONPATH"] = os.pathsep.join(
                    [dst(python_dir), "$PYTHONPATH"])
            else:
                env["PATH"] = "$PATH"
                env["PYTHONPATH"] = "$PYTHONPATH"

            # forward python directories of law and dependencies
            for mod in law_deps:
                path = os.path.dirname(mod.__file__)
                name, ext = os.path.splitext(os.path.basename(mod.__file__))
                if name == "__init__":
                    vsrc = path
                    vdst = dst(python_dir, os.path.basename(path))
                else:
                    vsrc = os.path.join(path, name + ".py")
                    vdst = dst(python_dir, name + ".py")
                if allow_binds:
                    mount(vsrc, vdst)
                else:
                    dep_path = os.path.dirname(vsrc)
                    if dep_path not in env["PYTHONPATH"].split(os.pathsep):
                        env["PYTHONPATH"] = os.pathsep.join(
                            [dep_path, env["PYTHONPATH"]])

            # forward the law cli dir to bin as it contains a law executable
            if allow_binds:
                env["PATH"] = os.pathsep.join(
                    [dst(python_dir, "law", "cli"), env["PATH"]])
            else:
                env["PATH"] = os.pathsep.join(
                    [law_src_path("cli"), env["PATH"]])

            # forward the law config file
            if cfg.config_file:
                if allow_binds:
                    mount(cfg.config_file, dst("law.cfg"))
                    env["LAW_CONFIG_FILE"] = dst("law.cfg")
                else:
                    env["LAW_CONFIG_FILE"] = cfg.config_file

            # forward the luigi config file
            for p in luigi.configuration.LuigiConfigParser._config_paths[::-1]:
                if os.path.exists(p):
                    if allow_binds:
                        mount(p, dst("luigi.cfg"))
                        env["LUIGI_CONFIG_PATH"] = dst("luigi.cfg")
                    else:
                        env["LUIGI_CONFIG_PATH"] = p
                    break

        # add staging directories
        if (self.stagein_info or self.stageout_info) and not allow_binds:
            raise Exception(
                "cannot use stage-in or -out if binds are not allowed")

        if self.stagein_info:
            env["LAW_SANDBOX_STAGEIN_DIR"] = dst(stagein_dir_name)
            mount(self.stagein_info.stage_dir.path, dst(stagein_dir_name))
        if self.stageout_info:
            env["LAW_SANDBOX_STAGEOUT_DIR"] = dst(stageout_dir_name)
            mount(self.stageout_info.stage_dir.path, dst(stageout_dir_name))

        # forward volumes defined in the config and by the task
        vols = self._get_volumes()
        if vols and not allow_binds:
            raise Exception(
                "cannot forward volumes to sandbox if binds are not allowed")

        for hdir, cdir in six.iteritems(vols):
            if not cdir:
                mount(hdir)
            else:
                cdir = self._expand_volume(cdir,
                                           bin_dir=dst(bin_dir),
                                           python_dir=dst(python_dir))
                mount(hdir, cdir)

        # handle local scheduling within the container
        if self.force_local_scheduler():
            proxy_cmd.add_arg("--local-scheduler", "True", overwrite=True)

        # get the singularity exec command, add arguments from above
        singularity_exec_cmd = self._singularity_exec_cmd() + args

        # build commands to set up environment
        setup_cmds = self._build_setup_cmds(env)

        # build the final command
        cmd = quote_cmd(singularity_exec_cmd + [
            self.image,
            "bash",
            "-l",
            "-c",
            "; ".join(flatten(setup_cmds, proxy_cmd.build())),
        ])

        return cmd
Example #29
    def remove(self, silent=True):
        for target in flatten(self.flat_targets):
            target.remove(silent=silent)
Example #30
    def env(self):
        # strategy: unlike docker, singularity might not allow binding of paths that do not exist
        # in the container, so create a tmp directory on the host system and bind it as /tmp, let
        # python dump its full env into a file, and read the file again on the host system
        if self.image not in self._envs:
            tmp_dir = LocalDirectoryTarget(is_tmp=True)
            tmp_dir.touch()

            tmp = tmp_dir.child("env", type="f")
            tmp.touch()

            # determine whether volume binding is allowed
            allow_binds_cb = getattr(self.task, "singularity_allow_binds",
                                     None)
            if callable(allow_binds_cb):
                allow_binds = allow_binds_cb()
            else:
                cfg = Config.instance()
                allow_binds = cfg.get_expanded(self.get_config_section(),
                                               "allow_binds")

            # arguments to configure the environment
            args = ["-e"]
            if allow_binds:
                args.extend(["-B", "{}:/tmp".format(tmp_dir.path)])
                env_file = "/tmp/{}".format(tmp.basename)
            else:
                env_file = tmp.path

            # get the singularity exec command
            singularity_exec_cmd = self._singularity_exec_cmd() + args

            # build commands to setup the environment
            setup_cmds = self._build_setup_cmds(self._get_env())

            # build the python command that dumps the environment
            py_cmd = "import os,pickle;" \
                + "pickle.dump(dict(os.environ),open('{}','wb'),protocol=2)".format(env_file)

            # build the full command
            cmd = quote_cmd(singularity_exec_cmd + [
                self.image,
                "bash",
                "-l",
                "-c",
                "; ".join(
                    flatten(setup_cmds, quote_cmd(["python", "-c", py_cmd]))),
            ])

            # run it
            code, out, _ = interruptable_popen(cmd,
                                               shell=True,
                                               executable="/bin/bash",
                                               stdout=subprocess.PIPE,
                                               stderr=subprocess.STDOUT)
            if code != 0:
                raise Exception(
                    "singularity sandbox env loading failed:\n{}".format(out))

            # load the environment from the tmp file
            env = tmp.load(formatter="pickle")

            # cache
            self._envs[self.image] = env

        return self._envs[self.image]