Example #1
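A version of the Job constructor: wildcards from the target file are expanded into concrete input, output, params, log, and benchmark files; resource callables are evaluated (and must return an int), every resource is capped at the matching global resource, and output/input files are classified as dynamic, temp, protected, touch, or subworkflow input. All examples in this section appear to be taken from different versions of the Snakemake workflow engine.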
    def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
        self.rule = rule
        self.dag = dag
        self.targetfile = targetfile

        self.wildcards_dict = self.rule.get_wildcards(targetfile)
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        (self.input, self.output, self.params, self.log, self.benchmark,
         self.ruleio,
         self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

        self.resources_dict = {}
        for name, res in rule.resources.items():
            if callable(res):
                res = res(self.wildcards)
                if not isinstance(res, int):
                    raise ValueError("Callable for resources must return int")
            self.resources_dict[name] = min(
                self.rule.workflow.global_resources.get(name, res), res)

        self.threads = self.resources_dict["_cores"]
        self.resources = Resources(fromdict=self.resources_dict)
        self.shadow_dir = None
        self._inputsize = None

        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        self._hash = self.rule.__hash__()
        if True or not self.dynamic_output:
            for o in self.output:
                self._hash ^= o.__hash__()
Example #2
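A group-job resources property that walks the toposorted levels of the group: demands of sibling jobs on one level are summed because they run in parallel, and the levels are combined with max; for local execution or pipe groups, where all jobs run simultaneously, everything is summed instead.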
    def resources(self):
        if self._resources is None:
            self._resources = defaultdict(int)
            self._resources["_nodes"] = 1
            pipe_group = any([job.is_pipe for job in self.jobs])
            # iterate over siblings that can be executed in parallel
            for siblings in self.toposorted:
                sibling_resources = defaultdict(int)
                for job in siblings:
                    try:
                        job_resources = job.resources
                    except FileNotFoundError:
                        # Skip job if resource evaluation leads to a file not found error.
                        # This will be caused by an inner job, which needs files created by the same group.
                        # All we can do is to ignore such jobs for now.
                        continue
                    for res, value in job_resources.items():
                        if res != "_nodes":
                            sibling_resources[res] += value

                for res, value in sibling_resources.items():
                    if res != "_nodes":
                        if self.dag.workflow.run_local or pipe_group:
                            # in case of local execution, this must be a
                            # group of jobs that are connected with pipes
                            # and have to run simultaneously
                            self._resources[res] += value
                        else:
                            # take the maximum with previous values
                            self._resources[res] = max(
                                self._resources.get(res, 0), value)

        return Resources(fromdict=self._resources)
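
For intuition, here is a minimal, self-contained sketch of the aggregation policy above (aggregate_group_resources and its arguments are invented names for illustration, not Snakemake API): sibling jobs on one toposort level run in parallel, so their demands are summed; successive levels run one after another, so levels are combined with max, unless the whole group runs simultaneously (local execution or pipes), in which case everything is summed.

from collections import defaultdict

def aggregate_group_resources(toposorted_levels, run_simultaneously=False):
    # toposorted_levels: list of levels, each level a list of per-job
    # resource dicts, e.g. [{"mem_mb": 1000}, {"mem_mb": 2000}]
    total = defaultdict(int)
    for level in toposorted_levels:
        # siblings run in parallel: sum their demands
        level_sum = defaultdict(int)
        for job_resources in level:
            for res, value in job_resources.items():
                level_sum[res] += value
        for res, value in level_sum.items():
            if run_simultaneously:
                # local execution / pipe group: all levels run at once
                total[res] += value
            else:
                # levels run sequentially: peak demand is the maximum
                total[res] = max(total[res], value)
    return dict(total)

print(aggregate_group_resources(
    [[{"mem_mb": 1000}, {"mem_mb": 2000}], [{"mem_mb": 500}]]))
# -> {'mem_mb': 3000}
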
Example #3
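A Rule.expand_resources variant: callable resources are evaluated through apply_input_function (optionally receiving threads), must return an int, and are capped at the corresponding global resource.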
    def expand_resources(self, wildcards, input, attempt):
        resources = dict()

        def apply(name, res, threads=None):
            if callable(res):
                aux = {"threads": threads} if threads is not None else dict()
                res = self.apply_input_function(res,
                                                wildcards,
                                                input=input,
                                                attempt=attempt,
                                                **aux)
                if not isinstance(res, int):
                    raise WorkflowError(
                        "Resources function did not return int.")
            res = min(self.workflow.global_resources.get(name, res), res)
            return res

        threads = apply("_cores", self.resources["_cores"])
        resources["_cores"] = threads

        for name, res in self.resources.items():
            if name != "_cores":
                resources[name] = apply(name, res)
        resources = Resources(fromdict=resources)
        return resources
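
For context, a callable resource of the kind expand_resources evaluates above is declared in a Snakefile roughly as follows (a hedged sketch: the rule name, paths, and memory figures are made up; Snakemake passes wildcards, input, threads, and attempt to such callables according to their signature):

rule sort_bam:
    input:
        "mapped/{sample}.bam"
    output:
        "sorted/{sample}.bam"
    resources:
        # re-evaluated on each retry: 4000 MB on the first attempt,
        # 8000 MB on the second, and so on
        mem_mb=lambda wildcards, attempt: 4000 * attempt
    shell:
        "samtools sort -m {resources.mem_mb}M -o {output} {input}"
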
Example #4
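An earlier group-job resources property: pipe groups are detected by checking every output file for the pipe flag; demands are summed for local or pipe execution and otherwise combined with max per resource.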
    def resources(self):
        if self._resources is None:
            self._resources = defaultdict(int)
            pipe_group = any([
                any([is_flagged(o, "pipe") for o in job.output])
                for job in self.jobs
            ])
            for job in self.jobs:
                try:
                    job_resources = job.resources
                except FileNotFoundError:
                    # Skip job if resource evaluation leads to a file not found error.
                    # This will be caused by an inner job, which needs files created by the same group.
                    # All we can do is to ignore such jobs for now.
                    continue
                for res, value in job_resources.items():
                    if self.dag.workflow.run_local or pipe_group:
                        # in case of local execution, this must be a
                        # group of jobs that are connected with pipes
                        # and have to run simultaneously
                        self._resources[res] += value
                    else:
                        # take the maximum over all jobs
                        self._resources[res] = max(
                            self._resources.get(res, value), value)
        return Resources(fromdict=self._resources)
Example #5
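expand_resources with the rule name and thread count passed through to resource callables, plus an incomplete_checkpoint_func fallback that substitutes 0.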
    def expand_resources(self, wildcards, input, attempt):
        resources = dict()

        def apply(name, res, threads=None):
            if callable(res):
                aux = dict(rulename=self.name)
                if threads:
                    aux["threads"] = threads
                res = self.apply_input_function(
                    res,
                    wildcards,
                    input=input,
                    attempt=attempt,
                    incomplete_checkpoint_func=lambda e: 0,
                    **aux)
                if not isinstance(res, int):
                    raise WorkflowError(
                        "Resources function did not return int.")
            res = min(self.workflow.global_resources.get(name, res), res)
            return res

        threads = apply("_cores", self.resources["_cores"])
        resources["_cores"] = threads

        for name, res in self.resources.items():
            if name != "_cores":
                resources[name] = apply(name, res, threads=threads)
        resources = Resources(fromdict=resources)
        return resources
Example #6
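A later expand_resources: float results are rounded to ints, str resources are allowed, the resource's type must match the type of the global constraint, and the thread count is additionally capped at workflow.max_threads.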
    def expand_resources(self, wildcards, input, attempt):
        resources = dict()

        def apply(name, res, threads=None):
            if callable(res):
                aux = dict(rulename=self.name)
                if threads is not None:
                    aux["threads"] = threads
                try:
                    res, _ = self.apply_input_function(
                        res,
                        wildcards,
                        input=input,
                        attempt=attempt,
                        incomplete_checkpoint_func=lambda e: 0,
                        raw_exceptions=True,
                        **aux,
                    )
                except (Exception, BaseException) as e:
                    raise InputFunctionException(e,
                                                 rule=self,
                                                 wildcards=wildcards)

            if isinstance(res, float):
                # round to integer
                res = int(round(res))

            if not isinstance(res, int) and not isinstance(res, str):
                raise WorkflowError(
                    f"Resource {name} is neither int, float(would be rounded to nearest int), or str.",
                    rule=self,
                )

            global_res = self.workflow.global_resources.get(name)
            if global_res is not None:
                if not isinstance(res,
                                  TBDString) and type(res) != type(global_res):
                    global_type = ("an int" if isinstance(global_res, int) else
                                   type(global_res))
                    raise WorkflowError(
                        f"Resource {name} is of type {type(res).__name__} but global resource constraint "
                        f"defines {global_type} with value {global_res}. "
                        "Resources with the same name need to have the same types (int, float, or str are allowed).",
                        rule=self,
                    )
                if isinstance(res, int):
                    res = min(global_res, res)
            return res

        threads = apply("_cores", self.resources["_cores"])
        if self.workflow.max_threads is not None:
            threads = min(threads, self.workflow.max_threads)
        resources["_cores"] = threads

        for name, res in self.resources.items():
            if name != "_cores":
                resources[name] = apply(name, res, threads=threads)
        resources = Resources(fromdict=resources)
        return resources
Example #7
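A Job constructor variant that resolves resources with a plain dict comprehension, i.e. without support for callables.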
    def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
        self.rule = rule
        self.dag = dag
        self.targetfile = targetfile

        self.wildcards_dict = self.rule.get_wildcards(targetfile)
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        (self.input, self.output, self.params, self.log, self.benchmark,
         self.ruleio,
         self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

        self.resources_dict = {
            name: min(self.rule.workflow.global_resources.get(name, res), res)
            for name, res in rule.resources.items()
        }
        self.threads = self.resources_dict["_cores"]
        self.resources = Resources(fromdict=self.resources_dict)
        self._inputsize = None

        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        self._hash = self.rule.__hash__()
        if True or not self.dynamic_output:
            for o in self.output:
                self._hash ^= o.__hash__()
Example #8
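The minimal expand_resources: no attempt, threads, or error handling, just callable evaluation and capping at the global limit.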
    def expand_resources(self, wildcards, input):
        resources = dict()
        for name, res in self.resources.items():
            if callable(res):
                res = self.apply_input_function(res, wildcards, input=input)
                if not isinstance(res, int):
                    raise WorkflowError(
                        "Resources function did not return int.")
            res = min(self.workflow.global_resources.get(name, res), res)
            resources[name] = res
        resources = Resources(fromdict=resources)
        return resources
Example #9
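expand_resources with dry-run support: a FileNotFoundError raised for a declared input file marks the resource as to-be-determined (TBDInt(0)) instead of failing the evaluation.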
    def expand_resources(self, wildcards, input, attempt):
        resources = dict()

        def apply(name, res, threads=None):
            if callable(res):
                aux = dict(rulename=self.name)
                if threads:
                    aux["threads"] = threads
                try:
                    try:
                        res, _ = self.apply_input_function(
                            res,
                            wildcards,
                            input=input,
                            attempt=attempt,
                            incomplete_checkpoint_func=lambda e: 0,
                            raw_exceptions=True,
                            **aux)
                    except FileNotFoundError as e:
                        # Resources can depend on input files. Since expansion can happen during dryrun,
                        # where input files are not yet present, we need to skip such resources and
                        # mark them as [TBD].
                        if e.filename in input:
                            # use zero for resource if it cannot yet be determined
                            res = TBDInt(0)
                        else:
                            raise e
                except (Exception, BaseException) as e:
                    raise InputFunctionException(e,
                                                 rule=self,
                                                 wildcards=wildcards)

                if not isinstance(res, int) and not isinstance(res, str):
                    raise WorkflowError(
                        "Resources function did not return int or str.",
                        rule=self)
            if isinstance(res, int):
                global_res = self.workflow.global_resources.get(name, res)
                if global_res is not None:
                    res = min(global_res, res)
            return res

        threads = apply("_cores", self.resources["_cores"])
        resources["_cores"] = threads

        for name, res in self.resources.items():
            if name != "_cores":
                resources[name] = apply(name, res, threads=threads)
        resources = Resources(fromdict=resources)
        return resources
Example #10
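Like Example #6, but without the type check against global resources; any exception raised by the callable is wrapped in an InputFunctionException.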
    def expand_resources(self, wildcards, input, attempt):
        resources = dict()

        def apply(name, res, threads=None):
            if callable(res):
                aux = dict(rulename=self.name)
                if threads is not None:
                    aux["threads"] = threads
                try:
                    res, _ = self.apply_input_function(
                        res,
                        wildcards,
                        input=input,
                        attempt=attempt,
                        incomplete_checkpoint_func=lambda e: 0,
                        raw_exceptions=True,
                        **aux)
                except (Exception, BaseException) as e:
                    raise InputFunctionException(e,
                                                 rule=self,
                                                 wildcards=wildcards)

            if isinstance(res, float):
                # round to integer
                res = int(round(res))

            if not isinstance(res, int) and not isinstance(res, str):
                raise WorkflowError(
                    "Resources function did not return int, float (floats are "
                    "rouded to the nearest integer), or str.",
                    rule=self,
                )
            if isinstance(res, int):
                global_res = self.workflow.global_resources.get(name, res)
                if global_res is not None:
                    res = min(global_res, res)
            return res

        threads = apply("_cores", self.resources["_cores"])
        if self.workflow.max_threads is not None:
            threads = min(threads, self.workflow.max_threads)
        resources["_cores"] = threads

        for name, res in self.resources.items():
            if name != "_cores":
                resources[name] = apply(name, res, threads=threads)
        resources = Resources(fromdict=resources)
        return resources
Example #11
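Essentially the same Job constructor as in Example #7.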
    def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
        self.rule = rule
        self.dag = dag
        self.targetfile = targetfile

        self.wildcards_dict = self.rule.get_wildcards(targetfile)
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        (self.input, self.output, self.params, self.log, self.benchmark,
         self.ruleio,
         self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

        self.resources_dict = {
            name: min(self.rule.workflow.global_resources.get(name, res), res)
            for name, res in rule.resources.items()
        }
        self.threads = self.resources_dict["_cores"]
        self.resources = Resources(fromdict=self.resources_dict)
        self._inputsize = None

        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        self._hash = self.rule.__hash__()
        if True or not self.dynamic_output:
            for o in self.output:
                self._hash ^= o.__hash__()
Example #12
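Group-job resources as in Example #4, but without the FileNotFoundError guard around per-job resource evaluation.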
    def resources(self):
        if self._resources is None:
            self._resources = defaultdict(int)
            # take the maximum over all jobs
            pipe_group = any([
                any([is_flagged(o, "pipe") for o in job.output])
                for job in self.jobs
            ])
            for job in self.jobs:
                for res, value in job.resources.items():
                    if self.dag.workflow.run_local or pipe_group:
                        # in case of local execution, this must be a
                        # group of jobs that are connected with pipes
                        # and have to run simultaneously
                        self._resources[res] += value
                    else:
                        self._resources[res] = max(
                            self._resources.get(res, value), value)
        return Resources(fromdict=self._resources)
Example #13
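The most elaborate group-job resources property: int resources are summed and maxed as before, TBDString values are skipped, and string-valued resources must be identical across all jobs in the group.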
    def resources(self):
        if self._resources is None:

            def check_string_resource(res, value1, value2):
                if value1 != value2:
                    raise WorkflowError(
                        "Failed to group jobs together. Resource {} "
                        "is a string but not all group jobs require the same value. "
                        "Observed: {} != {}.".format(res, value1, value2))

            self._resources = defaultdict(int)
            self._resources["_nodes"] = 1
            pipe_group = any([job.is_pipe for job in self.jobs])
            # iterate over siblings that can be executed in parallel
            for siblings in self.toposorted:
                sibling_resources = defaultdict(int)
                for job in siblings:
                    try:
                        job_resources = job.resources
                    except FileNotFoundError:
                        # Skip job if resource evaluation leads to a file not found error.
                        # This will be caused by an inner job, which needs files created by the same group.
                        # All we can do is to ignore such jobs for now.
                        continue
                    for res, value in job_resources.items():
                        if isinstance(value, int):
                            if res != "_nodes":
                                sibling_resources[res] += value
                        elif isinstance(value, TBDString):
                            # we omit TBDs
                            continue
                        else:
                            # all string resources must be the same for all group jobs
                            if res in sibling_resources:
                                check_string_resource(res,
                                                      sibling_resources[res],
                                                      value)
                            else:
                                sibling_resources[res] = value

                for res, value in sibling_resources.items():
                    if isinstance(value, int):
                        if res != "_nodes":
                            if self.dag.workflow.run_local or pipe_group:
                                # in case of local execution, this must be a
                                # group of jobs that are connected with pipes
                                # and have to run simultaneously
                                self._resources[res] += value
                            else:
                                # take the maximum with previous values
                                self._resources[res] = max(
                                    self._resources.get(res, 0), value)
                    elif isinstance(value, TBDString):
                        # we omit TBDs
                        continue
                    else:
                        # all string resources must be the same for all group jobs
                        if res in self._resources:
                            check_string_resource(res, self._resources[res],
                                                  value)
                        else:
                            self._resources[res] = value

        return Resources(fromdict=self._resources)
Example #14
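A complete Job class: besides the constructor, it provides properties for priority, input size, message and shell-command formatting, dynamic output expansion, missing/existing file queries, prepare/cleanup logic, and equality/hashing.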
class Job:
    HIGHEST_PRIORITY = sys.maxsize

    def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
        self.rule = rule
        self.dag = dag
        self.targetfile = targetfile

        self.wildcards_dict = self.rule.get_wildcards(targetfile)
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        (self.input, self.output, self.params, self.log, self.benchmark,
         self.ruleio,
         self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

        self.resources_dict = {
            name: min(self.rule.workflow.global_resources.get(name, res), res)
            for name, res in rule.resources.items()
        }
        self.threads = self.resources_dict["_cores"]
        self.resources = Resources(fromdict=self.resources_dict)
        self._inputsize = None

        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        self._hash = self.rule.__hash__()
        if True or not self.dynamic_output:
            for o in self.output:
                self._hash ^= o.__hash__()

    @property
    def priority(self):
        return self.dag.priority(self)

    @property
    def b64id(self):
        return base64.b64encode(
            (self.rule.name +
             "".join(self.output)).encode("utf-8")).decode("utf-8")

    @property
    def inputsize(self):
        """
        Return the size of the input files.
        Input files need to be present.
        """
        if self._inputsize is None:
            self._inputsize = sum(f.size for f in self.input)
        return self._inputsize

    @property
    def message(self):
        """ Return the message for this job. """
        try:
            return (self.format_wildcards(self.rule.message)
                    if self.rule.message else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable in message "
                                "of shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def shellcmd(self):
        """ Return the shell command. """
        try:
            return (self.format_wildcards(self.rule.shellcmd)
                    if self.rule.shellcmd else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable when printing "
                                "shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def expanded_output(self):
        """ Iterate over output files while dynamic output is expanded. """
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                expansion = self.expand_dynamic(
                    f_,
                    restriction=self.wildcards,
                    omit_value=_IOFile.dynamic_fill)
                if not expansion:
                    yield f_
                for f, _ in expansion:
                    yield IOFile(f, self.rule)
            else:
                yield f

    @property
    def dynamic_wildcards(self):
        """ Return all wildcard values determined from dynamic output. """
        combinations = set()
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                for f, w in self.expand_dynamic(
                        f_,
                        restriction=self.wildcards,
                        omit_value=_IOFile.dynamic_fill):
                    combinations.add(tuple(w.items()))
        wildcards = defaultdict(list)
        for combination in combinations:
            for name, value in combination:
                wildcards[name].append(value)
        return wildcards

    @property
    def missing_input(self):
        """ Return missing input files. """
        # omit file if it comes from a subworkflow
        return set(f for f in self.input
                   if not f.exists and not f in self.subworkflow_input)

    @property
    def output_mintime(self):
        """ Return oldest output file. """
        existing = [f.mtime for f in self.expanded_output if f.exists]
        if self.benchmark and self.benchmark.exists:
            existing.append(self.benchmark.mtime)
        if existing:
            return min(existing)
        return None

    @property
    def input_maxtime(self):
        """ Return newest input file. """
        existing = [f.mtime for f in self.input if f.exists]
        if existing:
            return max(existing)
        return None

    def missing_output(self, requested=None):
        """ Return missing output files. """
        files = set()
        if self.benchmark and (requested is None
                               or self.benchmark in requested):
            if not self.benchmark.exists:
                files.add(self.benchmark)

        for f, f_ in zip(self.output, self.rule.output):
            if requested is None or f in requested:
                if f in self.dynamic_output:
                    if not self.expand_dynamic(
                            f_,
                            restriction=self.wildcards,
                            omit_value=_IOFile.dynamic_fill):
                        files.add("{} (dynamic)".format(f_))
                elif not f.exists:
                    files.add(f)
        return files

    @property
    def existing_output(self):
        return filter(lambda f: f.exists, self.expanded_output)

    def check_protected_output(self):
        protected = list(filter(lambda f: f.protected, self.expanded_output))
        if protected:
            raise ProtectedOutputException(self.rule, protected)

    def prepare(self):
        """
        Prepare execution of job.
        This includes creation of directories and deletion of previously
        created dynamic files.
        """

        self.check_protected_output()

        unexpected_output = self.dag.reason(self).missing_output.intersection(
            self.existing_output)
        if unexpected_output:
            logger.warning(
                "Warning: the following output files of rule {} were not "
                "present when the DAG was created:\n{}".format(
                    self.rule, unexpected_output))

        if self.dynamic_output:
            for f, _ in chain(*map(
                    partial(self.expand_dynamic,
                            restriction=self.wildcards,
                            omit_value=_IOFile.dynamic_fill),
                    self.rule.dynamic_output)):
                os.remove(f)
        for f, f_ in zip(self.output, self.rule.output):
            f.prepare()
        for f in self.log:
            f.prepare()
        if self.benchmark:
            self.benchmark.prepare()

    def cleanup(self):
        """ Cleanup output files. """
        to_remove = [f for f in self.expanded_output if f.exists]
        if to_remove:
            logger.info("Removing output files of failed job {}"
                        " since they might be corrupted:\n{}".format(
                            self, ", ".join(to_remove)))
            for f in to_remove:
                f.remove()

    def format_wildcards(self, string, **variables):
        """ Format a string with variables from the job. """
        _variables = dict()
        _variables.update(self.rule.workflow.globals)
        _variables.update(
            dict(
                input=self.input,
                output=self.output,
                params=self.params,
                wildcards=self._format_wildcards,
                threads=self.threads,
                resources=self.resources,
                log=self.log,
                version=self.rule.version,
                rule=self.rule.name,
            ))
        _variables.update(variables)
        try:
            return format(string, **_variables)
        except NameError as ex:
            raise RuleException("NameError: " + str(ex), rule=self.rule)
        except IndexError as ex:
            raise RuleException("IndexError: " + str(ex), rule=self.rule)

    def properties(self, omit_resources="_cores _nodes".split()):
        resources = {
            name: res
            for name, res in self.resources.items()
            if name not in omit_resources
        }
        params = {name: value for name, value in self.params.items()}
        properties = {
            "rule": self.rule.name,
            "local": self.dag.workflow.is_local(self.rule),
            "input": self.input,
            "output": self.output,
            "params": params,
            "threads": self.threads,
            "resources": resources
        }
        return properties

    def json(self):
        return json.dumps(self.properties())

    def __repr__(self):
        return self.rule.name

    def __eq__(self, other):
        if other is None:
            return False
        return self.rule == other.rule and (
            self.dynamic_output or self.wildcards_dict == other.wildcards_dict)

    def __lt__(self, other):
        return self.rule.__lt__(other.rule)

    def __gt__(self, other):
        return self.rule.__gt__(other.rule)

    def __hash__(self):
        return self._hash

    @staticmethod
    def expand_dynamic(pattern, restriction=None, omit_value=None):
        """ Expand dynamic files. """
        return list(
            listfiles(pattern, restriction=restriction, omit_value=omit_value))
Example #15
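A Job class with shadow-directory support (shallow or full symlinking of the working directory) and remote-file handling (download/upload decisions based on remote versus local modification times).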
class Job:
    HIGHEST_PRIORITY = sys.maxsize

    def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
        self.rule = rule
        self.dag = dag
        self.targetfile = targetfile

        self.wildcards_dict = self.rule.get_wildcards(targetfile)
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        (self.input, self.output, self.params, self.log, self.benchmark,
         self.ruleio,
         self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

        self.resources_dict = {}
        for name, res in rule.resources.items():
            if callable(res):
                res = res(self.wildcards)
                if not isinstance(res, int):
                    raise ValueError("Callable for resources must return int")
            self.resources_dict[name] = min(
                self.rule.workflow.global_resources.get(name, res), res)

        self.threads = self.resources_dict["_cores"]
        self.resources = Resources(fromdict=self.resources_dict)
        self.shadow_dir = None
        self._inputsize = None

        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        self._hash = self.rule.__hash__()
        if True or not self.dynamic_output:
            for o in self.output:
                self._hash ^= o.__hash__()

    @property
    def is_shadow(self):
        return self.rule.shadow_depth is not None

    @property
    def priority(self):
        return self.dag.priority(self)

    @property
    def b64id(self):
        return base64.b64encode((self.rule.name + "".join(self.output)).encode(
            "utf-8")).decode("utf-8")

    @property
    def inputsize(self):
        """
        Return the size of the input files.
        Input files need to be present.
        """
        if self._inputsize is None:
            self._inputsize = sum(f.size for f in self.input)
        return self._inputsize

    @property
    def message(self):
        """ Return the message for this job. """
        try:
            return (self.format_wildcards(self.rule.message) if
                    self.rule.message else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable in message "
                                "of shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def shellcmd(self):
        """ Return the shell command. """
        try:
            return (self.format_wildcards(self.rule.shellcmd) if
                    self.rule.shellcmd else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable when printing "
                                "shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def expanded_output(self):
        """ Iterate over output files while dynamic output is expanded. """
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                expansion = self.expand_dynamic(f_)
                if not expansion:
                    yield f_
                for f, _ in expansion:
                    file_to_yield = IOFile(f, self.rule)
                    file_to_yield.clone_flags(f_)
                    yield file_to_yield
            else:
                yield f

    def shadowed_path(self, f):
        """ Get the shadowed path of IOFile f. """
        if not self.shadow_dir:
            return f
        f_ = IOFile(os.path.join(self.shadow_dir, f), self.rule)
        f_.clone_flags(f)
        return f_

    @property
    def dynamic_wildcards(self):
        """ Return all wildcard values determined from dynamic output. """
        combinations = set()
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                for f, w in self.expand_dynamic(f_):
                    combinations.add(tuple(w.items()))
        wildcards = defaultdict(list)
        for combination in combinations:
            for name, value in combination:
                wildcards[name].append(value)
        return wildcards

    @property
    def missing_input(self):
        """ Return missing input files. """
        # omit file if it comes from a subworkflow
        return set(f
                   for f in self.input
                   if not f.exists and not f in self.subworkflow_input)

    @property
    def existing_remote_input(self):
        files = set()

        for f in self.input:
            if f.is_remote:
                if f.exists_remote:
                    files.add(f)
        return files

    @property
    def existing_remote_output(self):
        files = set()

        for f in self.remote_output:
            if f.exists_remote:
                files.add(f)
        return files

    @property
    def missing_remote_input(self):
        return self.remote_input - self.existing_remote_input

    @property
    def missing_remote_output(self):
        return self.remote_output - self.existing_remote_output

    @property
    def output_mintime(self):
        """ Return oldest output file. """
        existing = [f.mtime for f in self.expanded_output if f.exists]
        if self.benchmark and self.benchmark.exists:
            existing.append(self.benchmark.mtime)
        if existing:
            return min(existing)
        return None

    @property
    def output_mintime_local(self):
        existing = [f.mtime_local for f in self.expanded_output if f.exists]
        if self.benchmark and self.benchmark.exists:
            existing.append(self.benchmark.mtime_local)
        if existing:
            return min(existing)
        return None

    @property
    def input_maxtime(self):
        """ Return newest input file. """
        existing = [f.mtime for f in self.input if f.exists]
        if existing:
            return max(existing)
        return None

    def missing_output(self, requested=None):
        """ Return missing output files. """
        files = set()
        if self.benchmark and (requested is None or
                               self.benchmark in requested):
            if not self.benchmark.exists:
                files.add(self.benchmark)

        for f, f_ in zip(self.output, self.rule.output):
            if requested is None or f in requested:
                if f in self.dynamic_output:
                    if not self.expand_dynamic(f_):
                        files.add("{} (dynamic)".format(f_))
                elif not f.exists:
                    files.add(f)
        return files

    @property
    def local_input(self):
        for f in self.input:
            if not f.is_remote:
                yield f

    @property
    def local_output(self):
        for f in self.output:
            if not f.is_remote:
                yield f

    @property
    def remote_input(self):
        for f in self.input:
            if f.is_remote:
                yield f

    @property
    def remote_output(self):
        for f in self.output:
            if f.is_remote:
                yield f

    @property
    def remote_input_newer_than_local(self):
        files = set()
        for f in self.remote_input:
            if (f.exists_remote and f.exists_local) and (
                    f.mtime > f.mtime_local):
                files.add(f)
        return files

    @property
    def remote_input_older_than_local(self):
        files = set()
        for f in self.remote_input:
            if (f.exists_remote and f.exists_local) and (
                    f.mtime < f.mtime_local):
                files.add(f)
        return files

    @property
    def remote_output_newer_than_local(self):
        files = set()
        for f in self.remote_output:
            if (f.exists_remote and f.exists_local) and (
                    f.mtime > f.mtime_local):
                files.add(f)
        return files

    @property
    def remote_output_older_than_local(self):
        files = set()
        for f in self.remote_output:
            if (f.exists_remote and f.exists_local) and (
                    f.mtime < f.mtime_local):
                files.add(f)
        return files

    @property
    def files_to_download(self):
        toDownload = set()

        for f in self.input:
            if f.is_remote:
                if not f.exists_local and f.exists_remote:
                    toDownload.add(f)

        toDownload = toDownload | self.remote_input_newer_than_local
        return toDownload

    @property
    def files_to_upload(self):
        return self.missing_remote_input & self.remote_input_older_than_local

    @property
    def existing_output(self):
        return filter(lambda f: f.exists, self.expanded_output)

    def check_protected_output(self):
        protected = list(filter(lambda f: f.protected, self.expanded_output))
        if protected:
            raise ProtectedOutputException(self.rule, protected)

    def remove_existing_output(self):
        """Clean up both dynamic and regular output before rules actually run
        """
        if self.dynamic_output:
            for f, _ in chain(*map(self.expand_dynamic,
                                   self.rule.dynamic_output)):
                os.remove(f)

        for f, f_ in zip(self.output, self.rule.output):
            try:
                f.remove(remove_non_empty_dir=False)
            except FileNotFoundError:
                # No file == no problem
                pass

    def prepare(self):
        """
        Prepare execution of job.
        This includes creation of directories and deletion of previously
        created dynamic files.
        Creates a shadow directory for the job if specified.
        """

        self.check_protected_output()

        unexpected_output = self.dag.reason(self).missing_output.intersection(
            self.existing_output)
        if unexpected_output:
            logger.warning(
                "Warning: the following output files of rule {} were not "
                "present when the DAG was created:\n{}".format(
                    self.rule, unexpected_output))

        for f, f_ in zip(self.output, self.rule.output):
            f.prepare()

        for f in self.files_to_download:
            f.download_from_remote()

        for f in self.log:
            f.prepare()
        if self.benchmark:
            self.benchmark.prepare()

        self.remove_existing_output()

        if not self.is_shadow:
            return
        # Create shadow directory structure
        self.shadow_dir = tempfile.mkdtemp(
            dir=self.rule.workflow.persistence.shadow_path)
        cwd = os.getcwd()
        # Shallow simply symlink everything in the working directory.
        if self.rule.shadow_depth == "shallow":
            for source in os.listdir(cwd):
                link = os.path.join(self.shadow_dir, source)
                os.symlink(os.path.abspath(source), link)
        elif self.rule.shadow_depth == "full":
            snakemake_dir = os.path.join(cwd, ".snakemake")
            for dirpath, dirnames, filenames in os.walk(cwd):
                # Must exclude .snakemake and its children to avoid infinite
                # loop of symlinks.
                if os.path.commonprefix([snakemake_dir, dirpath
                                         ]) == snakemake_dir:
                    continue
                for dirname in dirnames:
                    if dirname == ".snakemake":
                        continue
                    relative_source = os.path.relpath(os.path.join(dirpath,
                                                                   dirname))
                    shadow = os.path.join(self.shadow_dir, relative_source)
                    os.mkdir(shadow)

                for filename in filenames:
                    source = os.path.join(dirpath, filename)
                    relative_source = os.path.relpath(source)
                    link = os.path.join(self.shadow_dir, relative_source)
                    os.symlink(source, link)

    def cleanup(self):
        """ Cleanup output files. """
        to_remove = [f for f in self.expanded_output if f.exists]

        to_remove.extend([f for f in self.remote_input if f.exists])
        to_remove.extend([f for f in self.remote_output if f.exists_local])
        if to_remove:
            logger.info("Removing output files of failed job {}"
                        " since they might be corrupted:\n{}".format(
                            self, ", ".join(to_remove)))
            for f in to_remove:
                f.remove()

            self.rmdir_empty_remote_dirs()

    @property
    def empty_remote_dirs(self):
        for f in (set(self.output) | set(self.input)):
            if f.is_remote:
                if os.path.exists(os.path.dirname(f)) and not len(os.listdir(
                        os.path.dirname(f))):
                    yield os.path.dirname(f)

    def rmdir_empty_remote_dirs(self):
        for d in self.empty_remote_dirs:
            try:
                os.removedirs(d)
            except:
                pass  # it's ok if we can't remove the leaf

    def format_wildcards(self, string, **variables):
        """ Format a string with variables from the job. """
        _variables = dict()
        _variables.update(self.rule.workflow.globals)
        _variables.update(dict(input=self.input,
                               output=self.output,
                               params=self.params,
                               wildcards=self._format_wildcards,
                               threads=self.threads,
                               resources=self.resources,
                               log=self.log,
                               version=self.rule.version,
                               rule=self.rule.name, ))
        _variables.update(variables)
        try:
            return format(string, **_variables)
        except NameError as ex:
            raise RuleException("NameError: " + str(ex), rule=self.rule)
        except IndexError as ex:
            raise RuleException("IndexError: " + str(ex), rule=self.rule)

    def properties(self,
                   omit_resources="_cores _nodes".split(),
                   **aux_properties):
        resources = {
            name: res
            for name, res in self.resources.items()
            if name not in omit_resources
        }
        params = {name: value for name, value in self.params.items()}
        properties = {
            "rule": self.rule.name,
            "local": self.dag.workflow.is_local(self.rule),
            "input": self.input,
            "output": self.output,
            "params": params,
            "threads": self.threads,
            "resources": resources,
        }
        properties.update(aux_properties)
        return properties

    def __repr__(self):
        return self.rule.name

    def __eq__(self, other):
        if other is None:
            return False
        return (self.rule == other.rule and
                (self.dynamic_output or
                 self.wildcards_dict == other.wildcards_dict) and
                (self.dynamic_input or self.input == other.input))

    def __lt__(self, other):
        return self.rule.__lt__(other.rule)

    def __gt__(self, other):
        return self.rule.__gt__(other.rule)

    def __hash__(self):
        return self._hash

    def expand_dynamic(self, pattern):
        """ Expand dynamic files. """
        return list(listfiles(pattern,
                              restriction=self.wildcards,
                              omit_value=DYNAMIC_FILL))
Example #16
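Another complete Job class, nearly identical to Example #14 apart from formatting; expand_dynamic is again a static method taking restriction and omit_value explicitly.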
class Job:
    HIGHEST_PRIORITY = sys.maxsize

    def __init__(self, rule, dag, targetfile=None, format_wildcards=None):
        self.rule = rule
        self.dag = dag
        self.targetfile = targetfile

        self.wildcards_dict = self.rule.get_wildcards(targetfile)
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        (self.input, self.output, self.params, self.log, self.benchmark,
         self.ruleio,
         self.dependencies) = rule.expand_wildcards(self.wildcards_dict)

        self.resources_dict = {
            name: min(self.rule.workflow.global_resources.get(name, res), res)
            for name, res in rule.resources.items()
        }
        self.threads = self.resources_dict["_cores"]
        self.resources = Resources(fromdict=self.resources_dict)
        self._inputsize = None

        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = self.ruleio[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        self._hash = self.rule.__hash__()
        if True or not self.dynamic_output:
            for o in self.output:
                self._hash ^= o.__hash__()

    @property
    def priority(self):
        return self.dag.priority(self)

    @property
    def b64id(self):
        return base64.b64encode((self.rule.name + "".join(self.output)
                                 ).encode("utf-8")).decode("utf-8")

    @property
    def inputsize(self):
        """
        Return the size of the input files.
        Input files need to be present.
        """
        if self._inputsize is None:
            self._inputsize = sum(f.size for f in self.input)
        return self._inputsize

    @property
    def message(self):
        """ Return the message for this job. """
        try:
            return (self.format_wildcards(self.rule.message) if
                    self.rule.message else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable in message "
                                "of shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def shellcmd(self):
        """ Return the shell command. """
        try:
            return (self.format_wildcards(self.rule.shellcmd) if
                    self.rule.shellcmd else None)
        except AttributeError as ex:
            raise RuleException(str(ex), rule=self.rule)
        except KeyError as ex:
            raise RuleException("Unknown variable when printing "
                                "shell command: {}".format(str(ex)),
                                rule=self.rule)

    @property
    def expanded_output(self):
        """ Iterate over output files while dynamic output is expanded. """
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                expansion = self.expand_dynamic(
                    f_,
                    restriction=self.wildcards,
                    omit_value=_IOFile.dynamic_fill)
                if not expansion:
                    yield f_
                for f, _ in expansion:
                    yield IOFile(f, self.rule)
            else:
                yield f

    @property
    def dynamic_wildcards(self):
        """ Return all wildcard values determined from dynamic output. """
        combinations = set()
        for f, f_ in zip(self.output, self.rule.output):
            if f in self.dynamic_output:
                for f, w in self.expand_dynamic(
                    f_,
                    restriction=self.wildcards,
                    omit_value=_IOFile.dynamic_fill):
                    combinations.add(tuple(w.items()))
        wildcards = defaultdict(list)
        for combination in combinations:
            for name, value in combination:
                wildcards[name].append(value)
        return wildcards

    @property
    def missing_input(self):
        """ Return missing input files. """
        # omit file if it comes from a subworkflow
        return set(f for f in self.input
                   if not f.exists and not f in self.subworkflow_input)

    @property
    def output_mintime(self):
        """ Return oldest output file. """
        existing = [f.mtime for f in self.expanded_output if f.exists]
        if self.benchmark and self.benchmark.exists:
            existing.append(self.benchmark.mtime)
        if existing:
            return min(existing)
        return None

    @property
    def input_maxtime(self):
        """ Return newest input file. """
        existing = [f.mtime for f in self.input if f.exists]
        if existing:
            return max(existing)
        return None

    def missing_output(self, requested=None):
        """ Return missing output files. """
        files = set()
        if self.benchmark and (requested is None or
                               self.benchmark in requested):
            if not self.benchmark.exists:
                files.add(self.benchmark)

        for f, f_ in zip(self.output, self.rule.output):
            if requested is None or f in requested:
                if f in self.dynamic_output:
                    if not self.expand_dynamic(
                        f_,
                        restriction=self.wildcards,
                        omit_value=_IOFile.dynamic_fill):
                        files.add("{} (dynamic)".format(f_))
                elif not f.exists:
                    files.add(f)
        return files

    @property
    def existing_output(self):
        return filter(lambda f: f.exists, self.expanded_output)

    def check_protected_output(self):
        protected = list(filter(lambda f: f.protected, self.expanded_output))
        if protected:
            raise ProtectedOutputException(self.rule, protected)

    def prepare(self):
        """
        Prepare execution of job.
        This includes creation of directories and deletion of previously
        created dynamic files.
        """

        self.check_protected_output()

        unexpected_output = self.dag.reason(self).missing_output.intersection(
            self.existing_output)
        if unexpected_output:
            logger.warning(
                "Warning: the following output files of rule {} were not "
                "present when the DAG was created:\n{}".format(
                    self.rule, unexpected_output))

        if self.dynamic_output:
            for f, _ in chain(*map(partial(self.expand_dynamic,
                                           restriction=self.wildcards,
                                           omit_value=_IOFile.dynamic_fill),
                                   self.rule.dynamic_output)):
                os.remove(f)
        for f, f_ in zip(self.output, self.rule.output):
            f.prepare()
        for f in self.log:
            f.prepare()
        if self.benchmark:
            self.benchmark.prepare()

    def cleanup(self):
        """ Cleanup output files. """
        to_remove = [f for f in self.expanded_output if f.exists]
        if to_remove:
            logger.info("Removing output files of failed job {}"
                        " since they might be corrupted:\n{}".format(
                            self, ", ".join(to_remove)))
            for f in to_remove:
                f.remove()

    def format_wildcards(self, string, **variables):
        """ Format a string with variables from the job. """
        _variables = dict()
        _variables.update(self.rule.workflow.globals)
        _variables.update(dict(input=self.input,
                               output=self.output,
                               params=self.params,
                               wildcards=self._format_wildcards,
                               threads=self.threads,
                               resources=self.resources,
                               log=self.log,
                               version=self.rule.version,
                               rule=self.rule.name, ))
        _variables.update(variables)
        try:
            return format(string, **_variables)
        except NameError as ex:
            raise RuleException("NameError: " + str(ex), rule=self.rule)
        except IndexError as ex:
            raise RuleException("IndexError: " + str(ex), rule=self.rule)

    def properties(self, omit_resources="_cores _nodes".split()):
        resources = {
            name: res
            for name, res in self.resources.items()
            if name not in omit_resources
        }
        params = {name: value for name, value in self.params.items()}
        properties = {
            "rule": self.rule.name,
            "local": self.dag.workflow.is_local(self.rule),
            "input": self.input,
            "output": self.output,
            "params": params,
            "threads": self.threads,
            "resources": resources
        }
        return properties

    def json(self):
        return json.dumps(self.properties())

    def __repr__(self):
        return self.rule.name

    def __eq__(self, other):
        if other is None:
            return False
        return self.rule == other.rule and (
            self.dynamic_output or self.wildcards_dict == other.wildcards_dict)

    def __lt__(self, other):
        return self.rule.__lt__(other.rule)

    def __gt__(self, other):
        return self.rule.__gt__(other.rule)

    def __hash__(self):
        return self._hash

    @staticmethod
    def expand_dynamic(pattern, restriction=None, omit_value=None):
        """ Expand dynamic files. """
        return list(listfiles(pattern,
                              restriction=restriction,
                              omit_value=omit_value))