Example #1
class DAG:
    """Directed acyclic graph of jobs."""
    def __init__(self,
                 workflow,
                 rules=None,
                 dryrun=False,
                 targetfiles=None,
                 targetrules=None,
                 forceall=False,
                 forcerules=None,
                 forcefiles=None,
                 priorityfiles=None,
                 priorityrules=None,
                 untilfiles=None,
                 untilrules=None,
                 omitfiles=None,
                 omitrules=None,
                 ignore_ambiguity=False,
                 force_incomplete=False,
                 ignore_incomplete=False,
                 notemp=False,
                 keep_remote_local=False):

        self.dryrun = dryrun
        self.dependencies = defaultdict(partial(defaultdict, set))
        self.depending = defaultdict(partial(defaultdict, set))
        self._needrun = set()
        self._priority = dict()
        self._downstream_size = dict()
        self._temp_input_count = dict()
        self._reason = defaultdict(Reason)
        self._finished = set()
        self._dynamic = set()
        self._len = 0
        self.workflow = workflow
        self.rules = set(rules)
        self.ignore_ambiguity = ignore_ambiguity
        self.targetfiles = targetfiles
        self.targetrules = targetrules
        self.priorityfiles = priorityfiles
        self.priorityrules = priorityrules
        self.targetjobs = set()
        self.prioritytargetjobs = set()
        self._ready_jobs = set()
        self.notemp = notemp
        self.keep_remote_local = keep_remote_local
        self._jobid = dict()

        self.forcerules = set()
        self.forcefiles = set()
        self.untilrules = set()
        self.untilfiles = set()
        self.omitrules = set()
        self.omitfiles = set()
        self.updated_subworkflow_files = set()
        if forceall:
            self.forcerules.update(self.rules)
        elif forcerules:
            self.forcerules.update(forcerules)
        if forcefiles:
            self.forcefiles.update(forcefiles)
        if untilrules:  # keep only the rule names
            self.untilrules.update(set(rule.name for rule in untilrules))
        if untilfiles:
            self.untilfiles.update(untilfiles)
        if omitrules:
            self.omitrules.update(set(rule.name for rule in omitrules))
        if omitfiles:
            self.omitfiles.update(omitfiles)

        self.omitforce = set()

        self.force_incomplete = force_incomplete
        self.ignore_incomplete = ignore_incomplete

        self.periodic_wildcard_detector = PeriodicityDetector()

        self.update_output_index()

    def init(self):
        """ Initialise the DAG. """
        for job in map(self.rule2job, self.targetrules):
            job = self.update([job])
            self.targetjobs.add(job)

        for file in self.targetfiles:
            job = self.update(self.file2jobs(file), file=file)
            self.targetjobs.add(job)

        self.update_needrun()
        self.set_until_jobs()
        self.delete_omitfrom_jobs()
        # check if remaining jobs are valid
        for job in self.jobs:
            job.is_valid()

    def update_output_index(self):
        """Update the OutputIndex."""
        self.output_index = OutputIndex(self.rules)

    def check_incomplete(self):
        """Check if any output files are incomplete. This is done by looking up
        markers in the persistence module."""
        if not self.ignore_incomplete:
            incomplete = self.incomplete_files
            if incomplete:
                if self.force_incomplete:
                    logger.debug("Forcing incomplete files:")
                    logger.debug("\t" + "\n\t".join(incomplete))
                    self.forcefiles.update(incomplete)
                else:
                    raise IncompleteFilesException(incomplete)

    def check_dynamic(self):
        """Check dynamic output and update downstream rules if necessary."""
        for job in filter(
                lambda job: (job.dynamic_output and not self.needrun(job)),
                self.jobs):
            self.update_dynamic(job)

    @property
    def dynamic_output_jobs(self):
        """Iterate over all jobs with dynamic output files."""
        return (job for job in self.jobs if job.dynamic_output)

    @property
    def jobs(self):
        """ All jobs in the DAG. """
        for job in self.bfs(self.dependencies, *self.targetjobs):
            yield job

    @property
    def needrun_jobs(self):
        """ Jobs that need to be executed. """
        for job in filter(
                self.needrun,
                self.bfs(self.dependencies,
                         *self.targetjobs,
                         stop=self.noneedrun_finished)):
            yield job

    @property
    def local_needrun_jobs(self):
        """Iterate over all jobs that need to be run and are marked as local."""
        return filter(lambda job: self.workflow.is_local(job.rule),
                      self.needrun_jobs)

    @property
    def finished_jobs(self):
        """ Iterate over all jobs that have been finished."""
        for job in filter(self.finished,
                          self.bfs(self.dependencies, *self.targetjobs)):
            yield job

    @property
    def ready_jobs(self):
        """Jobs that are ready to execute."""
        return self._ready_jobs

    def ready(self, job):
        """Return whether a given job is ready to execute."""
        return job in self._ready_jobs

    def needrun(self, job):
        """Return whether a given job needs to be executed."""
        return job in self._needrun

    def priority(self, job):
        """Return priority of given job."""
        return self._priority[job]

    def downstream_size(self, job):
        """Return the number of downstream jobs of a given job."""
        return self._downstream_size[job]

    def temp_input_count(self, job):
        """Return number of temporary input files of given job."""
        return self._temp_input_count[job]

    def noneedrun_finished(self, job):
        """
        Return whether a given job is finished or was not
        required to run at all.
        """
        return not self.needrun(job) or self.finished(job)

    def reason(self, job):
        """ Return the reason of the job execution. """
        return self._reason[job]

    def finished(self, job):
        """ Return whether a job is finished. """
        return job in self._finished

    def dynamic(self, job):
        """
        Return whether a job is dynamic (i.e. it is only a placeholder
        for those that are created after the job with dynamic output has
        finished).
        """
        return job in self._dynamic

    def requested_files(self, job):
        """Return the files a job requests."""
        return set(chain(*self.depending[job].values()))

    @property
    def incomplete_files(self):
        """Return list of incomplete files."""
        return list(
            chain(*(job.output
                    for job in filter(self.workflow.persistence.incomplete,
                                      filterfalse(self.needrun, self.jobs)))))

    @property
    def newversion_files(self):
        """Return list of files where the current version is newer than the
        recorded version.
        """
        return list(
            chain(*(job.output for job in filter(
                self.workflow.persistence.newversion, self.jobs))))

    def missing_temp(self, job):
        """
        Return whether a temp file that is input of the given job is missing.
        """
        for job_, files in self.depending[job].items():
            if self.needrun(job_) and any(not f.exists for f in files):
                return True
        return False

    def check_and_touch_output(self, job, wait=3):
        """ Raise exception if output files of job are missing. """
        expanded_output = [
            job.shadowed_path(path) for path in job.expanded_output
        ]
        try:
            wait_for_files(expanded_output, latency_wait=wait)
        except IOError as e:
            raise MissingOutputException(str(e), rule=job.rule)

        # It is possible, due to archive expansion or cluster clock skew, that
        # the files appear older than the input.  But we know they must be new,
        # so touch them to update timestamps.
        # Note that if the input files somehow have a future date then this will
        # not currently be spotted and the job will always be re-run.
        # Also, don't touch directories, as we can't guarantee they were removed.
        for f in expanded_output:
            if not os.path.isdir(f):
                f.touch()

    def unshadow_output(self, job):
        """ Move files from shadow directory to real output paths. """
        if not job.shadow_dir or not job.expanded_output:
            return
        for real_output in chain(job.expanded_output, job.log):
            shadow_output = job.shadowed_path(real_output).file
            # Remake absolute symlinks as relative
            if os.path.islink(shadow_output):
                dest = os.readlink(shadow_output)
                if os.path.isabs(dest):
                    rel_dest = os.path.relpath(dest, job.shadow_dir)
                    os.remove(shadow_output)
                    os.symlink(rel_dest, shadow_output)

            if os.path.realpath(shadow_output) == os.path.realpath(
                    real_output):
                continue
            logger.info("Moving shadow output {} to destination {}".format(
                shadow_output, real_output))
            shutil.move(shadow_output, real_output)
        shutil.rmtree(job.shadow_dir)

    def check_periodic_wildcards(self, job):
        """ Raise an exception if a wildcard of the given job appears to be periodic,
        indicating a cyclic dependency. """
        for wildcard, value in job.wildcards_dict.items():
            periodic_substring = self.periodic_wildcard_detector.is_periodic(
                value)
            if periodic_substring is not None:
                raise PeriodicWildcardError(
                    "The value {} in wildcard {} is periodically repeated ({}). "
                    "This would lead to an infinite recursion. "
                    "To avoid this, e.g. restrict the wildcards in this rule to certain values."
                    .format(periodic_substring, wildcard, value),
                    rule=job.rule)

    def handle_protected(self, job):
        """ Write-protect output files that are marked with protected(). """
        for f in job.expanded_output:
            if f in job.protected_output:
                logger.info("Write-protecting output file {}.".format(f))
                f.protect()

    def handle_touch(self, job):
        """ Touches those output files that are marked for touching. """
        for f in job.expanded_output:
            if f in job.touch_output:
                f = job.shadowed_path(f)
                logger.info("Touching output file {}.".format(f))
                f.touch_or_create()

    def temp_input(self, job):
        """Yield input files of the given job that are temp output of its dependencies."""
        for job_, files in self.dependencies[job].items():
            for f in filter(job_.temp_output.__contains__, files):
                yield f

    def handle_temp(self, job):
        """ Remove temp files if they are no longer needed. """
        if self.notemp:
            return

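        # A temp file is still "needed" if some other unfinished job that must
        # run depends on it; only files no longer needed are removed below.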
        needed = lambda job_, f: any(
            f in files for j, files in self.depending[job_].items()
            if not self.finished(j) and self.needrun(j) and j != job)

        def unneeded_files():
            for job_, files in self.dependencies[job].items():
                yield from filterfalse(partial(needed, job_),
                                       job_.temp_output & files)
            if job not in self.targetjobs:
                yield from filterfalse(partial(needed, job), job.temp_output)

        for f in unneeded_files():
            logger.info("Removing temporary output file {}.".format(f))
            f.remove(remove_non_empty_dir=True)

    def handle_remote(self, job, upload=True):
        """ Remove local files if they are no longer needed, and upload to S3. """
        if upload:
            # handle output files
            for f in job.expanded_output:
                if f.is_remote:
                    f.upload_to_remote()
                    remote_mtime = f.mtime
                    # immediately force local mtime to match remote,
                    # since conversions from S3 headers are not 100% reliable
                    # without this, newness comparisons may fail down the line
                    f.touch(times=(remote_mtime, remote_mtime))

                    if not f.exists_remote:
                        raise RemoteFileException(
                            "The file upload was attempted, but it does not "
                            "exist on remote. Check that your credentials have "
                            "read AND write permissions.")

        if not self.keep_remote_local:
            # handle input files
            needed = lambda job_, f: any(
                f in files for j, files in self.depending[job_].items()
                if not self.finished(j) and self.needrun(j) and j != job)

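            # Local copies are candidates for removal when they are remote files
            # that no unfinished job still needs: inputs produced upstream, this
            # job's outputs that nothing downstream (and no target) requests, and
            # inputs that were never generated locally by a dependency.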
            def unneeded_files():
                putative = lambda f: f.is_remote and not f.protected and not f.should_keep_local
                generated_input = set()
                for job_, files in self.dependencies[job].items():
                    generated_input |= files
                    for f in filter(putative, files):
                        if not needed(job_, f):
                            yield f
                for f in filter(putative, job.output):
                    if not needed(job, f) and not f in self.targetfiles:
                        for f_ in job.expand_dynamic(f):
                            yield f_
                for f in filter(putative, job.input):
                    # TODO what about remote inputs that are used by multiple jobs?
                    if f not in generated_input:
                        yield f

            for f in unneeded_files():
                logger.info("Removing local output file: {}".format(f))
                f.remove()

            job.rmdir_empty_remote_dirs()

    def jobid(self, job):
        """Return job id of given job."""
        if job not in self._jobid:
            self._jobid[job] = len(self._jobid)
        return self._jobid[job]

    def update(self, jobs, file=None, visited=None, skip_until_dynamic=False):
        """ Update the DAG by adding given jobs and their dependencies. """
        if visited is None:
            visited = set()
        producer = None
        exceptions = list()
        jobs = sorted(jobs, reverse=not self.ignore_ambiguity)
        cycles = list()

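        # Candidate producers are visited in descending priority (see the sort
        # above); the first job that builds successfully becomes the producer.
        # A later candidate that does not compare strictly lower triggers an
        # ambiguity error unless ambiguity is ignored.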
        for job in jobs:
            if file in job.input:
                cycles.append(job)
                continue
            if job in visited:
                cycles.append(job)
                continue
            try:
                self.check_periodic_wildcards(job)
                self.update_(job,
                             visited=set(visited),
                             skip_until_dynamic=skip_until_dynamic)
                # TODO this might fail if a rule discarded here is needed
                # elsewhere
                if producer:
                    if job < producer or self.ignore_ambiguity:
                        break
                    elif producer is not None:
                        raise AmbiguousRuleException(file, job, producer)
                producer = job
            except (MissingInputException, CyclicGraphException,
                    PeriodicWildcardError) as ex:
                exceptions.append(ex)
            except RecursionError as e:
                raise WorkflowError(
                    e, "If building the DAG exceeds the recursion limit, "
                    "this is likely due to a cyclic dependency. "
                    "E.g. you might have a sequence of rules that "
                    "can generate their own input. Try to make "
                    "the output files more specific. "
                    "A common pattern is to have different prefixes "
                    "in the output files of different rules." +
                    ("\nProblematic file pattern: {}".format(file)
                     if file else ""))
        if producer is None:
            if cycles:
                job = cycles[0]
                raise CyclicGraphException(job.rule, file, rule=job.rule)
            if exceptions:
                raise exceptions[0]
        return producer

    def update_(self, job, visited=None, skip_until_dynamic=False):
        """ Update the DAG by adding the given job and its dependencies. """
        if job in self.dependencies:
            return
        if visited is None:
            visited = set()
        visited.add(job)
        dependencies = self.dependencies[job]
        potential_dependencies = self.collect_potential_dependencies(
            job).items()

        skip_until_dynamic = skip_until_dynamic and not job.dynamic_output

        missing_input = job.missing_input
        producer = dict()
        exceptions = dict()
        for file, jobs in potential_dependencies:
            try:
                producer[file] = self.update(
                    jobs,
                    file=file,
                    visited=visited,
                    skip_until_dynamic=skip_until_dynamic
                    or file in job.dynamic_input)
            except (MissingInputException, CyclicGraphException,
                    PeriodicWildcardError) as ex:
                if file in missing_input:
                    self.delete_job(job,
                                    recursive=False)  # delete job from tree
                    raise ex

        for file, job_ in producer.items():
            dependencies[job_].add(file)
            self.depending[job_][job].add(file)

        missing_input -= producer.keys()
        if missing_input:
            self.delete_job(job, recursive=False)  # delete job from tree
            raise MissingInputException(job.rule, missing_input)

        if skip_until_dynamic:
            self._dynamic.add(job)

    def update_needrun(self):
        """ Update the information whether a job needs to be executed. """
        def output_mintime(job):
            for job_ in self.bfs(self.depending, job):
                t = job_.output_mintime
                if t:
                    return t

        def needrun(job):
            reason = self.reason(job)
            noinitreason = not reason
            updated_subworkflow_input = self.updated_subworkflow_files.intersection(
                job.input)
            if (job not in self.omitforce and job.rule in self.forcerules
                    or not self.forcefiles.isdisjoint(job.output)):
                reason.forced = True
            elif updated_subworkflow_input:
                reason.updated_input.update(updated_subworkflow_input)
            elif job in self.targetjobs:
                # TODO find a way to handle added/removed input files here?
                if not job.output and not job.benchmark:
                    if job.input:
                        if job.rule.norun:
                            reason.updated_input_run.update(
                                [f for f in job.input if not f.exists])
                        else:
                            reason.nooutput = True
                    else:
                        reason.noio = True
                else:
                    if job.rule in self.targetrules:
                        missing_output = job.missing_output()
                    else:
                        missing_output = job.missing_output(
                            requested=set(chain(*self.depending[job].values()))
                            | self.targetfiles)
                    reason.missing_output.update(missing_output)
            if not reason:
                output_mintime_ = output_mintime(job)
                if output_mintime_:
                    updated_input = [
                        f for f in job.input
                        if f.exists and f.is_newer(output_mintime_)
                    ]
                    reason.updated_input.update(updated_input)
            if noinitreason and reason:
                reason.derived = False
            return job

        reason = self.reason
        _needrun = self._needrun
        dependencies = self.dependencies
        depending = self.depending

        _needrun.clear()
        candidates = set(self.jobs)

        queue = list(filter(reason, map(needrun, candidates)))
        visited = set(queue)
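        # Propagate the need-to-run flag: dependencies must run if the output
        # requested from them is missing, and downstream candidates must run
        # because their input will be updated by this job.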
        while queue:
            job = queue.pop(0)
            _needrun.add(job)

            for job_, files in dependencies[job].items():
                missing_output = job_.missing_output(requested=files)
                reason(job_).missing_output.update(missing_output)
                if missing_output and not job_ in visited:
                    visited.add(job_)
                    queue.append(job_)

            for job_, files in depending[job].items():
                if job_ in candidates:
                    reason(job_).updated_input_run.update(files)
                    if not job_ in visited:
                        visited.add(job_)
                        queue.append(job_)

        self._len = len(_needrun)

    def in_until(self, job):
        """Return whether given job has been specified via --until."""
        return (job.rule.name in self.untilrules
                or not self.untilfiles.isdisjoint(job.output))

    def in_omitfrom(self, job):
        """Return whether given job has been specified via --omit-from."""
        return (job.rule.name in self.omitrules
                or not self.omitfiles.isdisjoint(job.output))

    def until_jobs(self):
        """Returns a generator of jobs specified by untiljobs."""
        return (job for job in self.jobs if self.in_until(job))

    def omitfrom_jobs(self):
        """Returns a generator of jobs specified by omitfromjobs."""
        return (job for job in self.jobs if self.in_omitfrom(job))

    def downstream_of_omitfrom(self):
        """Returns the downstream of --omit-from rules or files."""
        return filter(lambda job: not self.in_omitfrom(job),
                      self.bfs(self.depending, *self.omitfrom_jobs()))

    def delete_omitfrom_jobs(self):
        """Removes jobs downstream of jobs specified by --omit-from."""
        if not self.omitrules and not self.omitfiles:
            return
        downstream_jobs = list(self.downstream_of_omitfrom()
                               )  # need to cast as list before deleting jobs
        for job in downstream_jobs:
            self.delete_job(job, recursive=False, add_dependencies=True)

    def set_until_jobs(self):
        """Removes jobs downstream of jobs specified by --omit-from."""
        if not self.untilrules and not self.untilfiles:
            return
        self.targetjobs = set(self.until_jobs())

    def update_priority(self):
        """ Update job priorities. """
        prioritized = (lambda job: job.rule in self.priorityrules or
                       not self.priorityfiles.isdisjoint(job.output))
        for job in self.needrun_jobs:
            self._priority[job] = job.rule.priority
        for job in self.bfs(self.dependencies,
                            *filter(prioritized, self.needrun_jobs),
                            stop=self.noneedrun_finished):
            self._priority[job] = Job.HIGHEST_PRIORITY

    def update_ready(self):
        """ Update information whether a job is ready to execute. """
        for job in filter(self.needrun, self.jobs):
            if not self.finished(job) and self._ready(job):
                self._ready_jobs.add(job)

    def update_downstream_size(self):
        """For each job, update number of downstream jobs."""
        for job in self.needrun_jobs:
            self._downstream_size[job] = sum(1 for _ in self.bfs(
                self.depending, job, stop=self.noneedrun_finished)) - 1

    def update_temp_input_count(self):
        """For each job update the number of temporary input files."""
        for job in self.needrun_jobs:
            self._temp_input_count[job] = sum(1 for _ in self.temp_input(job))

    def close_remote_objects(self):
        """Close all remote objects."""
        for job in self.jobs:
            if not self.needrun(job):
                job.close_remote()

    def postprocess(self):
        """Postprocess the DAG. This has to be invoked after any change to the
        DAG topology."""
        self.update_needrun()
        self.update_priority()
        self.update_ready()
        self.update_downstream_size()
        self.update_temp_input_count()
        self.close_remote_objects()

    def _ready(self, job):
        """Return whether the given job is ready to execute."""
        return self._finished.issuperset(
            filter(self.needrun, self.dependencies[job]))

    def finish(self, job, update_dynamic=True):
        """Finish a given job (e.g. remove from ready jobs, mark depending jobs
        as ready)."""
        job.close_remote()

        self._finished.add(job)
        try:
            self._ready_jobs.remove(job)
        except KeyError:
            pass
        # mark depending jobs as ready
        for job_ in self.depending[job]:
            if self.needrun(job_) and self._ready(job_):
                self._ready_jobs.add(job_)

        if update_dynamic and job.dynamic_output:
            logger.info("Dynamically updating jobs")
            newjob = self.update_dynamic(job)
            if newjob:
                # simulate that this job ran and was finished before
                self.omitforce.add(newjob)
                self._needrun.add(newjob)
                self._finished.add(newjob)

                self.postprocess()
                self.handle_protected(newjob)
                self.handle_touch(newjob)
                # add finished jobs to len as they are not counted after new postprocess
                self._len += len(self._finished)

    def update_dynamic(self, job):
        """Update the DAG by evaluating the output of the given job that
        contains dynamic output files."""
        dynamic_wildcards = job.dynamic_wildcards
        if not dynamic_wildcards:
            # this happens e.g. in dryrun if output is not yet present
            return

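        # Collect unfinished downstream jobs before rewriting the DAG, then
        # replace this job (and any consumers of its dynamic output) with jobs
        # built from rules specialized to the now-known dynamic wildcards.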
        depending = list(
            filter(lambda job_: not self.finished(job_),
                   self.bfs(self.depending, job)))
        newrule, non_dynamic_wildcards = job.rule.dynamic_branch(
            dynamic_wildcards, input=False)
        self.specialize_rule(job.rule, newrule)

        # no targetfile needed for job
        newjob = Job(newrule, self, format_wildcards=non_dynamic_wildcards)
        self.replace_job(job, newjob)
        for job_ in depending:
            if job_.dynamic_input:
                newrule_ = job_.rule.dynamic_branch(dynamic_wildcards)
                if newrule_ is not None:
                    self.specialize_rule(job_.rule, newrule_)
                    if not self.dynamic(job_):
                        logger.debug("Updating job {}.".format(job_))
                        newjob_ = Job(newrule_,
                                      self,
                                      targetfile=job_.targetfile)

                        unexpected_output = self.reason(
                            job_).missing_output.intersection(
                                newjob.existing_output)
                        if unexpected_output:
                            logger.warning(
                                "Warning: the following output files of rule {} were not "
                                "present when the DAG was created:\n{}".format(
                                    newjob_.rule, unexpected_output))

                        self.replace_job(job_, newjob_)
        return newjob

    def delete_job(self, job, recursive=True, add_dependencies=False):
        """Delete given job from DAG."""
        if job in self.targetjobs:
            self.targetjobs.remove(job)
        if add_dependencies:
            for _job in self.dependencies[job]:
                self.targetjobs.add(_job)
        for job_ in self.depending[job]:
            del self.dependencies[job_][job]
        del self.depending[job]
        for job_ in self.dependencies[job]:
            depending = self.depending[job_]
            del depending[job]
            if not depending and recursive:
                self.delete_job(job_)
        del self.dependencies[job]
        if job in self._needrun:
            self._len -= 1
            self._needrun.remove(job)
            del self._reason[job]
        if job in self._finished:
            self._finished.remove(job)
        if job in self._dynamic:
            self._dynamic.remove(job)
        if job in self._ready_jobs:
            self._ready_jobs.remove(job)

    def replace_job(self, job, newjob):
        """Replace given job with new job."""
        if job in self.targetjobs:
            self.targetjobs.remove(job)
            self.targetjobs.add(newjob)
        depending = list(self.depending[job].items())
        if self.finished(job):
            self._finished.add(newjob)

        self.delete_job(job)
        self.update([newjob])

        for job_, files in depending:
            if not job_.dynamic_input:
                self.dependencies[job_][newjob].update(files)
                self.depending[newjob][job_].update(files)

    def specialize_rule(self, rule, newrule):
        """Specialize the given rule by inserting newrule into the DAG."""
        assert newrule is not None
        self.rules.add(newrule)
        self.update_output_index()

    def collect_potential_dependencies(self, job):
        """Collect all potential dependencies of a job. These might contain
        ambiguities."""
        dependencies = defaultdict(list)
        # use a set to circumvent multiple jobs for the same file
        # if user specified it twice
        file2jobs = self.file2jobs
        for file in set(job.input):
            # omit the file if it comes from a subworkflow
            if file in job.subworkflow_input:
                continue
            try:
                if file in job.dependencies:
                    jobs = [Job(job.dependencies[file], self, targetfile=file)]
                else:
                    jobs = file2jobs(file)
                dependencies[file].extend(jobs)
            except MissingRuleException as ex:
                pass
        return dependencies

    def bfs(self, direction, *jobs, stop=lambda job: False):
        """Perform a breadth-first traversal of the DAG."""
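        # `direction` is either self.dependencies (walk upstream towards inputs)
        # or self.depending (walk downstream towards consumers).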
        queue = list(jobs)
        visited = set(queue)
        while queue:
            job = queue.pop(0)
            if stop(job):
                # stop criterion reached for this node
                continue
            yield job
            for job_, _ in direction[job].items():
                if not job_ in visited:
                    queue.append(job_)
                    visited.add(job_)

    def level_bfs(self, direction, *jobs, stop=lambda job: False):
        """Perform a breadth-first traversal of the DAG, but also yield the
        level together with each job."""
        queue = [(job, 0) for job in jobs]
        visited = set(jobs)
        while queue:
            job, level = queue.pop(0)
            if stop(job):
                # stop criterion reached for this node
                continue
            yield level, job
            level += 1
            for job_, _ in direction[job].items():
                if not job_ in visited:
                    queue.append((job_, level))
                    visited.add(job_)

    def dfs(self, direction, *jobs, stop=lambda job: False, post=True):
        """Perform depth-first traversal of the DAG."""
        visited = set()

        def _dfs(job):
            """Inner function for DFS traversal."""
            if stop(job):
                return
            if not post:
                yield job
            for job_ in direction[job]:
                if not job_ in visited:
                    visited.add(job_)
                    for j in _dfs(job_):
                        yield j
            if post:
                yield job

        for job in jobs:
            for job_ in _dfs(job):
                yield job_

    def new_wildcards(self, job):
        """Return wildcards that are newly introduced in this job,
        compared to its ancestors."""
        new_wildcards = set(job.wildcards.items())
        for job_ in self.dependencies[job]:
            if not new_wildcards:
                return set()
            for wildcard in job_.wildcards.items():
                new_wildcards.discard(wildcard)
        return new_wildcards

    def rule2job(self, targetrule):
        """Generate a new job from a given rule."""
        if targetrule.has_wildcards():
            raise WorkflowError(
                "Target rules may not contain wildcards. Please specify concrete files or a rule without wildcards."
            )
        return Job(targetrule, self)

    def file2jobs(self, targetfile):
        """Return all jobs that can produce the given target file."""
        rules = self.output_index.match(targetfile)
        jobs = []
        exceptions = list()
        for rule in rules:
            if rule.is_producer(targetfile):
                try:
                    jobs.append(Job(rule, self, targetfile=targetfile))
                except InputFunctionException as e:
                    exceptions.append(e)
        if not jobs:
            if exceptions:
                raise exceptions[0]
            raise MissingRuleException(targetfile)
        return jobs

    def rule_dot2(self):
        dag = defaultdict(list)
        visited = set()
        preselect = set()

        def preselect_parents(job):
            for parent in self.depending[job]:
                if parent in preselect:
                    continue
                preselect.add(parent)
                preselect_parents(parent)

        def build_ruledag(job, key=lambda job: job.rule.name):
            if job in visited:
                return
            visited.add(job)
            deps = sorted(self.dependencies[job], key=key)
            deps = [(group[0] if preselect.isdisjoint(group) else
                     preselect.intersection(group).pop())
                    for group in (list(g) for _, g in groupby(deps, key))]
            dag[job].extend(deps)
            preselect_parents(job)
            for dep in deps:
                build_ruledag(dep)

        for job in self.targetjobs:
            build_ruledag(job)

        return self._dot(dag.keys(),
                         print_wildcards=False,
                         print_types=False,
                         dag=dag)

    def rule_dot(self):
        graph = defaultdict(set)
        for job in self.jobs:
            graph[job.rule].update(dep.rule for dep in self.dependencies[job])
        return self._dot(graph)

    def dot(self):
        def node2style(job):
            if not self.needrun(job):
                return "rounded,dashed"
            if self.dynamic(job) or job.dynamic_input:
                return "rounded,dotted"
            return "rounded"

        def format_wildcard(wildcard):
            name, value = wildcard
            if DYNAMIC_FILL in value:
                value = "..."
            return "{}: {}".format(name, value)

        node2rule = lambda job: job.rule
        node2label = lambda job: "\\n".join(
            chain([job.rule.name],
                  sorted(map(format_wildcard, self.new_wildcards(job)))))

        dag = {job: self.dependencies[job] for job in self.jobs}

        return self._dot(dag,
                         node2rule=node2rule,
                         node2style=node2style,
                         node2label=node2label)

    def _dot(self,
             graph,
             node2rule=lambda node: node,
             node2style=lambda node: "rounded",
             node2label=lambda node: node):

        # color rules
        huefactor = 2 / (3 * len(self.rules))
        rulecolor = {
            rule: "{:.2f} 0.6 0.85".format(i * huefactor)
            for i, rule in enumerate(self.rules)
        }

        # markup
        node_markup = '\t{}[label = "{}", color = "{}", style="{}"];'.format
        edge_markup = "\t{} -> {}".format

        # node ids
        ids = {node: i for i, node in enumerate(graph)}

        # calculate nodes
        nodes = [
            node_markup(ids[node],
                        node2label(node), rulecolor[node2rule(node)],
                        node2style(node)) for node in graph
        ]
        # calculate edges
        edges = [
            edge_markup(ids[dep], ids[node]) for node, deps in graph.items()
            for dep in deps
        ]

        return textwrap.dedent("""\
            digraph snakemake_dag {{
                graph[bgcolor=white, margin=0];
                node[shape=box, style=rounded, fontname=sans, \
                fontsize=10, penwidth=2];
                edge[penwidth=2, color=grey];
            {items}
            }}\
            """).format(items="\n".join(nodes + edges))

    def summary(self, detailed=False):
        if detailed:
            yield "output_file\tdate\trule\tversion\tlog-file(s)\tinput-file(s)\tshellcmd\tstatus\tplan"
        else:
            yield "output_file\tdate\trule\tversion\tlog-file(s)\tstatus\tplan"

        for job in self.jobs:
            output = job.rule.output if self.dynamic(
                job) else job.expanded_output
            for f in output:
                rule = self.workflow.persistence.rule(f)
                rule = "-" if rule is None else rule

                version = self.workflow.persistence.version(f)
                version = "-" if version is None else str(version)

                date = time.ctime(f.mtime) if f.exists else "-"

                pending = "update pending" if self.reason(job) else "no update"

                log = self.workflow.persistence.log(f)
                log = "-" if log is None else ",".join(log)

                input = self.workflow.persistence.input(f)
                input = "-" if input is None else ",".join(input)

                shellcmd = self.workflow.persistence.shellcmd(f)
                shellcmd = "-" if shellcmd is None else shellcmd
                # remove new line characters, leading and trailing whitespace
                shellcmd = shellcmd.strip().replace("\n", "; ")

                status = "ok"
                if not f.exists:
                    status = "missing"
                elif self.reason(job).updated_input:
                    status = "updated input files"
                elif self.workflow.persistence.version_changed(job, file=f):
                    status = "version changed to {}".format(job.rule.version)
                elif self.workflow.persistence.code_changed(job, file=f):
                    status = "rule implementation changed"
                elif self.workflow.persistence.input_changed(job, file=f):
                    status = "set of input files changed"
                elif self.workflow.persistence.params_changed(job, file=f):
                    status = "params changed"
                if detailed:
                    yield "\t".join((f, date, rule, version, log, input,
                                     shellcmd, status, pending))
                else:
                    yield "\t".join(
                        (f, date, rule, version, log, status, pending))

    def d3dag(self, max_jobs=10000):
        def node(job):
            jobid = self.jobid(job)
            return {
                "id": jobid,
                "value": {
                    "jobid": jobid,
                    "label": job.rule.name,
                    "rule": job.rule.name
                }
            }

        def edge(a, b):
            return {"u": self.jobid(a), "v": self.jobid(b)}

        jobs = list(self.jobs)

        if len(jobs) > max_jobs:
            logger.info(
                "Job-DAG is too large for visualization (>{} jobs).".format(
                    max_jobs))
        else:
            logger.d3dag(nodes=[node(job) for job in jobs],
                         edges=[
                             edge(dep, job) for job in jobs
                             for dep in self.dependencies[job]
                             if self.needrun(dep)
                         ])

    def stats(self):
        rules = Counter()
        rules.update(job.rule for job in self.needrun_jobs)
        rules.update(job.rule for job in self.finished_jobs)
        yield "Job counts:"
        yield "\tcount\tjobs"
        for rule, count in sorted(rules.most_common(),
                                  key=lambda item: item[0].name):
            yield "\t{}\t{}".format(count, rule)
        yield "\t{}".format(len(self))

    def __str__(self):
        return self.dot()

    def __len__(self):
        return self._len
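
For orientation, here is a minimal, hypothetical driver for the class above. The `workflow`, `rules`, and `target_rules` objects are assumed to come from an already-parsed Snakefile and are not part of the example; the call sequence is only a sketch of how the public methods fit together.

# Hypothetical usage sketch (not from the example above): build the DAG,
# verify persisted state, and compute the execution plan.
dag = DAG(workflow,
          rules=rules,                 # assumed: iterable of Rule objects
          targetfiles=set(),
          targetrules=target_rules)    # assumed: set of concrete target rules
dag.init()                # resolve targets into jobs and their dependencies
dag.check_incomplete()    # raise (or force) if earlier runs left incomplete output
dag.check_dynamic()       # expand jobs whose dynamic output already exists
dag.postprocess()         # refresh needrun/priority/ready/downstream bookkeeping
for job in dag.ready_jobs:
    pass                  # hand ready jobs to a scheduler/executor
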
Example #2
 def update_output_index(self):
     """Update the OutputIndex."""
     self.output_index = OutputIndex(self.rules)
Example #3
 def update_output_index(self):
     self.output_index = OutputIndex(self.rules)
Example #4
class DAG:
    def __init__(self, workflow,
                 rules=None,
                 dryrun=False,
                 targetfiles=None,
                 targetrules=None,
                 forceall=False,
                 forcerules=None,
                 forcefiles=None,
                 priorityfiles=None,
                 priorityrules=None,
                 ignore_ambiguity=False,
                 force_incomplete=False,
                 ignore_incomplete=False,
                 notemp=False):

        self.dryrun = dryrun
        self.dependencies = defaultdict(partial(defaultdict, set))
        self.depending = defaultdict(partial(defaultdict, set))
        self._needrun = set()
        self._priority = dict()
        self._downstream_size = dict()
        self._reason = defaultdict(Reason)
        self._finished = set()
        self._dynamic = set()
        self._len = 0
        self.workflow = workflow
        self.rules = set(rules)
        self.ignore_ambiguity = ignore_ambiguity
        self.targetfiles = targetfiles
        self.targetrules = targetrules
        self.priorityfiles = priorityfiles
        self.priorityrules = priorityrules
        self.targetjobs = set()
        self.prioritytargetjobs = set()
        self._ready_jobs = set()
        self.notemp = notemp
        self._jobid = dict()

        self.forcerules = set()
        self.forcefiles = set()
        self.updated_subworkflow_files = set()
        if forceall:
            self.forcerules.update(self.rules)
        elif forcerules:
            self.forcerules.update(forcerules)
        if forcefiles:
            self.forcefiles.update(forcefiles)
        self.omitforce = set()

        self.force_incomplete = force_incomplete
        self.ignore_incomplete = ignore_incomplete

        self.periodic_wildcard_detector = PeriodicityDetector()

        self.update_output_index()

    def init(self):
        """ Initialise the DAG. """
        for job in map(self.rule2job, self.targetrules):
            job = self.update([job])
            self.targetjobs.add(job)

        for file in self.targetfiles:
            job = self.update(self.file2jobs(file), file=file)
            self.targetjobs.add(job)

        self.update_needrun()

    def update_output_index(self):
        self.output_index = OutputIndex(self.rules)

    def check_incomplete(self):
        if not self.ignore_incomplete:
            incomplete = self.incomplete_files
            if incomplete:
                if self.force_incomplete:
                    logger.debug("Forcing incomplete files:")
                    logger.debug("\t" + "\n\t".join(incomplete))
                    self.forcefiles.update(incomplete)
                else:
                    raise IncompleteFilesException(incomplete)

    def check_dynamic(self):
        for job in filter(lambda job: (
            job.dynamic_output and not self.needrun(job)
        ), self.jobs):
            self.update_dynamic(job)

    @property
    def dynamic_output_jobs(self):
        return (job for job in self.jobs if job.dynamic_output)

    @property
    def jobs(self):
        """ All jobs in the DAG. """
        for job in self.bfs(self.dependencies, *self.targetjobs):
            yield job

    @property
    def needrun_jobs(self):
        """ Jobs that need to be executed. """
        for job in filter(self.needrun,
                          self.bfs(self.dependencies, *self.targetjobs,
                                   stop=self.noneedrun_finished)):
            yield job

    @property
    def local_needrun_jobs(self):
        return filter(lambda job: self.workflow.is_local(job.rule),
                      self.needrun_jobs)

    @property
    def finished_jobs(self):
        """ Jobs that have been executed. """
        for job in filter(self.finished, self.bfs(self.dependencies,
                                                  *self.targetjobs)):
            yield job

    @property
    def ready_jobs(self):
        """ Jobs that are ready to execute. """
        return self._ready_jobs

    def ready(self, job):
        """ Return whether a given job is ready to execute. """
        return job in self._ready_jobs

    def needrun(self, job):
        """ Return whether a given job needs to be executed. """
        return job in self._needrun

    def priority(self, job):
        return self._priority[job]

    def downstream_size(self, job):
        return self._downstream_size[job]

    def _job_values(self, jobs, values):
        return [values[job] for job in jobs]

    def priorities(self, jobs):
        return self._job_values(jobs, self._priority)

    def downstream_sizes(self, jobs):
        return self._job_values(jobs, self._downstream_size)

    def noneedrun_finished(self, job):
        """
        Return whether a given job is finished or was not
        required to run at all.
        """
        return not self.needrun(job) or self.finished(job)

    def reason(self, job):
        """ Return the reason of the job execution. """
        return self._reason[job]

    def finished(self, job):
        """ Return whether a job is finished. """
        return job in self._finished

    def dynamic(self, job):
        """
        Return whether a job is dynamic (i.e. it is only a placeholder
        for those that are created after the job with dynamic output has
        finished).
        """
        return job in self._dynamic

    def requested_files(self, job):
        """ Return the files a job requests. """
        return set(chain(*self.depending[job].values()))

    @property
    def incomplete_files(self):
        return list(chain(*(
            job.output for job in filter(self.workflow.persistence.incomplete,
                                         filterfalse(self.needrun, self.jobs))
        )))

    @property
    def newversion_files(self):
        return list(chain(*(
            job.output
            for job in filter(self.workflow.persistence.newversion, self.jobs)
        )))

    def missing_temp(self, job):
        """
        Return whether a temp file that is input of the given job is missing.
        """
        for job_, files in self.depending[job].items():
            if self.needrun(job_) and any(not f.exists for f in files):
                return True
        return False

    def check_output(self, job, wait=3):
        """ Raise exception if output files of job are missing. """
        try:
            wait_for_files(job.expanded_output, latency_wait=wait)
        except IOError as e:
            raise MissingOutputException(str(e), rule=job.rule)

        input_maxtime = job.input_maxtime
        if input_maxtime is not None:
            output_mintime = job.output_mintime
            if output_mintime is not None and output_mintime < input_maxtime:
                raise RuleException(
                    "Output files {} are older than input "
                    "files. Did you extract an archive? Make sure that output "
                    "files have a more recent modification date than the "
                    "archive, e.g. by using 'touch'.".format(
                        ", ".join(job.expanded_output)),
                    rule=job.rule)

    def check_periodic_wildcards(self, job):
        """ Raise an exception if a wildcard of the given job appears to be periodic,
        indicating a cyclic dependency. """
        for wildcard, value in job.wildcards_dict.items():
            periodic_substring = self.periodic_wildcard_detector.is_periodic(
                value)
            if periodic_substring is not None:
                raise PeriodicWildcardError(
                    "The value {} in wildcard {} is periodically repeated ({}). "
                    "This would lead to an infinite recursion. "
                    "To avoid this, e.g. restrict the wildcards in this rule to certain values.".format(
                        periodic_substring, wildcard, value),
                    rule=job.rule)

    def handle_protected(self, job):
        """ Write-protect output files that are marked with protected(). """
        for f in job.expanded_output:
            if f in job.protected_output:
                logger.info("Write-protecting output file {}.".format(f))
                f.protect()

    def handle_touch(self, job):
        """ Touches those output files that are marked for touching. """
        for f in job.expanded_output:
            if f in job.touch_output:
                logger.info("Touching output file {}.".format(f))
                f.touch_or_create()

    def handle_temp(self, job):
        """ Remove temp files if they are no longer needed. """
        if self.notemp:
            return

        needed = lambda job_, f: any(
            f in files for j, files in self.depending[job_].items()
            if not self.finished(j) and self.needrun(j) and j != job)

        def unneeded_files():
            for job_, files in self.dependencies[job].items():
                for f in job_.temp_output & files:
                    if not needed(job_, f):
                        yield f
            for f in filterfalse(partial(needed, job), job.temp_output):
                if not f in self.targetfiles:
                    yield f

        for f in unneeded_files():
            logger.info("Removing temporary output file {}.".format(f))
            f.remove()

    def jobid(self, job):
        if job not in self._jobid:
            self._jobid[job] = len(self._jobid)
        return self._jobid[job]

    def update(self, jobs, file=None, visited=None, skip_until_dynamic=False):
        """ Update the DAG by adding given jobs and their dependencies. """
        if visited is None:
            visited = set()
        producer = None
        exceptions = list()
        jobs = sorted(jobs, reverse=not self.ignore_ambiguity)
        cycles = list()

        for job in jobs:
            if file in job.input:
                cycles.append(job)
                continue
            if job in visited:
                cycles.append(job)
                continue
            try:
                self.check_periodic_wildcards(job)
                self.update_(job,
                             visited=set(visited),
                             skip_until_dynamic=skip_until_dynamic)
                # TODO this might fail if a rule discarded here is needed
                # elsewhere
                if producer:
                    if job < producer or self.ignore_ambiguity:
                        break
                    elif producer is not None:
                        raise AmbiguousRuleException(file, job, producer)
                producer = job
            except (MissingInputException, CyclicGraphException,
                    PeriodicWildcardError) as ex:
                exceptions.append(ex)
        if producer is None:
            if cycles:
                job = cycles[0]
                raise CyclicGraphException(job.rule, file, rule=job.rule)
            if exceptions:
                raise exceptions[0]
        return producer

    def update_(self, job, visited=None, skip_until_dynamic=False):
        """ Update the DAG by adding the given job and its dependencies. """
        if job in self.dependencies:
            return
        if visited is None:
            visited = set()
        visited.add(job)
        dependencies = self.dependencies[job]
        potential_dependencies = self.collect_potential_dependencies(
            job).items()

        skip_until_dynamic = skip_until_dynamic and not job.dynamic_output

        missing_input = job.missing_input
        producer = dict()
        exceptions = dict()
        for file, jobs in potential_dependencies:
            try:
                producer[file] = self.update(
                    jobs,
                    file=file,
                    visited=visited,
                    skip_until_dynamic=skip_until_dynamic or file in
                    job.dynamic_input)
            except (MissingInputException, CyclicGraphException,
                    PeriodicWildcardError) as ex:
                if file in missing_input:
                    self.delete_job(job,
                                    recursive=False)  # delete job from tree
                    raise ex

        for file, job_ in producer.items():
            dependencies[job_].add(file)
            self.depending[job_][job].add(file)

        missing_input -= producer.keys()
        if missing_input:
            self.delete_job(job, recursive=False)  # delete job from tree
            raise MissingInputException(job.rule, missing_input)

        if skip_until_dynamic:
            self._dynamic.add(job)

    def update_needrun(self):
        """ Update the information whether a job needs to be executed. """

        def output_mintime(job):
            for job_ in self.bfs(self.depending, job):
                t = job_.output_mintime
                if t:
                    return t

        def needrun(job):
            reason = self.reason(job)
            noinitreason = not reason
            updated_subworkflow_input = self.updated_subworkflow_files.intersection(
                job.input)
            if (job not in self.omitforce and job.rule in self.forcerules or
                not self.forcefiles.isdisjoint(job.output)):
                reason.forced = True
            elif updated_subworkflow_input:
                reason.updated_input.update(updated_subworkflow_input)
            elif job in self.targetjobs:
                # TODO find a way to handle added/removed input files here?
                if not job.output and not job.benchmark:
                    if job.input:
                        if job.rule.norun:
                            reason.updated_input_run.update([f
                                                             for f in job.input
                                                             if not f.exists])
                        else:
                            reason.nooutput = True
                    else:
                        reason.noio = True
                else:
                    if job.rule in self.targetrules:
                        missing_output = job.missing_output()
                    else:
                        missing_output = job.missing_output(
                            requested=set(chain(*self.depending[job].values()))
                            | self.targetfiles)
                    reason.missing_output.update(missing_output)
            if not reason:
                output_mintime_ = output_mintime(job)
                if output_mintime_:
                    updated_input = [
                        f for f in job.input
                        if f.exists and f.is_newer(output_mintime_)
                    ]
                    reason.updated_input.update(updated_input)
            if noinitreason and reason:
                reason.derived = False
            return job

        reason = self.reason
        _needrun = self._needrun
        dependencies = self.dependencies
        depending = self.depending

        _needrun.clear()
        candidates = set(self.jobs)

        queue = list(filter(reason, map(needrun, candidates)))
        visited = set(queue)
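        # Propagate: a dependency whose requested output is missing must run
        # too, and any job depending on a job that will run sees updated
        # input and therefore must run as well.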
        while queue:
            job = queue.pop(0)
            _needrun.add(job)

            for job_, files in dependencies[job].items():
                missing_output = job_.missing_output(requested=files)
                reason(job_).missing_output.update(missing_output)
                if missing_output and not job_ in visited:
                    visited.add(job_)
                    queue.append(job_)

            for job_, files in depending[job].items():
                if job_ in candidates:
                    reason(job_).updated_input_run.update(files)
                    if not job_ in visited:
                        visited.add(job_)
                        queue.append(job_)

        self._len = len(_needrun)

    def update_priority(self):
        """ Update job priorities. """
        prioritized = (lambda job: job.rule in self.priorityrules or
                       not self.priorityfiles.isdisjoint(job.output))
        for job in self.needrun_jobs:
            self._priority[job] = job.rule.priority
        for job in self.bfs(self.dependencies,
                            *filter(prioritized, self.needrun_jobs),
                            stop=self.noneedrun_finished):
            self._priority[job] = Job.HIGHEST_PRIORITY

    def update_ready(self):
        """ Update information whether a job is ready to execute. """
        for job in filter(self.needrun, self.jobs):
            if not self.finished(job) and self._ready(job):
                self._ready_jobs.add(job)

    def update_downstream_size(self):
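        # For each job that has to run, count the not-yet-finished jobs that
        # (transitively) depend on it; the job itself is subtracted. This is
        # presumably used by the scheduler to prefer jobs that unblock more
        # downstream work.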
        for job in self.needrun_jobs:
            self._downstream_size[job] = sum(
                1 for _ in self.bfs(self.depending, job,
                                    stop=self.noneedrun_finished)) - 1

    def postprocess(self):
        self.update_needrun()
        self.update_priority()
        self.update_ready()
        self.update_downstream_size()

    def _ready(self, job):
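        # A job is ready once every dependency that still needs to run has
        # already finished.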
        return self._finished.issuperset(
            filter(self.needrun, self.dependencies[job]))

    def finish(self, job, update_dynamic=True):
        self._finished.add(job)
        try:
            self._ready_jobs.remove(job)
        except KeyError:
            pass
        # mark depending jobs as ready
        for job_ in self.depending[job]:
            if self.needrun(job_) and self._ready(job_):
                self._ready_jobs.add(job_)

        if update_dynamic and job.dynamic_output:
            logger.info("Dynamically updating jobs")
            newjob = self.update_dynamic(job)
            if newjob:
                # simulate that this job ran and was finished before
                self.omitforce.add(newjob)
                self._needrun.add(newjob)
                self._finished.add(newjob)

                self.postprocess()
                self.handle_protected(newjob)
                self.handle_touch(newjob)
                # add finished jobs to len as they are not counted after new postprocess
                self._len += len(self._finished)

    def update_dynamic(self, job):
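        # Once a job with dynamic output has produced its files, the observed
        # wildcard values are known: specialize the rule, replace this job,
        # and rebuild every unfinished downstream job with dynamic input
        # against the specialized rules.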
        dynamic_wildcards = job.dynamic_wildcards
        if not dynamic_wildcards:
            # this happens e.g. in dryrun if output is not yet present
            return

        depending = list(filter(lambda job_: not self.finished(job_),
                                self.bfs(self.depending, job)))
        newrule, non_dynamic_wildcards = job.rule.dynamic_branch(
            dynamic_wildcards,
            input=False)
        self.specialize_rule(job.rule, newrule)

        # no targetfile needed for job
        newjob = Job(newrule, self, format_wildcards=non_dynamic_wildcards)
        self.replace_job(job, newjob)
        for job_ in depending:
            if job_.dynamic_input:
                newrule_ = job_.rule.dynamic_branch(dynamic_wildcards)
                if newrule_ is not None:
                    self.specialize_rule(job_.rule, newrule_)
                    if not self.dynamic(job_):
                        logger.debug("Updating job {}.".format(job_))
                        newjob_ = Job(newrule_, self,
                                      targetfile=job_.targetfile)

                        unexpected_output = self.reason(
                            job_).missing_output.intersection(
                                newjob.existing_output)
                        if unexpected_output:
                            logger.warning(
                                "Warning: the following output files of rule {} were not "
                                "present when the DAG was created:\n{}".format(
                                    newjob_.rule, unexpected_output))

                        self.replace_job(job_, newjob_)
        return newjob

    def delete_job(self, job, recursive=True):
        for job_ in self.depending[job]:
            del self.dependencies[job_][job]
        del self.depending[job]
        for job_ in self.dependencies[job]:
            depending = self.depending[job_]
            del depending[job]
            if not depending and recursive:
                self.delete_job(job_)
        del self.dependencies[job]
        if job in self._needrun:
            self._len -= 1
            self._needrun.remove(job)
            del self._reason[job]
        if job in self._finished:
            self._finished.remove(job)
        if job in self._dynamic:
            self._dynamic.remove(job)
        if job in self._ready_jobs:
            self._ready_jobs.remove(job)

    def replace_job(self, job, newjob):
        depending = list(self.depending[job].items())
        if self.finished(job):
            self._finished.add(newjob)

        self.delete_job(job)
        self.update([newjob])

        for job_, files in depending:
            if not job_.dynamic_input:
                self.dependencies[job_][newjob].update(files)
                self.depending[newjob][job_].update(files)
        if job in self.targetjobs:
            self.targetjobs.remove(job)
            self.targetjobs.add(newjob)

    def specialize_rule(self, rule, newrule):
        assert newrule is not None
        self.rules.add(newrule)
        self.update_output_index()

    def collect_potential_dependencies(self, job):
        dependencies = defaultdict(list)
        # use a set to circumvent multiple jobs for the same file
        # if user specified it twice
        file2jobs = self.file2jobs
        for file in set(job.input):
            # omit the file if it comes from a subworkflow
            if file in job.subworkflow_input:
                continue
            try:
                if file in job.dependencies:
                    jobs = [Job(job.dependencies[file], self, targetfile=file)]
                else:
                    jobs = file2jobs(file)
                dependencies[file].extend(jobs)
            except MissingRuleException as ex:
                pass
        return dependencies

    def bfs(self, direction, *jobs, stop=lambda job: False):
        queue = list(jobs)
        visited = set(queue)
        while queue:
            job = queue.pop(0)
            if stop(job):
                # stop criterion reached for this node
                continue
            yield job
            for job_, _ in direction[job].items():
                if not job_ in visited:
                    queue.append(job_)
                    visited.add(job_)

    def level_bfs(self, direction, *jobs, stop=lambda job: False):
        queue = [(job, 0) for job in jobs]
        visited = set(jobs)
        while queue:
            job, level = queue.pop(0)
            if stop(job):
                # stop criterion reached for this node
                continue
            yield level, job
            level += 1
            for job_, _ in direction[job].items():
                if not job_ in visited:
                    queue.append((job_, level))
                    visited.add(job_)

    def dfs(self, direction, *jobs, stop=lambda job: False, post=True):
        visited = set()
        for job in jobs:
            for job_ in self._dfs(direction, job, visited,
                                  stop=stop,
                                  post=post):
                yield job_

    def _dfs(self, direction, job, visited, stop, post):
        if stop(job):
            return
        if not post:
            yield job
        for job_ in direction[job]:
            if not job_ in visited:
                visited.add(job_)
                for j in self._dfs(direction, job_, visited, stop, post):
                    yield j
        if post:
            yield job

    def is_isomorph(self, job1, job2):
        if job1.rule != job2.rule:
            return False
        rule = lambda job: job.rule.name
        queue1, queue2 = [job1], [job2]
        visited1, visited2 = set(queue1), set(queue2)
        while queue1 and queue2:
            job1, job2 = queue1.pop(0), queue2.pop(0)
            deps1 = sorted(self.dependencies[job1], key=rule)
            deps2 = sorted(self.dependencies[job2], key=rule)
            for job1_, job2_ in zip(deps1, deps2):
                if job1_.rule != job2_.rule:
                    return False
                if not job1_ in visited1 and not job2_ in visited2:
                    queue1.append(job1_)
                    visited1.add(job1_)
                    queue2.append(job2_)
                    visited2.add(job2_)
                elif not (job1_ in visited1 and job2_ in visited2):
                    return False
        return True

    def all_longest_paths(self, *jobs):
        paths = defaultdict(list)

        def all_longest_paths(_jobs):
            for job in _jobs:
                if job in paths:
                    continue
                deps = self.dependencies[job]
                if not deps:
                    paths[job].append([job])
                    continue
                all_longest_paths(deps)
                for _job in deps:
                    paths[job].extend(path + [job] for path in paths[_job])

        all_longest_paths(jobs)
        return chain(*(paths[job] for job in jobs))

    def new_wildcards(self, job):
        new_wildcards = set(job.wildcards.items())
        for job_ in self.dependencies[job]:
            if not new_wildcards:
                return set()
            for wildcard in job_.wildcards.items():
                new_wildcards.discard(wildcard)
        return new_wildcards

    def rule2job(self, targetrule):
        return Job(targetrule, self)

    def file2jobs(self, targetfile):
        rules = self.output_index.match(targetfile)
        jobs = []
        exceptions = list()
        for rule in rules:
            if rule.is_producer(targetfile):
                try:
                    jobs.append(Job(rule, self, targetfile=targetfile))
                except InputFunctionException as e:
                    exceptions.append(e)
        if not jobs:
            if exceptions:
                raise exceptions[0]
            raise MissingRuleException(targetfile)
        return jobs

    def rule_dot2(self):
        dag = defaultdict(list)
        visited = set()
        preselect = set()

        def preselect_parents(job):
            for parent in self.depending[job]:
                if parent in preselect:
                    continue
                preselect.add(parent)
                preselect_parents(parent)

        def build_ruledag(job, key=lambda job: job.rule.name):
            if job in visited:
                return
            visited.add(job)
            deps = sorted(self.dependencies[job], key=key)
            deps = [(group[0] if preselect.isdisjoint(group) else
                     preselect.intersection(group).pop())
                    for group in (list(g) for _, g in groupby(deps, key))]
            dag[job].extend(deps)
            preselect_parents(job)
            for dep in deps:
                build_ruledag(dep)

        for job in self.targetjobs:
            build_ruledag(job)

        return self._dot(dag.keys(),
                         print_wildcards=False,
                         print_types=False,
                         dag=dag)

    def rule_dot(self):
        graph = defaultdict(set)
        for job in self.jobs:
            graph[job.rule].update(dep.rule for dep in self.dependencies[job])
        return self._dot(graph)

    def dot(self):
        def node2style(job):
            if not self.needrun(job):
                return "rounded,dashed"
            if self.dynamic(job) or job.dynamic_input:
                return "rounded,dotted"
            return "rounded"

        def format_wildcard(wildcard):
            name, value = wildcard
            if _IOFile.dynamic_fill in value:
                value = "..."
            return "{}: {}".format(name, value)

        node2rule = lambda job: job.rule
        node2label = lambda job: "\\n".join(chain([
            job.rule.name
        ], sorted(map(format_wildcard, self.new_wildcards(job)))))

        dag = {job: self.dependencies[job] for job in self.jobs}

        return self._dot(dag,
                         node2rule=node2rule,
                         node2style=node2style,
                         node2label=node2label)

    def _dot(self, graph,
             node2rule=lambda node: node,
             node2style=lambda node: "rounded",
             node2label=lambda node: node):

        # color rules
        huefactor = 2 / (3 * len(self.rules))
        rulecolor = {
            rule: "{:.2f} 0.6 0.85".format(i * huefactor)
            for i, rule in enumerate(self.rules)
        }

        # markup
        node_markup = '\t{}[label = "{}", color = "{}", style="{}"];'.format
        edge_markup = "\t{} -> {}".format

        # node ids
        ids = {node: i for i, node in enumerate(graph)}

        # calculate nodes
        nodes = [node_markup(ids[node], node2label(node),
                             rulecolor[node2rule(node)], node2style(node))
                 for node in graph]
        # calculate edges
        edges = [edge_markup(ids[dep], ids[node])
                 for node, deps in graph.items() for dep in deps]

        return textwrap.dedent("""\
            digraph snakemake_dag {{
                graph[bgcolor=white, margin=0];
                node[shape=box, style=rounded, fontname=sans, \
                fontsize=10, penwidth=2];
                edge[penwidth=2, color=grey];
            {items}
            }}\
            """).format(items="\n".join(nodes + edges))

    def summary(self, detailed=False):
        if detailed:
            yield "output_file\tdate\trule\tversion\tinput_file(s)\tshellcmd\tstatus\tplan"
        else:
            yield "output_file\tdate\trule\tversion\tstatus\tplan"

        for job in self.jobs:
            output = job.rule.output if self.dynamic(
                job) else job.expanded_output
            for f in output:
                rule = self.workflow.persistence.rule(f)
                rule = "-" if rule is None else rule

                version = self.workflow.persistence.version(f)
                version = "-" if version is None else str(version)

                date = time.ctime(f.mtime) if f.exists else "-"

                pending = "update pending" if self.reason(job) else "no update"

                input = self.workflow.persistence.input(f)
                input = "-" if input is None else ",".join(input)

                shellcmd = self.workflow.persistence.shellcmd(f)
                shellcmd = "-" if shellcmd is None else shellcmd
                # remove new line characters, leading and trailing whitespace
                shellcmd = shellcmd.strip().replace("\n", "; ")

                status = "ok"
                if not f.exists:
                    status = "missing"
                elif self.reason(job).updated_input:
                    status = "updated input files"
                elif self.workflow.persistence.version_changed(job, file=f):
                    status = "version changed to {}".format(job.rule.version)
                elif self.workflow.persistence.code_changed(job, file=f):
                    status = "rule implementation changed"
                elif self.workflow.persistence.input_changed(job, file=f):
                    status = "set of input files changed"
                elif self.workflow.persistence.params_changed(job, file=f):
                    status = "params changed"
                if detailed:
                    yield "\t".join((f, date, rule, version, input, shellcmd,
                                     status, pending))
                else:
                    yield "\t".join((f, date, rule, version, status, pending))

    def d3dag(self, max_jobs=10000):
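        # Emit the job graph as node/edge dictionaries for the interactive
        # D3-based visualization (handled by the logger); skipped with an
        # info message if the graph exceeds max_jobs.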
        def node(job):
            jobid = self.jobid(job)
            return {
                "id": jobid,
                "value": {
                    "jobid": jobid,
                    "label": job.rule.name,
                    "rule": job.rule.name
                }
            }

        def edge(a, b):
            return {"u": self.jobid(a), "v": self.jobid(b)}

        jobs = list(self.jobs)

        if len(jobs) > max_jobs:
            logger.info(
                "Job-DAG is too large for visualization (>{} jobs).".format(
                    max_jobs))
        else:
            logger.d3dag(nodes=[node(job) for job in jobs],
                         edges=[edge(dep, job) for job in jobs for dep in
                                self.dependencies[job] if self.needrun(dep)])

    def stats(self):
        rules = Counter()
        rules.update(job.rule for job in self.needrun_jobs)
        rules.update(job.rule for job in self.finished_jobs)
        yield "Job counts:"
        yield "\tcount\tjobs"
        for rule, count in sorted(rules.most_common(),
                                  key=lambda item: item[0].name):
            yield "\t{}\t{}".format(count, rule)
        yield "\t{}".format(len(self))

    def __str__(self):
        return self.dot()

    def __len__(self):
        return self._len
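
# Illustrative usage sketch (not part of the original source). Assuming a
# fully configured `workflow` object as built by the surrounding package,
# a caller would roughly do:
#
#     dag = DAG(workflow, rules=workflow.rules,
#               targetrules=targetrules, targetfiles=targetfiles)
#     dag.init()
#     dag.postprocess()  # fills needrun, priorities, ready jobs, downstream sizes
#     print(dag.dot())   # graphviz source describing the job graph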
Example #5
0
def update_output_index(self):
    self.output_index = OutputIndex(self.rules)
Example #6
0
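# Module-level imports are omitted from this listing. In the snakemake
# source this class lives in, they include approximately the following
# (exact module paths vary between versions and are stated here as an
# assumption, not verbatim):
#
#     import time, textwrap
#     from collections import defaultdict, Counter
#     from itertools import chain, filterfalse, groupby
#     from functools import partial
#     from snakemake.jobs import Job, Reason
#     from snakemake.output_index import OutputIndex
#     from snakemake.logging import logger
#     # ...plus the exception classes and IO helpers referenced below.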
class DAG:
    def __init__(self,
                 workflow,
                 rules=None,
                 dryrun=False,
                 targetfiles=None,
                 targetrules=None,
                 forceall=False,
                 forcerules=None,
                 forcefiles=None,
                 priorityfiles=None,
                 priorityrules=None,
                 ignore_ambiguity=False,
                 force_incomplete=False,
                 ignore_incomplete=False,
                 notemp=False):

        self.dryrun = dryrun
        self.dependencies = defaultdict(partial(defaultdict, set))
        self.depending = defaultdict(partial(defaultdict, set))
        self._needrun = set()
        self._priority = dict()
        self._downstream_size = dict()
        self._reason = defaultdict(Reason)
        self._finished = set()
        self._dynamic = set()
        self._len = 0
        self.workflow = workflow
        self.rules = set(rules)
        self.ignore_ambiguity = ignore_ambiguity
        self.targetfiles = targetfiles
        self.targetrules = targetrules
        self.priorityfiles = priorityfiles
        self.priorityrules = priorityrules
        self.targetjobs = set()
        self.prioritytargetjobs = set()
        self._ready_jobs = set()
        self.notemp = notemp
        self._jobid = dict()

        self.forcerules = set()
        self.forcefiles = set()
        self.updated_subworkflow_files = set()
        if forceall:
            self.forcerules.update(self.rules)
        elif forcerules:
            self.forcerules.update(forcerules)
        if forcefiles:
            self.forcefiles.update(forcefiles)
        self.omitforce = set()

        self.force_incomplete = force_incomplete
        self.ignore_incomplete = ignore_incomplete

        self.periodic_wildcard_detector = PeriodicityDetector()

        self.update_output_index()

    def init(self):
        """ Initialise the DAG. """
        for job in map(self.rule2job, self.targetrules):
            job = self.update([job])
            self.targetjobs.add(job)

        for file in self.targetfiles:
            job = self.update(self.file2jobs(file), file=file)
            self.targetjobs.add(job)

        self.update_needrun()

    def update_output_index(self):
        self.output_index = OutputIndex(self.rules)

    def check_incomplete(self):
        if not self.ignore_incomplete:
            incomplete = self.incomplete_files
            if incomplete:
                if self.force_incomplete:
                    logger.debug("Forcing incomplete files:")
                    logger.debug("\t" + "\n\t".join(incomplete))
                    self.forcefiles.update(incomplete)
                else:
                    raise IncompleteFilesException(incomplete)

    def check_dynamic(self):
        for job in filter(
                lambda job: (job.dynamic_output and not self.needrun(job)),
                self.jobs):
            self.update_dynamic(job)

    @property
    def dynamic_output_jobs(self):
        return (job for job in self.jobs if job.dynamic_output)

    @property
    def jobs(self):
        """ All jobs in the DAG. """
        for job in self.bfs(self.dependencies, *self.targetjobs):
            yield job

    @property
    def needrun_jobs(self):
        """ Jobs that need to be executed. """
        for job in filter(
                self.needrun,
                self.bfs(self.dependencies,
                         *self.targetjobs,
                         stop=self.noneedrun_finished)):
            yield job

    @property
    def local_needrun_jobs(self):
        return filter(lambda job: self.workflow.is_local(job.rule),
                      self.needrun_jobs)

    @property
    def finished_jobs(self):
        """ Jobs that have been executed. """
        for job in filter(self.finished,
                          self.bfs(self.dependencies, *self.targetjobs)):
            yield job

    @property
    def ready_jobs(self):
        """ Jobs that are ready to execute. """
        return self._ready_jobs

    def ready(self, job):
        """ Return whether a given job is ready to execute. """
        return job in self._ready_jobs

    def needrun(self, job):
        """ Return whether a given job needs to be executed. """
        return job in self._needrun

    def priority(self, job):
        return self._priority[job]

    def downstream_size(self, job):
        return self._downstream_size[job]

    def _job_values(self, jobs, values):
        return [values[job] for job in jobs]

    def priorities(self, jobs):
        return self._job_values(jobs, self._priority)

    def downstream_sizes(self, jobs):
        return self._job_values(jobs, self._downstream_size)

    def noneedrun_finished(self, job):
        """
        Return whether a given job is finished or was not
        required to run at all.
        """
        return not self.needrun(job) or self.finished(job)

    def reason(self, job):
        """ Return the reason of the job execution. """
        return self._reason[job]

    def finished(self, job):
        """ Return whether a job is finished. """
        return job in self._finished

    def dynamic(self, job):
        """
        Return whether a job is dynamic (i.e. it is only a placeholder
        for the jobs that are created after the job with dynamic output has
        finished).
        """
        return job in self._dynamic

    def requested_files(self, job):
        """ Return the files a job requests. """
        # take the union across all depending jobs; set(*values) would fail
        # as soon as more than one job depends on this one
        return set(chain(*self.depending[job].values()))

    @property
    def incomplete_files(self):
        return list(
            chain(*(job.output
                    for job in filter(self.workflow.persistence.incomplete,
                                      filterfalse(self.needrun, self.jobs)))))

    @property
    def newversion_files(self):
        return list(
            chain(*(job.output for job in filter(
                self.workflow.persistence.newversion, self.jobs))))

    def missing_temp(self, job):
        """
        Return whether a temp file that is input of the given job is missing.
        """
        for job_, files in self.depending[job].items():
            if self.needrun(job_) and any(not f.exists for f in files):
                return True
        return False

    def check_output(self, job, wait=3):
        """ Raise exception if output files of job are missing. """
        try:
            wait_for_files(job.expanded_output, latency_wait=wait)
        except IOError as e:
            raise MissingOutputException(str(e), rule=job.rule)

        input_maxtime = job.input_maxtime
        if input_maxtime is not None:
            output_mintime = job.output_mintime
            if output_mintime is not None and output_mintime < input_maxtime:
                raise RuleException(
                    "Output files {} are older than input "
                    "files. Did you extract an archive? Make sure that output "
                    "files have a more recent modification date than the "
                    "archive, e.g. by using 'touch'.".format(", ".join(
                        job.expanded_output)),
                    rule=job.rule)

    def check_periodic_wildcards(self, job):
        """ Raise an exception if a wildcard of the given job appears to be periodic,
        indicating a cyclic dependency. """
        for wildcard, value in job.wildcards_dict.items():
            periodic_substring = self.periodic_wildcard_detector.is_periodic(
                value)
            if periodic_substring is not None:
                raise PeriodicWildcardError(
                    "The value {} in wildcard {} is periodically repeated ({}). "
                    "This would lead to an infinite recursion. "
                    "To avoid this, e.g. restrict the wildcards in this rule to certain values."
                    .format(periodic_substring, wildcard, value),
                    rule=job.rule)

    def handle_protected(self, job):
        """ Write-protect output files that are marked with protected(). """
        for f in job.expanded_output:
            if f in job.protected_output:
                logger.info("Write-protecting output file {}.".format(f))
                f.protect()

    def handle_touch(self, job):
        """ Touches those output files that are marked for touching. """
        for f in job.expanded_output:
            if f in job.touch_output:
                logger.info("Touching output file {}.".format(f))
                f.touch_or_create()

    def handle_temp(self, job):
        """ Remove temp files if they are no longer needed. """
        if self.notemp:
            return

        needed = lambda job_, f: any(
            f in files for j, files in self.depending[job_].items()
            if not self.finished(j) and self.needrun(j) and j != job)

        def unneeded_files():
            for job_, files in self.dependencies[job].items():
                for f in job_.temp_output & files:
                    if not needed(job_, f):
                        yield f
            for f in filterfalse(partial(needed, job), job.temp_output):
                if not f in self.targetfiles:
                    yield f

        for f in unneeded_files():
            logger.info("Removing temporary output file {}.".format(f))
            f.remove()

    def jobid(self, job):
        if job not in self._jobid:
            self._jobid[job] = len(self._jobid)
        return self._jobid[job]

    def update(self, jobs, file=None, visited=None, skip_until_dynamic=False):
        """ Update the DAG by adding given jobs and their dependencies. """
        if visited is None:
            visited = set()
        producer = None
        exceptions = list()
        jobs = sorted(jobs, reverse=not self.ignore_ambiguity)
        cycles = list()

        for job in jobs:
            if file in job.input:
                cycles.append(job)
                continue
            if job in visited:
                cycles.append(job)
                continue
            try:
                self.check_periodic_wildcards(job)
                self.update_(job,
                             visited=set(visited),
                             skip_until_dynamic=skip_until_dynamic)
                # TODO this might fail if a rule discarded here is needed
                # elsewhere
                if producer:
                    if job < producer or self.ignore_ambiguity:
                        break
                    elif producer is not None:
                        raise AmbiguousRuleException(file, job, producer)
                producer = job
            except (MissingInputException, CyclicGraphException,
                    PeriodicWildcardError) as ex:
                exceptions.append(ex)
        if producer is None:
            if cycles:
                job = cycles[0]
                raise CyclicGraphException(job.rule, file, rule=job.rule)
            if exceptions:
                raise exceptions[0]
        return producer

    def update_(self, job, visited=None, skip_until_dynamic=False):
        """ Update the DAG by adding the given job and its dependencies. """
        if job in self.dependencies:
            return
        if visited is None:
            visited = set()
        visited.add(job)
        dependencies = self.dependencies[job]
        potential_dependencies = self.collect_potential_dependencies(
            job).items()

        skip_until_dynamic = skip_until_dynamic and not job.dynamic_output

        missing_input = job.missing_input
        producer = dict()
        exceptions = dict()
        for file, jobs in potential_dependencies:
            try:
                producer[file] = self.update(
                    jobs,
                    file=file,
                    visited=visited,
                    skip_until_dynamic=skip_until_dynamic
                    or file in job.dynamic_input)
            except (MissingInputException, CyclicGraphException,
                    PeriodicWildcardError) as ex:
                if file in missing_input:
                    self.delete_job(job,
                                    recursive=False)  # delete job from tree
                    raise ex

        for file, job_ in producer.items():
            dependencies[job_].add(file)
            self.depending[job_][job].add(file)

        missing_input -= producer.keys()
        if missing_input:
            self.delete_job(job, recursive=False)  # delete job from tree
            raise MissingInputException(job.rule, missing_input)

        if skip_until_dynamic:
            self._dynamic.add(job)

    def update_needrun(self):
        """ Update the information whether a job needs to be executed. """
        def output_mintime(job):
            for job_ in self.bfs(self.depending, job):
                t = job_.output_mintime
                if t:
                    return t

        def needrun(job):
            reason = self.reason(job)
            noinitreason = not reason
            updated_subworkflow_input = self.updated_subworkflow_files.intersection(
                job.input)
            if (job not in self.omitforce and job.rule in self.forcerules
                    or not self.forcefiles.isdisjoint(job.output)):
                reason.forced = True
            elif updated_subworkflow_input:
                reason.updated_input.update(updated_subworkflow_input)
            elif job in self.targetjobs:
                # TODO find a way to handle added/removed input files here?
                if not job.output and not job.benchmark:
                    if job.input:
                        if job.rule.norun:
                            reason.updated_input_run.update(
                                [f for f in job.input if not f.exists])
                        else:
                            reason.nooutput = True
                    else:
                        reason.noio = True
                else:
                    if job.rule in self.targetrules:
                        missing_output = job.missing_output()
                    else:
                        missing_output = job.missing_output(
                            requested=set(chain(*self.depending[job].values()))
                            | self.targetfiles)
                    reason.missing_output.update(missing_output)
            if not reason:
                output_mintime_ = output_mintime(job)
                if output_mintime_:
                    updated_input = [
                        f for f in job.input
                        if f.exists and f.is_newer(output_mintime_)
                    ]
                    reason.updated_input.update(updated_input)
            if noinitreason and reason:
                reason.derived = False
            return job

        reason = self.reason
        _needrun = self._needrun
        dependencies = self.dependencies
        depending = self.depending

        _needrun.clear()
        candidates = set(self.jobs)

        queue = list(filter(reason, map(needrun, candidates)))
        visited = set(queue)
        while queue:
            job = queue.pop(0)
            _needrun.add(job)

            for job_, files in dependencies[job].items():
                missing_output = job_.missing_output(requested=files)
                reason(job_).missing_output.update(missing_output)
                if missing_output and not job_ in visited:
                    visited.add(job_)
                    queue.append(job_)

            for job_, files in depending[job].items():
                if job_ in candidates:
                    reason(job_).updated_input_run.update(files)
                    if not job_ in visited:
                        visited.add(job_)
                        queue.append(job_)

        self._len = len(_needrun)

    def update_priority(self):
        """ Update job priorities. """
        prioritized = (lambda job: job.rule in self.priorityrules or
                       not self.priorityfiles.isdisjoint(job.output))
        for job in self.needrun_jobs:
            self._priority[job] = job.rule.priority
        for job in self.bfs(self.dependencies,
                            *filter(prioritized, self.needrun_jobs),
                            stop=self.noneedrun_finished):
            self._priority[job] = Job.HIGHEST_PRIORITY

    def update_ready(self):
        """ Update information whether a job is ready to execute. """
        for job in filter(self.needrun, self.jobs):
            if not self.finished(job) and self._ready(job):
                self._ready_jobs.add(job)

    def update_downstream_size(self):
        for job in self.needrun_jobs:
            self._downstream_size[job] = sum(1 for _ in self.bfs(
                self.depending, job, stop=self.noneedrun_finished)) - 1

    def postprocess(self):
        self.update_needrun()
        self.update_priority()
        self.update_ready()
        self.update_downstream_size()

    def _ready(self, job):
        return self._finished.issuperset(
            filter(self.needrun, self.dependencies[job]))

    def finish(self, job, update_dynamic=True):
        self._finished.add(job)
        try:
            self._ready_jobs.remove(job)
        except KeyError:
            pass
        # mark depending jobs as ready
        for job_ in self.depending[job]:
            if self.needrun(job_) and self._ready(job_):
                self._ready_jobs.add(job_)

        if update_dynamic and job.dynamic_output:
            logger.info("Dynamically updating jobs")
            newjob = self.update_dynamic(job)
            if newjob:
                # simulate that this job ran and was finished before
                self.omitforce.add(newjob)
                self._needrun.add(newjob)
                self._finished.add(newjob)

                self.postprocess()
                self.handle_protected(newjob)
                self.handle_touch(newjob)
                # add finished jobs to len as they are not counted after new postprocess
                self._len += len(self._finished)

    def update_dynamic(self, job):
        dynamic_wildcards = job.dynamic_wildcards
        if not dynamic_wildcards:
            # this happens e.g. in dryrun if output is not yet present
            return

        depending = list(
            filter(lambda job_: not self.finished(job_),
                   self.bfs(self.depending, job)))
        newrule, non_dynamic_wildcards = job.rule.dynamic_branch(
            dynamic_wildcards, input=False)
        self.specialize_rule(job.rule, newrule)

        # no targetfile needed for job
        newjob = Job(newrule, self, format_wildcards=non_dynamic_wildcards)
        self.replace_job(job, newjob)
        for job_ in depending:
            if job_.dynamic_input:
                newrule_ = job_.rule.dynamic_branch(dynamic_wildcards)
                if newrule_ is not None:
                    self.specialize_rule(job_.rule, newrule_)
                    if not self.dynamic(job_):
                        logger.debug("Updating job {}.".format(job_))
                        newjob_ = Job(newrule_,
                                      self,
                                      targetfile=job_.targetfile)

                        unexpected_output = self.reason(
                            job_).missing_output.intersection(
                                newjob.existing_output)
                        if unexpected_output:
                            logger.warning(
                                "Warning: the following output files of rule {} were not "
                                "present when the DAG was created:\n{}".format(
                                    newjob_.rule, unexpected_output))

                        self.replace_job(job_, newjob_)
        return newjob

    def delete_job(self, job, recursive=True):
        for job_ in self.depending[job]:
            del self.dependencies[job_][job]
        del self.depending[job]
        for job_ in self.dependencies[job]:
            depending = self.depending[job_]
            del depending[job]
            if not depending and recursive:
                self.delete_job(job_)
        del self.dependencies[job]
        if job in self._needrun:
            self._len -= 1
            self._needrun.remove(job)
            del self._reason[job]
        if job in self._finished:
            self._finished.remove(job)
        if job in self._dynamic:
            self._dynamic.remove(job)
        if job in self._ready_jobs:
            self._ready_jobs.remove(job)

    def replace_job(self, job, newjob):
        depending = list(self.depending[job].items())
        if self.finished(job):
            self._finished.add(newjob)

        self.delete_job(job)
        self.update([newjob])

        for job_, files in depending:
            if not job_.dynamic_input:
                self.dependencies[job_][newjob].update(files)
                self.depending[newjob][job_].update(files)
        if job in self.targetjobs:
            self.targetjobs.remove(job)
            self.targetjobs.add(newjob)

    def specialize_rule(self, rule, newrule):
        assert newrule is not None
        self.rules.add(newrule)
        self.update_output_index()

    def collect_potential_dependencies(self, job):
        dependencies = defaultdict(list)
        # use a set to circumvent multiple jobs for the same file
        # if user specified it twice
        file2jobs = self.file2jobs
        for file in set(job.input):
            # omit the file if it comes from a subworkflow
            if file in job.subworkflow_input:
                continue
            try:
                if file in job.dependencies:
                    jobs = [Job(job.dependencies[file], self, targetfile=file)]
                else:
                    jobs = file2jobs(file)
                dependencies[file].extend(jobs)
            except MissingRuleException as ex:
                pass
        return dependencies

    def bfs(self, direction, *jobs, stop=lambda job: False):
        queue = list(jobs)
        visited = set(queue)
        while queue:
            job = queue.pop(0)
            if stop(job):
                # stop criterion reached for this node
                continue
            yield job
            for job_, _ in direction[job].items():
                if not job_ in visited:
                    queue.append(job_)
                    visited.add(job_)

    def level_bfs(self, direction, *jobs, stop=lambda job: False):
        queue = [(job, 0) for job in jobs]
        visited = set(jobs)
        while queue:
            job, level = queue.pop(0)
            if stop(job):
                # stop criterion reached for this node
                continue
            yield level, job
            level += 1
            for job_, _ in direction[job].items():
                if not job_ in visited:
                    queue.append((job_, level))
                    visited.add(job_)

    def dfs(self, direction, *jobs, stop=lambda job: False, post=True):
        visited = set()
        for job in jobs:
            for job_ in self._dfs(direction,
                                  job,
                                  visited,
                                  stop=stop,
                                  post=post):
                yield job_

    def _dfs(self, direction, job, visited, stop, post):
        if stop(job):
            return
        if not post:
            yield job
        for job_ in direction[job]:
            if not job_ in visited:
                visited.add(job_)
                for j in self._dfs(direction, job_, visited, stop, post):
                    yield j
        if post:
            yield job

    def is_isomorph(self, job1, job2):
        if job1.rule != job2.rule:
            return False
        rule = lambda job: job.rule.name
        queue1, queue2 = [job1], [job2]
        visited1, visited2 = set(queue1), set(queue2)
        while queue1 and queue2:
            job1, job2 = queue1.pop(0), queue2.pop(0)
            deps1 = sorted(self.dependencies[job1], key=rule)
            deps2 = sorted(self.dependencies[job2], key=rule)
            for job1_, job2_ in zip(deps1, deps2):
                if job1_.rule != job2_.rule:
                    return False
                if not job1_ in visited1 and not job2_ in visited2:
                    queue1.append(job1_)
                    visited1.add(job1_)
                    queue2.append(job2_)
                    visited2.add(job2_)
                elif not (job1_ in visited1 and job2_ in visited2):
                    return False
        return True

    def all_longest_paths(self, *jobs):
        paths = defaultdict(list)

        def all_longest_paths(_jobs):
            for job in _jobs:
                if job in paths:
                    continue
                deps = self.dependencies[job]
                if not deps:
                    paths[job].append([job])
                    continue
                all_longest_paths(deps)
                for _job in deps:
                    paths[job].extend(path + [job] for path in paths[_job])

        all_longest_paths(jobs)
        return chain(*(paths[job] for job in jobs))

    def new_wildcards(self, job):
        new_wildcards = set(job.wildcards.items())
        for job_ in self.dependencies[job]:
            if not new_wildcards:
                return set()
            for wildcard in job_.wildcards.items():
                new_wildcards.discard(wildcard)
        return new_wildcards

    def rule2job(self, targetrule):
        return Job(targetrule, self)

    def file2jobs(self, targetfile):
        rules = self.output_index.match(targetfile)
        jobs = []
        exceptions = list()
        for rule in rules:
            if rule.is_producer(targetfile):
                try:
                    jobs.append(Job(rule, self, targetfile=targetfile))
                except InputFunctionException as e:
                    exceptions.append(e)
        if not jobs:
            if exceptions:
                raise exceptions[0]
            raise MissingRuleException(targetfile)
        return jobs

    def rule_dot2(self):
        dag = defaultdict(list)
        visited = set()
        preselect = set()

        def preselect_parents(job):
            for parent in self.depending[job]:
                if parent in preselect:
                    continue
                preselect.add(parent)
                preselect_parents(parent)

        def build_ruledag(job, key=lambda job: job.rule.name):
            if job in visited:
                return
            visited.add(job)
            deps = sorted(self.dependencies[job], key=key)
            deps = [(group[0] if preselect.isdisjoint(group) else
                     preselect.intersection(group).pop())
                    for group in (list(g) for _, g in groupby(deps, key))]
            dag[job].extend(deps)
            preselect_parents(job)
            for dep in deps:
                build_ruledag(dep)

        for job in self.targetjobs:
            build_ruledag(job)

        return self._dot(dag.keys(),
                         print_wildcards=False,
                         print_types=False,
                         dag=dag)

    def rule_dot(self):
        graph = defaultdict(set)
        for job in self.jobs:
            graph[job.rule].update(dep.rule for dep in self.dependencies[job])
        return self._dot(graph)

    def dot(self):
        def node2style(job):
            if not self.needrun(job):
                return "rounded,dashed"
            if self.dynamic(job) or job.dynamic_input:
                return "rounded,dotted"
            return "rounded"

        def format_wildcard(wildcard):
            name, value = wildcard
            if _IOFile.dynamic_fill in value:
                value = "..."
            return "{}: {}".format(name, value)

        node2rule = lambda job: job.rule
        node2label = lambda job: "\\n".join(
            chain([job.rule.name],
                  sorted(map(format_wildcard, self.new_wildcards(job)))))

        dag = {job: self.dependencies[job] for job in self.jobs}

        return self._dot(dag,
                         node2rule=node2rule,
                         node2style=node2style,
                         node2label=node2label)

    def _dot(self,
             graph,
             node2rule=lambda node: node,
             node2style=lambda node: "rounded",
             node2label=lambda node: node):

        # color rules
        huefactor = 2 / (3 * len(self.rules))
        rulecolor = {
            rule: "{:.2f} 0.6 0.85".format(i * huefactor)
            for i, rule in enumerate(self.rules)
        }

        # markup
        node_markup = '\t{}[label = "{}", color = "{}", style="{}"];'.format
        edge_markup = "\t{} -> {}".format

        # node ids
        ids = {node: i for i, node in enumerate(graph)}

        # calculate nodes
        nodes = [
            node_markup(ids[node],
                        node2label(node), rulecolor[node2rule(node)],
                        node2style(node)) for node in graph
        ]
        # calculate edges
        edges = [
            edge_markup(ids[dep], ids[node]) for node, deps in graph.items()
            for dep in deps
        ]

        return textwrap.dedent("""\
            digraph snakemake_dag {{
                graph[bgcolor=white, margin=0];
                node[shape=box, style=rounded, fontname=sans, \
                fontsize=10, penwidth=2];
                edge[penwidth=2, color=grey];
            {items}
            }}\
            """).format(items="\n".join(nodes + edges))

    def summary(self, detailed=False):
        if detailed:
            yield "output_file\tdate\trule\tversion\tinput_file(s)\tshellcmd\tstatus\tplan"
        else:
            yield "output_file\tdate\trule\tversion\tstatus\tplan"

        for job in self.jobs:
            output = job.rule.output if self.dynamic(
                job) else job.expanded_output
            for f in output:
                rule = self.workflow.persistence.rule(f)
                rule = "-" if rule is None else rule

                version = self.workflow.persistence.version(f)
                version = "-" if version is None else str(version)

                date = time.ctime(f.mtime) if f.exists else "-"

                pending = "update pending" if self.reason(job) else "no update"

                input = self.workflow.persistence.input(f)
                input = "-" if input is None else ",".join(input)

                shellcmd = self.workflow.persistence.shellcmd(f)
                shellcmd = "-" if shellcmd is None else shellcmd
                # remove new line characters, leading and trailing whitespace
                shellcmd = shellcmd.strip().replace("\n", "; ")

                status = "ok"
                if not f.exists:
                    status = "missing"
                elif self.reason(job).updated_input:
                    status = "updated input files"
                elif self.workflow.persistence.version_changed(job, file=f):
                    status = "version changed to {}".format(job.rule.version)
                elif self.workflow.persistence.code_changed(job, file=f):
                    status = "rule implementation changed"
                elif self.workflow.persistence.input_changed(job, file=f):
                    status = "set of input files changed"
                elif self.workflow.persistence.params_changed(job, file=f):
                    status = "params changed"
                if detailed:
                    yield "\t".join((f, date, rule, version, input, shellcmd,
                                     status, pending))
                else:
                    yield "\t".join((f, date, rule, version, status, pending))

    def d3dag(self, max_jobs=10000):
        def node(job):
            jobid = self.jobid(job)
            return {
                "id": jobid,
                "value": {
                    "jobid": jobid,
                    "label": job.rule.name,
                    "rule": job.rule.name
                }
            }

        def edge(a, b):
            return {"u": self.jobid(a), "v": self.jobid(b)}

        jobs = list(self.jobs)

        if len(jobs) > max_jobs:
            logger.info(
                "Job-DAG is too large for visualization (>{} jobs).".format(
                    max_jobs))
        else:
            logger.d3dag(nodes=[node(job) for job in jobs],
                         edges=[
                             edge(dep, job) for job in jobs
                             for dep in self.dependencies[job]
                             if self.needrun(dep)
                         ])

    def stats(self):
        rules = Counter()
        rules.update(job.rule for job in self.needrun_jobs)
        rules.update(job.rule for job in self.finished_jobs)
        yield "Job counts:"
        yield "\tcount\tjobs"
        for rule, count in sorted(rules.most_common(),
                                  key=lambda item: item[0].name):
            yield "\t{}\t{}".format(count, rule)
        yield "\t{}".format(len(self))

    def __str__(self):
        return self.dot()

    def __len__(self):
        return self._len