Code example #1
File: dag.py  Project: Kirill84/snakemake
    def check_output(self, job, wait=3):
        """ Raise exception if output files of job are missing. """
        for f in job.expanded_output:
            if not f.exists:
                logger.warning(
                    "Output file {} not present. Waiting {} "
                    "seconds to ensure that this is not because of filesystem "
                    "latency.".format(f, wait)
                )
                while not f.exists and wait > 0:
                    wait -= 1
                    time.sleep(1)
                if not f.exists:
                    raise MissingOutputException(
                        "Output file {} not produced by rule {}.".format(f, job.rule.name),
                        lineno=job.rule.lineno,
                        snakefile=job.rule.snakefile,
                    )
        input_maxtime = job.input_maxtime
        if input_maxtime is not None:
            output_mintime = job.output_mintime
            if output_mintime is not None and output_mintime < input_maxtime:
                raise RuleException(
                    "Output files {} are older than input "
                    "files. Did you extract an archive? Make sure that output "
                    "files have a more recent modification date than the "
                    "archive, e.g. by using 'touch'.".format(", ".join(job.expanded_output)),
                    rule=job.rule,
                )
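
A minimal standalone sketch of the same latency-wait pattern, for reading the method above out of context. The helper name, the module-level logger, and the use of os.path.exists instead of snakemake's IOFile.exists are illustrative assumptions, not part of snakemake.

import logging
import os
import time

logger = logging.getLogger(__name__)

def wait_for_output(path, wait=3):
    # Poll for the file once per second, mirroring the loop in check_output above.
    if os.path.exists(path):
        return True
    logger.warning(
        "Output file %s not present. Waiting %d seconds to rule out "
        "filesystem latency.", path, wait)
    while not os.path.exists(path) and wait > 0:
        wait -= 1
        time.sleep(1)
    return os.path.exists(path)
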
Code example #2
File: jobs.py  Project: vangalamaheshh/snakemake
    def prepare(self):
        """
        Prepare execution of job.
        This includes creation of directories and deletion of previously
        created dynamic files.
        """

        self.check_protected_output()

        unexpected_output = self.dag.reason(self).missing_output.intersection(
            self.existing_output)
        if unexpected_output:
            logger.warning(
                "Warning: the following output files of rule {} were not "
                "present when the DAG was created:\n{}".format(
                    self.rule, unexpected_output))

        if self.dynamic_output:
            for f, _ in chain(*map(partial(self.expand_dynamic,
                                           restriction=self.wildcards,
                                           omit_value=_IOFile.dynamic_fill),
                                   self.rule.dynamic_output)):
                os.remove(f)
        for f, f_ in zip(self.output, self.rule.output):
            f.prepare()
        for f in self.log:
            f.prepare()
        if self.benchmark:
            self.benchmark.prepare()
Code example #3
File: exceptions.py  Project: Kirill84/snakemake
def print_exception(ex, linemaps, print_traceback=True):
    """
    Print an error message for a given exception.

    Arguments
    ex -- the exception
    linemaps -- a dict of a dict that maps for each snakefile
        the compiled lines to source code lines in the snakefile.
    """
    # traceback.print_exception(type(ex), ex, ex.__traceback__)
    origin = get_exception_origin(ex, linemaps)
    if origin is not None:
        lineno, file = origin
        logger.critical(format_error(ex, lineno, linemaps=linemaps, snakefile=file, show_traceback=print_traceback))
        return
    if isinstance(ex, SyntaxError):
        logger.critical(
            format_error(ex, ex.lineno, linemaps=linemaps, snakefile=ex.filename, show_traceback=print_traceback)
        )
    elif isinstance(ex, TokenError):
        logger.critical(format_error(ex, None, show_traceback=print_traceback))
    elif isinstance(ex, RuleException):
        for e in ex._include + [ex]:
            if not e.omit:
                logger.critical(
                    format_error(e, e.lineno, linemaps=linemaps, snakefile=e.filename, show_traceback=print_traceback)
                )
    elif isinstance(ex, WorkflowError):
        logger.critical(
            format_error(ex, ex.lineno, linemaps=linemaps, snakefile=ex.snakefile, show_traceback=print_traceback)
        )
    elif isinstance(ex, KeyboardInterrupt):
        logger.warning("Cancelling snakemake on user request.")
    else:
        traceback.print_exception(type(ex), ex, ex.__traceback__)
Code example #4
File: scheduler.py  Project: Kirill84/snakemake
    def schedule(self):
        """ Schedule jobs that are ready, maximizing cpu usage. """
        while True:
            try:
                self._open_jobs.wait()
            except:
                # this will be caused because of SIGTERM or SIGINT
                self._executor.shutdown()
                return False
            self._open_jobs.clear()
            if not self.keepgoing and self._errors:
                logger.warning("Will exit after finishing "
                    "currently running jobs.")
                self._executor.shutdown()
                return False
            if not any(self.open_jobs):
                self._executor.shutdown()
                return not self._errors

            needrun = list(self.open_jobs)
            assert needrun

            logger.debug("Ready jobs:\n\t" + "\n\t".join(map(str, needrun)))

            run = self.job_selector(needrun)
            logger.debug("Selected jobs:\n\t" + "\n\t".join(map(str, run)))
            self.running.update(run)
            for job in run:
                self.run(job)
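
The loop above blocks on self._open_jobs until an executor signals that something changed. A rough sketch of that wake-up pattern with a plain threading.Event (all names here are illustrative, not snakemake's API):

import threading

open_jobs = threading.Event()

def on_job_finished():
    # Called from executor threads whenever a job completes or fails.
    open_jobs.set()

def scheduler_loop(get_ready_jobs, run_job):
    while True:
        open_jobs.wait()    # block until an executor signals progress
        open_jobs.clear()   # reset before inspecting the queue
        ready = list(get_ready_jobs())
        if not ready:
            return True     # nothing left to schedule
        for job in ready:
            run_job(job)
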
Code example #5
File: executors.py  Project: Kirill84/snakemake
    def finish_job(self, job):
        super().finish_job(job)
        self.stats.report_job_end(job)
        try:
            self.workflow.persistence.finished(job)
        except IOError as e:
            logger.warning("Failed to remove marker file for job started "
                "({}). Please ensure write permissions for the "
                "directory {}".format(
                    e, self.workflow.persistence.path))
Code example #6
File: scheduler.py  Project: Kirill84/snakemake
    def _error(self, job):
        """ Clear jobs and stop the workflow. """
        with self._lock:
            self._errors = True
            self.running.remove(job)
            self.failed.add(job)
            if self.keepgoing:
                logger.warning("Job failed, going on with independent jobs.")
            else:
                self._open_jobs.set()
Code example #7
File: executors.py  Project: Kirill84/snakemake
    def _run(self, job, callback=None, error_callback=None):
        super()._run(job)
        self.stats.report_job_start(job)
        try:
            self.workflow.persistence.started(job)
        except IOError as e:
            logger.warning("Failed to set marker file for job started ({}). "
                "Snakemake will work, but cannot ensure that output files "
                "are complete in case of a kill signal or power loss. "
                "Please ensure write permissions for the "
                "directory {}".format(
                    e, self.workflow.persistence.path))
Code example #8
File: io.py  Project: tianyabeef/gutMicrobiome
def lutime(f, times):
    # In some cases, we have a platform where os.supports_follow_symlinks includes stat()
    # but not utime().  This leads to an anomaly.  In any case we never want to touch the
    # target of a link.
    if os.utime in os.supports_follow_symlinks:
        #...utime is well behaved
        return os.utime(f, times, follow_symlinks=False)
    elif not os.path.islink(f):
        #...symlinks not an issue here
        return os.utime(f, times)
    else:
        #...problem system.  Do nothing.
        logger.warning("Unable to set utime on symlink {}.  Your Python build does not support it.".format(f))
        return None
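
A hypothetical call site for lutime(), copying timestamps from a source file onto a possibly symlinked target without following the link (file names are placeholders):

import os

st = os.stat("source.txt")
# Apply the source's access/modification times to the link itself, not its target.
lutime("target.txt", (st.st_atime, st.st_mtime))
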
Code example #9
File: executors.py  Project: Kirill84/snakemake
    def printjob(self, job):
        # skip dynamic jobs that will be "executed" only in dryrun mode
        if self.dag.dynamic(job):
            return

        def format_files(job, io, ruleio, dynamicio):
            for f in io:
                f_ = ruleio[f]
                if f in dynamicio:
                    yield "{} (dynamic)".format(f_)
                else:
                    yield f

        def format_ruleitem(name, value):
            return "" if not value else "\t{}: {}".format(name, value)

        desc = list()
        if not self.quiet:
            if job.message:
                desc.append(job.message)
            else:
                desc.append("{}rule {}:".format(self.rule_prefix(job), job.rule.name))
                for name, value in (
                    ("input", ", ".join(format_files(
                        job, job.input, job.ruleio, job.dynamic_input))),
                    ("output", ", ".join(format_files(
                        job, job.output, job.ruleio,
                        job.dynamic_output))),
                    ("log", job.log),
                    ("reason",
                        self.dag.reason(job) if self.printreason else None)):
                    if value:
                        desc.append(format_ruleitem(name, value))
                priority = self.dag.priority(job)
                if priority > 1:
                    desc.append(format_ruleitem(
                        "priority", "highest"
                        if priority == Job.HIGHEST_PRIORITY
                        else priority))
                if self.printthreads and job.threads > 1:
                    desc.append(format_ruleitem("threads", job.threads))
        if self.printshellcmds and job.shellcmd:
            desc.append(job.shellcmd)
        if desc:
            logger.info("\n".join(desc))
            if job.dynamic_output:
                logger.warning("Subsequent jobs will be added dynamically "
                    "depending on the output of this rule")
Code example #10
File: io.py  Project: tianyabeef/gutMicrobiome
def remove(file, remove_non_empty_dir=False):
    if os.path.exists(file):
        if os.path.isdir(file):
            if remove_non_empty_dir:
                shutil.rmtree(file)
            else:
                try:
                    os.removedirs(file)
                except OSError as e:
                    # skip non-empty directories (errno 39 is ENOTEMPTY on Linux)
                    if e.errno == 39:
                        logger.info("Skipped removing non-empty directory {}".format(e.filename))
                    else:
                        logger.warning(str(e))
        else:
            os.remove(file)
Code example #11
File: dag.py  Project: Kirill84/snakemake
    def finish(self, job, update_dynamic=True):
        self._finished.add(job)
        try:
            self._ready_jobs.remove(job)
        except KeyError:
            pass
        # mark depending jobs as ready
        for job_ in self.depending[job]:
            if self.needrun(job_) and self._ready(job_):
                self._ready_jobs.add(job_)

        if update_dynamic and job.dynamic_output:
            logger.warning("Dynamically updating jobs")
            newjob = self.update_dynamic(job)
            if newjob:
                # simulate that this job ran and was finished before
                self.omitforce.add(newjob)
                self._needrun.add(newjob)
                self._finished.add(newjob)

                self.postprocess()
                self.handle_protected(newjob)
Code example #12
File: dag.py  Project: Kirill84/snakemake
    def handle_temp(self, job):
        """ Remove temp files if they are no longer needed. """
        if self.notemp:
            return

        needed = lambda job_, f: any(
            f in files
            for j, files in self.depending[job_].items()
            if not self.finished(j) and self.needrun(j) and j != job
        )

        def unneeded_files():
            for job_, files in self.dependencies[job].items():
                for f in job_.temp_output & files:
                    if not needed(job_, f):
                        yield f
            for f in filterfalse(partial(needed, job), job.temp_output):
                if f not in self.targetfiles:
                    yield f

        for f in unneeded_files():
            logger.warning("Removing temporary output file {}".format(f))
            f.remove()
Code example #13
File: dag.py  Project: vangalamaheshh/snakemake
    def update_dynamic(self, job):
        dynamic_wildcards = job.dynamic_wildcards
        if not dynamic_wildcards:
            # this happens e.g. in dryrun if output is not yet present
            return

        depending = list(filter(lambda job_: not self.finished(job_),
                                self.bfs(self.depending, job)))
        newrule, non_dynamic_wildcards = job.rule.dynamic_branch(
            dynamic_wildcards,
            input=False)
        self.specialize_rule(job.rule, newrule)

        # no targetfile needed for job
        newjob = Job(newrule, self, format_wildcards=non_dynamic_wildcards)
        self.replace_job(job, newjob)
        for job_ in depending:
            if job_.dynamic_input:
                newrule_ = job_.rule.dynamic_branch(dynamic_wildcards)
                if newrule_ is not None:
                    self.specialize_rule(job_.rule, newrule_)
                    if not self.dynamic(job_):
                        logger.debug("Updating job {}.".format(job_))
                        newjob_ = Job(newrule_, self,
                                      targetfile=job_.targetfile)

                        unexpected_output = self.reason(
                            job_).missing_output.intersection(
                                newjob.existing_output)
                        if unexpected_output:
                            logger.warning(
                                "Warning: the following output files of rule {} were not "
                                "present when the DAG was created:\n{}".format(
                                    newjob_.rule, unexpected_output))

                        self.replace_job(job_, newjob_)
        return newjob
Code example #14
    def __init__(
        self,
        workflow,
        dag,
        cores,
        local_cores=1,
        dryrun=False,
        touch=False,
        cluster=None,
        cluster_status=None,
        cluster_config=None,
        cluster_sync=None,
        drmaa=None,
        drmaa_log_dir=None,
        kubernetes=None,
        container_image=None,
        tibanna=None,
        tibanna_sfn=None,
        google_lifesciences=None,
        google_lifesciences_regions=None,
        google_lifesciences_location=None,
        google_lifesciences_cache=False,
        tes=None,
        precommand="",
        preemption_default=None,
        preemptible_rules=None,
        tibanna_config=False,
        jobname=None,
        quiet=False,
        printreason=False,
        printshellcmds=False,
        keepgoing=False,
        max_jobs_per_second=None,
        max_status_checks_per_second=100,
        latency_wait=3,
        greediness=1.0,
        force_use_threads=False,
        assume_shared_fs=True,
        keepincomplete=False,
        keepmetadata=True,
        scheduler_type=None,
        scheduler_ilp_solver=None,
    ):
        """ Create a new instance of KnapsackJobScheduler. """
        from ratelimiter import RateLimiter

        self.cluster = cluster
        self.cluster_config = cluster_config
        self.cluster_sync = cluster_sync
        self.dag = dag
        self.workflow = workflow
        self.dryrun = dryrun
        self.touch = touch
        self.quiet = quiet
        self.keepgoing = keepgoing
        self.running = set()
        self.failed = set()
        self.finished_jobs = 0
        self.greediness = 1
        self.max_jobs_per_second = max_jobs_per_second
        self.keepincomplete = keepincomplete
        self.keepmetadata = keepmetadata
        self.scheduler_type = scheduler_type
        self.scheduler_ilp_solver = scheduler_ilp_solver

        self.global_resources = {
            name: (sys.maxsize if res is None else res)
            for name, res in workflow.global_resources.items()
        }
        self.resources = dict(self.global_resources)

        use_threads = (
            force_use_threads
            or (os.name != "posix")
            or cluster
            or cluster_sync
            or drmaa
        )
        self._open_jobs = threading.Semaphore(0)
        self._lock = threading.Lock()

        self._errors = False
        self._finished = False
        self._job_queue = None
        self._submit_callback = self._noop
        self._finish_callback = partial(
            self._proceed,
            update_dynamic=not self.dryrun,
            print_progress=not self.quiet and not self.dryrun,
        )

        self._local_executor = None
        if dryrun:
            self._executor = DryrunExecutor(
                workflow,
                dag,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
            )
        elif touch:
            self._executor = TouchExecutor(
                workflow,
                dag,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
            )
        elif cluster or cluster_sync or (drmaa is not None):
            if not workflow.immediate_submit:
                # No local jobs when using immediate submit!
                # Otherwise, they will fail due to missing input
                self._local_executor = CPUExecutor(
                    workflow,
                    dag,
                    local_cores,
                    printreason=printreason,
                    quiet=quiet,
                    printshellcmds=printshellcmds,
                    latency_wait=latency_wait,
                    cores=local_cores,
                    keepincomplete=keepincomplete,
                    keepmetadata=keepmetadata,
                )
            if cluster or cluster_sync:
                if cluster_sync:
                    constructor = SynchronousClusterExecutor
                else:
                    constructor = partial(
                        GenericClusterExecutor,
                        statuscmd=cluster_status,
                        max_status_checks_per_second=max_status_checks_per_second,
                    )

                self._executor = constructor(
                    workflow,
                    dag,
                    None,
                    submitcmd=(cluster or cluster_sync),
                    cluster_config=cluster_config,
                    jobname=jobname,
                    printreason=printreason,
                    quiet=quiet,
                    printshellcmds=printshellcmds,
                    latency_wait=latency_wait,
                    assume_shared_fs=assume_shared_fs,
                    keepincomplete=keepincomplete,
                    keepmetadata=keepmetadata,
                )
                if workflow.immediate_submit:
                    self._submit_callback = partial(
                        self._proceed,
                        update_dynamic=False,
                        print_progress=False,
                        update_resources=False,
                        handle_job_success=False,
                    )
            else:
                self._executor = DRMAAExecutor(
                    workflow,
                    dag,
                    None,
                    drmaa_args=drmaa,
                    drmaa_log_dir=drmaa_log_dir,
                    jobname=jobname,
                    printreason=printreason,
                    quiet=quiet,
                    printshellcmds=printshellcmds,
                    latency_wait=latency_wait,
                    cluster_config=cluster_config,
                    assume_shared_fs=assume_shared_fs,
                    max_status_checks_per_second=max_status_checks_per_second,
                    keepincomplete=keepincomplete,
                    keepmetadata=keepmetadata,
                )
        elif kubernetes:
            self._local_executor = CPUExecutor(
                workflow,
                dag,
                local_cores,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
                cores=local_cores,
                keepincomplete=keepincomplete,
                keepmetadata=keepmetadata,
            )

            self._executor = KubernetesExecutor(
                workflow,
                dag,
                kubernetes,
                container_image=container_image,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
                cluster_config=cluster_config,
                keepincomplete=keepincomplete,
                keepmetadata=keepmetadata,
            )
        elif tibanna:
            self._local_executor = CPUExecutor(
                workflow,
                dag,
                local_cores,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                use_threads=use_threads,
                latency_wait=latency_wait,
                cores=local_cores,
                keepincomplete=keepincomplete,
                keepmetadata=keepmetadata,
            )

            self._executor = TibannaExecutor(
                workflow,
                dag,
                cores,
                tibanna_sfn,
                precommand=precommand,
                tibanna_config=tibanna_config,
                container_image=container_image,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
                keepincomplete=keepincomplete,
                keepmetadata=keepmetadata,
            )
        elif google_lifesciences:
            self._local_executor = CPUExecutor(
                workflow,
                dag,
                local_cores,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
                cores=local_cores,
            )

            self._executor = GoogleLifeSciencesExecutor(
                workflow,
                dag,
                cores,
                container_image=container_image,
                regions=google_lifesciences_regions,
                location=google_lifesciences_location,
                cache=google_lifesciences_cache,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
                preemption_default=preemption_default,
                preemptible_rules=preemptible_rules,
            )
        elif tes:
            self._local_executor = CPUExecutor(
                workflow,
                dag,
                local_cores,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
                cores=local_cores,
                keepincomplete=keepincomplete,
            )

            self._executor = TaskExecutionServiceExecutor(
                workflow,
                dag,
                cores=local_cores,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                latency_wait=latency_wait,
                tes_url=tes,
                container_image=container_image,
            )

        else:
            self._executor = CPUExecutor(
                workflow,
                dag,
                cores,
                printreason=printreason,
                quiet=quiet,
                printshellcmds=printshellcmds,
                use_threads=use_threads,
                latency_wait=latency_wait,
                cores=cores,
                keepincomplete=keepincomplete,
                keepmetadata=keepmetadata,
            )
        if self.max_jobs_per_second and not self.dryrun:
            max_jobs_frac = Fraction(self.max_jobs_per_second).limit_denominator()
            self.rate_limiter = RateLimiter(
                max_calls=max_jobs_frac.numerator, period=max_jobs_frac.denominator
            )

        else:
            # essentially no rate limit
            self.rate_limiter = DummyRateLimiter()

        # Choose job selector (greedy or ILP)
        self.job_selector = self.job_selector_greedy
        if scheduler_type == "ilp":
            import pulp

            if pulp.apis.LpSolverDefault is None:
                logger.warning(
                    "Falling back to greedy scheduler because no default "
                    "solver is found for pulp (you have to install either "
                    "coincbc or glpk)."
                )
            else:
                self.job_selector = self.job_selector_ilp

        self._user_kill = None
        try:
            signal.signal(signal.SIGTERM, self.exit_gracefully)
        except ValueError:
            # If this fails, it is due to scheduler not being invoked in the main thread.
            # This can only happen with --gui, in which case it is fine for now.
            pass
        self._open_jobs.release()
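
The rate-limiting branch near the end converts a possibly fractional --max-jobs-per-second value into the integer max_calls/period arguments that ratelimiter.RateLimiter expects. A small sketch of that conversion (the 0.5 jobs-per-second value is just an example):

from fractions import Fraction
from ratelimiter import RateLimiter

max_jobs_per_second = 0.5  # e.g. at most one job every two seconds
frac = Fraction(max_jobs_per_second).limit_denominator()
limiter = RateLimiter(max_calls=frac.numerator, period=frac.denominator)

with limiter:
    pass  # submit one job here; entering the block is throttled
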
Code example #15
    def __init__(
        self,
        path,
        job,
        caption,
        env,
        category,
        wildcards_overwrite=None,
        mode_embedded=True,
    ):
        self.mode_embedded = mode_embedded
        self.path = path
        self.target = os.path.basename(path)
        self.size = os.path.getsize(self.path)
        logger.info("Adding {} ({:.2g} MB).".format(self.name,
                                                    self.size / 1e6))
        self.raw_caption = caption
        self.mime, _ = mime_from_file(self.path)

        h = hashlib.sha256()
        h.update(path.encode())

        self.id = h.hexdigest()
        self.job = job
        self._wildcards = (job.wildcards if wildcards_overwrite is None else
                           wildcards_overwrite)
        self.wildcards = logging.format_wildcards(self._wildcards)
        self.params = (logging.format_dict(job.params).replace("\n",
                                                               r"\n").replace(
                                                                   '"', r"\""))
        self.category = category

        self.table_content = None
        if self.is_table:
            if self.size > 1e6:
                logger.warning(
                    "Table {} >1MB. Rendering as generic file.".format(
                        self.path))
            else:
                with open(self.path) as table:
                    dialect = None
                    for prefix in range(10, 17):
                        try:
                            table.seek(0)
                            dialect = csv.Sniffer().sniff(table.read(prefix))
                            break
                        except csv.Error:
                            pass
                        except UnicodeDecodeError:
                            # table is not readable as UTF-8
                            break
                    if dialect is None:
                        logger.warning(
                            "Failed to infer CSV/TSV dialect from table {}. "
                            "Rendering as generic file.".format(self.path))
                    else:
                        table.seek(0)
                        reader = csv.reader(table, dialect)
                        columns = next(reader)
                        table = map(
                            lambda row: list(map(num_if_possible, row)),
                            reader)
                        template = env.get_template("table.html")
                        html = template.render(columns=columns,
                                               table=table,
                                               name=self.name).encode()

                        self.table_content = html
                        self.mime = "text/html"
                        self.path = os.path.basename(self.path) + ".html"

        self.data_uri = self._data_uri()
        self.png_uri = self._png_uri()
Code example #16
def auto_report(dag, path, stylesheet=None):
    try:
        from jinja2 import Template, Environment, PackageLoader
    except ImportError as e:
        raise WorkflowError(
            "Python package jinja2 must be installed to create reports.")

    mode_embedded = True
    if path.endswith(".zip"):
        mode_embedded = False
    elif not path.endswith(".html"):
        raise WorkflowError("Report file does not end with .html or .zip")

    custom_stylesheet = None
    if stylesheet is not None:
        try:
            with open(stylesheet) as s:
                custom_stylesheet = s.read()
        except (Exception, BaseException) as e:
            raise WorkflowError("Unable to read custom report stylesheet.", e)

    logger.info("Creating report...")

    env = Environment(
        loader=PackageLoader("snakemake", "report"),
        trim_blocks=True,
        lstrip_blocks=True,
    )
    env.filters["get_resource_as_string"] = get_resource_as_string

    persistence = dag.workflow.persistence
    results = defaultdict(lambda: defaultdict(list))
    records = defaultdict(JobRecord)
    recorded_files = set()
    for job in dag.jobs:
        for f in itertools.chain(job.expanded_output, job.input):
            if is_flagged(f, "report") and f not in recorded_files:
                if not f.exists:
                    raise WorkflowError("File {} marked for report but does "
                                        "not exist.".format(f))
                report_obj = get_flag_value(f, "report")

                def register_file(f, wildcards_overwrite=None):
                    wildcards = wildcards_overwrite or job.wildcards
                    category = Category(report_obj.category,
                                        wildcards=wildcards,
                                        job=job)
                    subcategory = Category(report_obj.subcategory,
                                           wildcards=wildcards,
                                           job=job)

                    results[category][subcategory].append(
                        FileRecord(
                            f,
                            job,
                            report_obj.caption,
                            env,
                            category,
                            wildcards_overwrite=wildcards_overwrite,
                            mode_embedded=mode_embedded,
                        ))
                    recorded_files.add(f)

                if os.path.isfile(f):
                    register_file(f)
                if os.path.isdir(f):
                    if not isinstance(report_obj.patterns, list):
                        raise WorkflowError(
                            "Invalid patterns given for report. Must be list.",
                            rule=job.rule,
                        )
                    if not report_obj.patterns:
                        raise WorkflowError(
                            "Directory marked for report but no file patterns given via patterns=[...]. "
                            "See report documentation.",
                            rule=job.rule,
                        )
                    for pattern in report_obj.patterns:
                        pattern = os.path.join(f, pattern)
                        wildcards = glob_wildcards(pattern)._asdict()
                        names = wildcards.keys()
                        for w in zip(*wildcards.values()):
                            w = dict(zip(names, w))
                            w.update(job.wildcards_dict)
                            w = Wildcards(fromdict=w)
                            f = apply_wildcards(pattern, w)
                            register_file(f, wildcards_overwrite=w)

        for f in job.expanded_output:
            meta = persistence.metadata(f)
            if not meta:
                logger.warning("Missing metadata for file {}. Maybe metadata "
                               "was deleted or it was created using an older "
                               "version of Snakemake. This is a non critical "
                               "warning.".format(f))
                continue
            try:
                job_hash = meta["job_hash"]
                rule = meta["rule"]
                rec = records[(job_hash, rule)]
                rec.rule = rule
                rec.job = job
                rec.starttime = min(rec.starttime, meta["starttime"])
                rec.endtime = max(rec.endtime, meta["endtime"])
                rec.conda_env_file = None
                rec.conda_env = meta["conda_env"]
                rec.container_img_url = meta["container_img_url"]
                rec.output.append(f)
            except KeyError as e:
                print(e)
                logger.warning("Metadata for file {} was created with a too "
                               "old Snakemake version.".format(f))

    for subcats in results.values():
        for catresults in subcats.values():
            catresults.sort(key=lambda res: res.name)

    # prepare runtimes
    runtimes = [{
        "rule": rec.rule,
        "runtime": rec.endtime - rec.starttime
    } for rec in sorted(records.values(), key=lambda rec: rec.rule)]

    # prepare end times
    timeline = [{
        "rule":
        rec.rule,
        "starttime":
        datetime.datetime.fromtimestamp(rec.starttime).isoformat(),
        "endtime":
        datetime.datetime.fromtimestamp(rec.endtime).isoformat(),
    } for rec in sorted(records.values(), key=lambda rec: rec.rule)]

    # prepare per-rule information
    rules = defaultdict(list)
    for rec in records.values():
        rule = RuleRecord(rec.job, rec)
        if rec.rule not in rules:
            rules[rec.rule].append(rule)
        else:
            merged = False
            for other in rules[rec.rule]:
                if rule == other:
                    other.add(rec)
                    merged = True
                    break
            if not merged:
                rules[rec.rule].append(rule)

    # rulegraph
    rulegraph, xmax, ymax = rulegraph_d3_spec(dag)

    # configfiles
    configfiles = [ConfigfileRecord(f) for f in dag.workflow.configfiles]

    seen = set()
    files = [
        seen.add(res.target) or res for cat in results.values()
        for subcat in cat.values() for res in subcat if res.target not in seen
    ]

    rst_links = textwrap.dedent("""

    .. _Workflow: javascript:show_panel('workflow')
    .. _Statistics: javascript:show_panel('statistics')
    {% for cat, catresults in categories|dictsort %}
    .. _{{ cat.name }}: javascript:show_panel("{{ cat.id }}")
    {% endfor %}
    {% for res in files %}
    .. _{{ res.target }}: javascript:show_panel("{{ res.category.id }}")
    {% endfor %}
    """)
    for cat, subcats in results.items():
        for subcat, catresults in subcats.items():
            for res in catresults:
                res.render(env, rst_links, results, files)

    # global description
    text = ""
    if dag.workflow.report_text:
        with open(dag.workflow.report_text) as f:

            class Snakemake:
                config = dag.workflow.config

            text = f.read() + rst_links
            text = publish_parts(
                env.from_string(text).render(snakemake=Snakemake,
                                             categories=results,
                                             files=files),
                writer_name="html",
            )["body"]

    # record time
    now = "{} {}".format(datetime.datetime.now().ctime(), time.tzname[0])
    results_size = sum(res.size for cat in results.values()
                       for subcat in cat.values() for res in subcat)

    try:
        from pygments.formatters import HtmlFormatter
    except ImportError:
        raise WorkflowError(
            "Python package pygments must be installed to create reports.")

    template = env.get_template("report.html")

    logger.info("Downloading resources and rendering HTML.")

    rendered = template.render(
        results=results,
        results_size=results_size,
        configfiles=configfiles,
        text=text,
        rulegraph_nodes=rulegraph["nodes"],
        rulegraph_links=rulegraph["links"],
        rulegraph_width=xmax + 20,
        rulegraph_height=ymax + 20,
        runtimes=runtimes,
        timeline=timeline,
        rules=[rec for recs in rules.values() for rec in recs],
        version=__version__,
        now=now,
        pygments_css=HtmlFormatter(style="trac").get_style_defs(".source"),
        custom_stylesheet=custom_stylesheet,
        mode_embedded=mode_embedded,
    )

    # TODO look into supporting .WARC format, also see (https://webrecorder.io)

    if not mode_embedded:
        with ZipFile(path, mode="w") as zipout:
            folder = Path(Path(path).stem)
            # store results in data folder
            for subcats in results.values():
                for catresults in subcats.values():
                    for result in catresults:
                        # write raw data
                        if result.table_content is not None:
                            zipout.writestr(
                                str(folder.joinpath(result.data_uri)),
                                result.table_content,
                            )
                        else:
                            zipout.write(result.path,
                                         str(folder.joinpath(result.data_uri)))
                        # write thumbnail
                        if result.is_img and result.png_content:
                            zipout.writestr(
                                str(folder.joinpath(result.png_uri)),
                                result.png_content)

            # write report html
            zipout.writestr(str(folder.joinpath("report.html")), rendered)
    else:
        with open(path, "w", encoding="utf-8") as htmlout:
            htmlout.write(rendered)

    logger.info("Report created: {}.".format(path))
Code example #17
File: workflow.py  Project: Kirill84/snakemake
    def execute(
        self, targets=None, dryrun=False,  touch=False, cores=1,
        forcetargets=False, forceall=False, forcerun=None,
        prioritytargets=None, quiet=False, keepgoing=False,
        printshellcmds=False, printreason=False, printdag=False,
        cluster=None, immediate_submit=False, ignore_ambiguity=False,
        workdir=None, printrulegraph=False,
        stats=None, force_incomplete=False, ignore_incomplete=False,
        list_version_changes=False, list_code_changes=False,
        list_input_changes=False, list_params_changes=False,
        summary=False, output_wait=3, nolock=False, unlock=False,
        resources=None, notemp=False, nodeps=False,
        cleanup_metadata=None):

        self.global_resources = dict() if cluster or resources is None else resources
        self.global_resources["_cores"] = cores

        def rules(items):
            return map(self._rules.__getitem__, filter(self.is_rule, items))

        def files(items):
            return map(os.path.relpath, filterfalse(self.is_rule, items))

        if workdir is None:
            workdir = os.getcwd() if self._workdir is None else self._workdir
        os.chdir(workdir)

        if not targets:
            targets = [self.first_rule] if self.first_rule is not None else list()
        if prioritytargets is None:
            prioritytargets = list()
        if forcerun is None:
            forcerun = list()

        priorityrules = set(rules(prioritytargets))
        priorityfiles = set(files(prioritytargets))
        forcerules = set(rules(forcerun))
        forcefiles = set(files(forcerun))
        targetrules = set(chain(
            rules(targets), filterfalse(Rule.has_wildcards, priorityrules),
            filterfalse(Rule.has_wildcards, forcerules)))
        targetfiles = set(chain(files(targets), priorityfiles, forcefiles))
        if forcetargets:
            forcefiles.update(targetfiles)
            forcerules.update(targetrules)

        dag = DAG(
            self, dryrun=dryrun, targetfiles=targetfiles,
            targetrules=targetrules,
            forceall=forceall, forcefiles=forcefiles,
            forcerules=forcerules, priorityfiles=priorityfiles,
            priorityrules=priorityrules, ignore_ambiguity=ignore_ambiguity,
            force_incomplete=force_incomplete,
            ignore_incomplete=ignore_incomplete, notemp=notemp)

        self.persistence = Persistence(nolock=nolock, dag=dag)

        if cleanup_metadata:
            for f in cleanup_metadata:
                self.persistence.cleanup_metadata(f)
            return True

        dag.init()
        dag.check_dynamic()

        if unlock:
            try:
                self.persistence.cleanup_locks()
                logger.warning("Unlocking working directory.")
                return True
            except IOError:
                logger.error("Error: Unlocking the directory {} failed. Maybe "
                "you don't have the permissions?")
                return False
        try:
            self.persistence.lock()
        except IOError:
            logger.critical("Error: Directory cannot be locked. Please make "
                "sure that no other Snakemake process is trying to create "
                "the same files in the following directory:\n{}\n"
                "If you are sure that no other "
                "instances of snakemake are running on this directory, "
                "the remaining lock was likely caused by a kill signal or "
                "a power loss. It can be removed with "
                "the --unlock argument.".format(os.getcwd()))
            return False

        dag.check_incomplete()
        dag.postprocess()

        if nodeps:
            missing_input = [
                f for job in dag.targetjobs for f in job.input
                if dag.needrun(job) and not os.path.exists(f)
            ]
            if missing_input:
                logger.critical("Dependency resolution disabled (--nodeps) "
                    "but missing input files detected. If this happens on a "
                    "cluster, please make sure that you handle the dependencies "
                    "yourself or turn off --immediate-submit. "
                    "Missing input files:\n{}".format(
                        "\n".join(missing_input)))
                return False

        if printdag:
            print(dag)
            return True
        elif printrulegraph:
            print(dag.rule_dot())
            return True
        elif summary:
            print("\n".join(dag.summary()))
            return True
        elif list_version_changes:
            items = list(chain(
                *map(self.persistence.version_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_code_changes:
            items = list(chain(
                *map(self.persistence.code_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_input_changes:
            items = list(chain(
                *map(self.persistence.input_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_params_changes:
            items = list(chain(
                *map(self.persistence.params_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True

        scheduler = JobScheduler(
            self, dag, cores, dryrun=dryrun, touch=touch, cluster=cluster,
            immediate_submit=immediate_submit,
            quiet=quiet, keepgoing=keepgoing,
            printreason=printreason, printshellcmds=printshellcmds,
            output_wait=output_wait)

        if not dryrun and not quiet and len(dag):
            if cluster:
                logger.warning("Provided cluster nodes: {}".format(cores))
            else:
                logger.warning("Provided cores: {}".format(cores))
            logger.warning("\n".join(dag.stats()))

        success = scheduler.schedule()

        if success:
            if dryrun:
                if not quiet:
                    logger.warning("\n".join(dag.stats()))
            elif stats:
                scheduler.stats.to_csv(stats)
        else:
            logger.critical(
                "Exiting because a job execution failed. "
                "Look above for error message")
            return False
        return True
Code example #18
File: rules.py  Project: kyleabeauchamp/mirrorsnake
    def _set_inoutput_item(self, item, output=False, name=None):
        """
        Set an item to be input or output.

        Arguments
        item     -- the item
        output   -- whether the item is an output (True) or an input (False)
        name     -- an optional name for the item
        """
        inoutput = self.output if output else self.input
        if isinstance(item, str):
            # add the rule to the dependencies
            if isinstance(item, _IOFile):
                self.dependencies[item] = item.rule
            if output:
                if self.wildcard_constraints or self.workflow._wildcard_constraints:
                    try:
                        item = update_wildcard_constraints(
                            item, self.wildcard_constraints,
                            self.workflow._wildcard_constraints)
                    except ValueError as e:
                        raise IOFileException(str(e),
                                              snakefile=self.snakefile,
                                              lineno=self.lineno)
            else:
                if contains_wildcard_constraints(item):
                    logger.warning(
                        "wildcard constraints in inputs are ignored")
            _item = IOFile(item, rule=self)
            if is_flagged(item, "temp"):
                if output:
                    self.temp_output.add(_item)
            if is_flagged(item, "protected"):
                if output:
                    self.protected_output.add(_item)
            if is_flagged(item, "touch"):
                if output:
                    self.touch_output.add(_item)
            if is_flagged(item, "dynamic"):
                if output:
                    self.dynamic_output.add(_item)
                else:
                    self.dynamic_input.add(_item)
            if is_flagged(item, "subworkflow"):
                if output:
                    raise SyntaxError(
                        "Only input files may refer to a subworkflow")
                else:
                    # record the workflow this item comes from
                    self.subworkflow_input[_item] = item.flags["subworkflow"]
            inoutput.append(_item)
            if name:
                inoutput.add_name(name)
        elif callable(item):
            if output:
                raise SyntaxError(
                    "Only input files can be specified as functions")
            inoutput.append(item)
            if name:
                inoutput.add_name(name)
        else:
            try:
                start = len(inoutput)
                for i in item:
                    self._set_inoutput_item(i, output=output)
                if name:
                    # if the list was named, make it accessible
                    inoutput.set_name(name, start, end=len(inoutput))
            except TypeError:
                raise SyntaxError(
                    "Input and output files have to be specified as strings or lists of strings."
                )
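
The flags inspected above (temp, protected, dynamic, ...) are attached in the Snakefile. An illustrative rule using two of them; the shell commands are placeholders:

rule example:
    input:
        "data/raw.txt"
    output:
        temp("work/intermediate.txt"),
        protected("results/final.txt")
    shell:
        "preprocess {input} > {output[0]} && summarize {output[0]} > {output[1]}"
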
Code example #19
File: utils.py  Project: shiltemann/snakemake
def validate(data, schema, set_default=True):
    """Validate data with JSON schema at given path.

    Args:
        data (object): data to validate. Can be a config dict or a pandas data frame.
        schema (str): Path to JSON schema used for validation. The schema can also be
            in YAML format. If validating a pandas data frame, the schema has to
            describe a row record (i.e., a dict with column names as keys pointing
            to row values). See http://json-schema.org. The path is interpreted
            relative to the Snakefile when this function is called.
        set_default (bool): set default values defined in schema. See
            http://python-jsonschema.readthedocs.io/en/latest/faq/ for more
            information
    """
    try:
        import jsonschema
        from jsonschema import validators, RefResolver
    except ImportError:
        raise WorkflowError(
            "The Python 3 package jsonschema must be installed "
            "in order to use the validate directive.")

    if not os.path.isabs(schema):
        frame = inspect.currentframe().f_back
        # if workflow object is not available this has not been started from a workflow
        if "workflow" in frame.f_globals:
            workflow = frame.f_globals["workflow"]
            schema = os.path.join(workflow.current_basedir, schema)

    schemafile = schema
    schema = _load_configfile(schema, filetype="Schema")
    resolver = RefResolver(
        urljoin("file:", schemafile),
        schema,
        handlers={
            "file": lambda uri: _load_configfile(re.sub("^file://", "", uri))
        },
    )

    # Taken from http://python-jsonschema.readthedocs.io/en/latest/faq/
    def extend_with_default(validator_class):
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for property, subschema in properties.items():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

            for error in validate_properties(validator, properties, instance,
                                             schema):
                yield error

        return validators.extend(validator_class, {"properties": set_defaults})

    Validator = validators.validator_for(schema)
    if Validator.META_SCHEMA["$schema"] != schema["$schema"]:
        logger.warning(
            "No validator found for JSON Schema version identifier '{}'".
            format(schema["$schema"]))
        logger.warning(
            "Defaulting to validator for JSON Schema version '{}'".format(
                Validator.META_SCHEMA["$schema"]))
        logger.warning("Note that schema file may not be validated correctly.")
    DefaultValidator = extend_with_default(Validator)

    if not isinstance(data, dict):
        try:
            import pandas as pd

            recordlist = []
            if isinstance(data, pd.DataFrame):
                for i, record in enumerate(data.to_dict("records")):
                    record = {
                        k: v
                        for k, v in record.items() if not pd.isnull(v)
                    }
                    try:
                        if set_default:
                            DefaultValidator(
                                schema, resolver=resolver).validate(record)
                            recordlist.append(record)
                        else:
                            jsonschema.validate(record,
                                                schema,
                                                resolver=resolver)
                    except jsonschema.exceptions.ValidationError as e:
                        raise WorkflowError(
                            "Error validating row {} of data frame.".format(i),
                            e)
                if set_default:
                    newdata = pd.DataFrame(recordlist, data.index)
                    newcol = ~newdata.columns.isin(data.columns)
                    n = len(data.columns)
                    for col in newdata.loc[:, newcol].columns:
                        data.insert(n, col, newdata.loc[:, col])
                        n = n + 1
                return
        except ImportError:
            pass
        raise WorkflowError("Unsupported data type for validation.")
    else:
        try:
            if set_default:
                DefaultValidator(schema, resolver=resolver).validate(data)
            else:
                jsonschema.validate(data, schema, resolver=resolver)
        except jsonschema.exceptions.ValidationError as e:
            raise WorkflowError("Error validating config file.", e)
Code example #20
        def decorate(ruleinfo):
            if ruleinfo.wildcard_constraints:
                rule.set_wildcard_constraints(
                    *ruleinfo.wildcard_constraints[0],
                    **ruleinfo.wildcard_constraints[1])
            if ruleinfo.input:
                rule.set_input(*ruleinfo.input[0], **ruleinfo.input[1])
            if ruleinfo.output:
                rule.set_output(*ruleinfo.output[0], **ruleinfo.output[1])
            if ruleinfo.params:
                rule.set_params(*ruleinfo.params[0], **ruleinfo.params[1])
            # handle default resources
            if self.default_resources is not None:
                rule.resources = copy.deepcopy(self.default_resources.parsed)
            if ruleinfo.threads is not None:
                if (not isinstance(ruleinfo.threads, int)
                        and not isinstance(ruleinfo.threads, float)
                        and not callable(ruleinfo.threads)):
                    raise RuleException(
                        "Threads value has to be an integer, float, or a callable.",
                        rule=rule,
                    )
                if name in self.overwrite_threads:
                    rule.resources["_cores"] = self.overwrite_threads[name]
                else:
                    rule.resources["_cores"] = int(ruleinfo.threads)
            if ruleinfo.shadow_depth:
                if ruleinfo.shadow_depth not in (True, "shallow", "full",
                                                 "minimal"):
                    raise RuleException(
                        "Shadow must either be 'minimal', 'shallow', 'full', "
                        "or True (equivalent to 'full')",
                        rule=rule,
                    )
                if ruleinfo.shadow_depth is True:
                    rule.shadow_depth = "full"
                    logger.warning(
                        "Shadow is set to True in rule {} (equivalent to 'full'). It's encouraged to use the more explicit options 'minimal|shallow|full' instead."
                        .format(rule))
                else:
                    rule.shadow_depth = ruleinfo.shadow_depth
            if ruleinfo.resources:
                args, resources = ruleinfo.resources
                if args:
                    raise RuleException("Resources have to be named.")
                if not all(
                        map(lambda r: isinstance(r, int) or callable(r),
                            resources.values())):
                    raise RuleException(
                        "Resources values have to be integers or callables",
                        rule=rule)
                rule.resources.update(resources)
            if ruleinfo.priority:
                if not isinstance(ruleinfo.priority, int) and not isinstance(
                        ruleinfo.priority, float):
                    raise RuleException("Priority values have to be numeric.",
                                        rule=rule)
                rule.priority = ruleinfo.priority
            if ruleinfo.version:
                rule.version = ruleinfo.version
            if ruleinfo.log:
                rule.set_log(*ruleinfo.log[0], **ruleinfo.log[1])
            if ruleinfo.message:
                rule.message = ruleinfo.message
            if ruleinfo.benchmark:
                rule.benchmark = ruleinfo.benchmark
            if not self.run_local and ruleinfo.group is not None:
                rule.group = ruleinfo.group
            if ruleinfo.wrapper:
                if self.use_conda:
                    rule.conda_env = snakemake.wrapper.get_conda_env(
                        ruleinfo.wrapper, prefix=self.wrapper_prefix)
                # TODO retrieve suitable singularity image

            if self.use_env_modules and ruleinfo.env_modules:
                # If using environment modules and they are defined for the rule,
                # ignore conda and singularity directive below.
                # The reason is that this is likely intended in order to use
                # a software stack specifically compiled for a particular
                # HPC cluster.
                invalid_rule = not (ruleinfo.script or ruleinfo.wrapper
                                    or ruleinfo.shellcmd or ruleinfo.notebook)
                if invalid_rule:
                    raise RuleException(
                        "Modules directive is only allowed with "
                        "shell, script, notebook, or wrapper directives (not with run)",
                        rule=rule,
                    )
                from snakemake.deployment.env_modules import EnvModules

                rule.env_modules = EnvModules(*ruleinfo.env_modules)
            else:
                if ruleinfo.conda_env and self.use_conda:
                    if not (ruleinfo.script or ruleinfo.wrapper
                            or ruleinfo.shellcmd or ruleinfo.notebook):
                        raise RuleException(
                            "Conda environments are only allowed "
                            "with shell, script, notebook, or wrapper directives "
                            "(not with run).",
                            rule=rule,
                        )
                    if not (urllib.parse.urlparse(ruleinfo.conda_env).scheme
                            or os.path.isabs(ruleinfo.conda_env)):
                        ruleinfo.conda_env = os.path.join(
                            self.current_basedir, ruleinfo.conda_env)
                    rule.conda_env = ruleinfo.conda_env

                if self.use_singularity:
                    invalid_rule = not (ruleinfo.script or ruleinfo.wrapper or
                                        ruleinfo.shellcmd or ruleinfo.notebook)
                    if ruleinfo.singularity_img:
                        if invalid_rule:
                            raise RuleException(
                                "Singularity directive is only allowed "
                                "with shell, script, notebook or wrapper directives "
                                "(not with run).",
                                rule=rule,
                            )
                        rule.singularity_img = ruleinfo.singularity_img
                    elif self.global_singularity_img:
                        if not invalid_rule:
                            # skip rules with run directive
                            rule.singularity_img = self.global_singularity_img

            rule.norun = ruleinfo.norun
            rule.docstring = ruleinfo.docstring
            rule.run_func = ruleinfo.func
            rule.shellcmd = ruleinfo.shellcmd
            rule.script = ruleinfo.script
            rule.notebook = ruleinfo.notebook
            rule.wrapper = ruleinfo.wrapper
            rule.cwl = ruleinfo.cwl
            rule.restart_times = self.restart_times
            rule.basedir = self.current_basedir

            ruleinfo.func.__name__ = "__{}".format(rule.name)
            self.globals[ruleinfo.func.__name__] = ruleinfo.func
            setattr(rules, rule.name, RuleProxy(rule))
            if checkpoint:
                checkpoints.register(rule)
            return ruleinfo.func
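
The registration code above validates the threads and shadow directives before they are stored on the rule. A minimal standalone sketch of that coercion (illustrative only, not Snakemake API; the override mirrors what --set-threads does):

def normalize_threads(threads, overwrite=None):
    if not isinstance(threads, (int, float)) and not callable(threads):
        raise ValueError("Threads value has to be an integer, float, or a callable.")
    # an explicit override (e.g. from --set-threads) wins over the rule's own value
    return overwrite if overwrite is not None else int(threads)

def normalize_shadow(depth):
    if depth not in (True, "shallow", "full", "minimal"):
        raise ValueError("Shadow must be 'minimal', 'shallow', 'full', or True.")
    return "full" if depth is True else depth

assert normalize_threads(2.7) == 2              # floats are truncated to whole cores
assert normalize_threads(8, overwrite=4) == 4   # an override replaces the declared value
assert normalize_shadow(True) == "full"
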
コード例 #21
0
def auto_report(dag, path):
    try:
        from jinja2 import Template, Environment, PackageLoader
    except ImportError as e:
        raise WorkflowError(
            "Python package jinja2 must be installed to create reports."
        ) from e

    if not path.endswith(".html"):
        raise WorkflowError("Report file does not end with .html")

    logger.info("Creating report...")

    env = Environment(
        loader=PackageLoader("snakemake", "report"),
        trim_blocks=True,
        lstrip_blocks=True,
    )
    env.filters["get_resource_as_string"] = get_resource_as_string

    persistence = dag.workflow.persistence
    results = defaultdict(list)
    records = defaultdict(JobRecord)
    recorded_files = set()
    for job in dag.jobs:
        for f in itertools.chain(job.expanded_output, job.input):
            if is_flagged(f, "report") and f not in recorded_files:
                if not f.exists:
                    raise WorkflowError(
                        "File {} marked for report but does " "not exist.".format(f)
                    )
                if os.path.isfile(f):
                    report_obj = get_flag_value(f, "report")
                    category = Category(report_obj.category)
                    results[category].append(
                        FileRecord(f, job, report_obj.caption, env, category)
                    )
                    recorded_files.add(f)

        for f in job.expanded_output:
            meta = persistence.metadata(f)
            if not meta:
                logger.warning(
                    "Missing metadata for file {}. Maybe metadata "
                    "was deleted or it was created using an older "
                    "version of Snakemake. This is a non-critical "
                    "warning.".format(f)
                )
                continue
            try:
                job_hash = meta["job_hash"]
                rule = meta["rule"]
                rec = records[(job_hash, rule)]
                rec.rule = rule
                rec.job = job
                rec.starttime = min(rec.starttime, meta["starttime"])
                rec.endtime = max(rec.endtime, meta["endtime"])
                rec.conda_env_file = None
                rec.conda_env = meta["conda_env"]
                rec.singularity_img_url = meta["singularity_img_url"]
                rec.output.append(f)
            except KeyError as e:
                print(e)
                logger.warning(
                    "Metadata for file {} was created by a Snakemake "
                    "version that is too old.".format(f)
                )

    for catresults in results.values():
        catresults.sort(key=lambda res: res.name)

    # prepare runtimes
    runtimes = [
        {"rule": rec.rule, "runtime": rec.endtime - rec.starttime}
        for rec in sorted(records.values(), key=lambda rec: rec.rule)
    ]

    # prepare end times
    timeline = [
        {
            "rule": rec.rule,
            "starttime": datetime.datetime.fromtimestamp(rec.starttime).isoformat(),
            "endtime": datetime.datetime.fromtimestamp(rec.endtime).isoformat(),
        }
        for rec in sorted(records.values(), key=lambda rec: rec.rule)
    ]

    # prepare per-rule information
    rules = defaultdict(list)
    for rec in records.values():
        rule = RuleRecord(rec.job, rec)
        if rec.rule not in rules:
            rules[rec.rule].append(rule)
        else:
            merged = False
            for other in rules[rec.rule]:
                if rule == other:
                    other.add(rec)
                    merged = True
                    break
            if not merged:
                rules[rec.rule].append(rule)

    # rulegraph
    rulegraph, xmax, ymax = rulegraph_d3_spec(dag)

    # configfiles
    configfiles = [ConfigfileRecord(f) for f in dag.workflow.configfiles]

    seen = set()
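    # Deduplicate results by target name across categories: set.add() returns None,
    # so "seen.add(res.target) or res" records the target and still evaluates to res.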
    files = [
        seen.add(res.target) or res
        for cat in results.values()
        for res in cat
        if res.target not in seen
    ]

    rst_links = textwrap.dedent(
        """

    .. _Results: #results
    .. _Rules: #rules
    .. _Statistics: #stats
    {% for cat, catresults in categories|dictsort %}
    .. _{{ cat.name }}: #{{ cat.id }}
    {% for res in files %}
    .. _{{ res.target }}: #{{ res.id }}
    {% endfor %}
    {% endfor %}
    .. _
    """
    )
    for cat, catresults in results.items():
        for res in catresults:
            res.render(env, rst_links, results, files)

    # global description
    text = ""
    if dag.workflow.report_text:
        with open(dag.workflow.report_text) as f:

            class Snakemake:
                config = dag.workflow.config

            text = f.read() + rst_links
            text = publish_parts(
                env.from_string(text).render(
                    snakemake=Snakemake, categories=results, files=files
                ),
                writer_name="html",
            )["body"]

    # record time
    now = "{} {}".format(datetime.datetime.now().ctime(), time.tzname[0])
    results_size = sum(res.size for cat in results.values() for res in cat)

    try:
        from pygments.formatters import HtmlFormatter
    except ImportError:
        raise WorkflowError(
            "Python package pygments must be installed to create reports."
        )

    # render HTML
    template = env.get_template("report.html")
    with open(path, "w", encoding="utf-8") as out:
        out.write(
            template.render(
                results=results,
                results_size=results_size,
                configfiles=configfiles,
                text=text,
                rulegraph_nodes=rulegraph["nodes"],
                rulegraph_links=rulegraph["links"],
                rulegraph_width=xmax + 20,
                rulegraph_height=ymax + 20,
                runtimes=runtimes,
                timeline=timeline,
                rules=[rec for recs in rules.values() for rec in recs],
                version=__version__,
                now=now,
                pygments_css=HtmlFormatter(style="trac").get_style_defs(".source"),
            )
        )
    logger.info("Report created.")
コード例 #22
0
 def __init__(self, path, job, caption, env, category):
     self.path = path
     self.target = os.path.basename(path)
     self.size = os.path.getsize(self.path)
     logger.info("Adding {} ({:.2g} MB).".format(self.name, self.size / 1e6))
     self.raw_caption = caption
     self.mime, _ = mime_from_file(self.path)
     self.id = uuid.uuid4()
     self.job = job
     self.wildcards = logging.format_wildcards(job.wildcards)
     self.params = logging.format_dict(job.params)
     self.png_uri = None
     self.category = category
     if self.is_img:
         convert = shutil.which("convert")
         if convert is not None:
             try:
                 # 2048 aims at a reasonable balance between what displays
                 # can show in a png preview and what renders quickly
                 # into a small enough png
                 max_width = "2048"
                 max_height = "2048"
                 # '>' means only larger images are scaled down to fit within the max dimensions
                 max_spec = max_width + "x" + max_height + ">"
                 png = sp.check_output(
                     ["convert", "-resize", max_spec, self.path, "png:-"],
                     stderr=sp.PIPE,
                 )
                 uri = data_uri(
                     png, os.path.basename(self.path) + ".png", mime="image/png"
                 )
                 self.png_uri = uri
             except sp.CalledProcessError as e:
                 logger.warning(
                     "Failed to convert image to png with "
                     "imagemagick convert: {}".format(e.stderr)
                 )
         else:
             logger.warning(
                 "Command convert not in $PATH. Install "
                 "imagemagick in order to have embedded "
                 "images and pdfs in the report."
             )
     if self.is_table:
         if self.size > 1e6:
             logger.warning(
                 "Table {} >1MB. Rendering as generic file.".format(self.path)
             )
         else:
             with open(self.path) as table:
                 dialect = None
                 for prefix in range(10, 17):
                     try:
                         table.seek(0)
                         dialect = csv.Sniffer().sniff(table.read(prefix))
                         break
                     except csv.Error:
                         pass
                     except UnicodeDecodeError:
                         # table is not readable as UTF-8
                         break
                 if dialect is None:
                     logger.warning(
                         "Failed to infer CSV/TSV dialect from table {}. "
                         "Rendering as generic file.".format(self.path)
                     )
                 else:
                     table.seek(0)
                     reader = csv.reader(table, dialect)
                     columns = next(reader)
                     table = map(lambda row: list(map(num_if_possible, row)), reader)
                     template = env.get_template("table.html")
                     html = template.render(
                         columns=columns, table=table, name=self.name
                     ).encode()
                     self.mime = "text/html"
                     self.path = os.path.basename(self.path) + ".html"
                     self.data_uri = data_uri(html, self.path, mime=self.mime)
                     return
     # fallback
     self.data_uri = data_uri_from_file(path)
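
The table branch above uses csv.Sniffer to guess the delimiter before rendering a table. A small self-contained illustration of that dialect detection (the sample data is made up):

import csv
import io

sample = "gene\tcount\nTP53\t42\nBRCA1\t7\n"
dialect = csv.Sniffer().sniff(sample)              # detects the tab delimiter
rows = list(csv.reader(io.StringIO(sample), dialect))
print(rows)  # [['gene', 'count'], ['TP53', '42'], ['BRCA1', '7']]
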
コード例 #23
0
    def fetch_from_ncbi(self,
                        accession_list,
                        destination_dir,
                        force_overwrite=False,
                        rettype="fasta",
                        retmode="text",
                        file_ext=None,
                        combined_file_prefix=None,
                        remove_separate_files=False,
                        chunk_size=1,
                        db="nuccore",
                        **kwargs):
        """
        This function downloads and saves files from NCBI.
        Adapted in part from the BSD-licensed code here:
          https://github.com/broadinstitute/viral-ngs/blob/master/util/genbank.py
        """

        max_chunk_size = 500

        # Conform to NCBI retrieval guidelines by chunking into 500-accession chunks if
        # >500 accessions are specified and chunk_size is left at 1.
        # Also clamp the chunk size to 500 if the user specified a larger value.
        if chunk_size > max_chunk_size or (len(accession_list) > max_chunk_size
                                           and chunk_size == 1):
            chunk_size = max_chunk_size

        outEx = {"fasta": "fasta", "ft": "tbl", "gb": "gbk"}

        output_directory = os.path.abspath(os.path.expanduser(destination_dir))

        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

        output_extension = str(file_ext)

        # ensure the extension starts with a ".", also allowing for passed-in
        # extensions that already have it
        if output_extension[:1] != ".":
            output_extension = "." + output_extension

        logger.info("Fetching {} entries from NCBI: {}\n".format(
            str(len(accession_list)), ", ".join(accession_list[:10])))
        output_files = []

        for chunk_num, chunk in enumerate(
                self._seq_chunks(accession_list, chunk_size)):
            # sleep to throttle requests to 2 per second per NCBI guidelines:
            #   https://www.ncbi.nlm.nih.gov/books/NBK25497/#chapter2.Usage_Guidelines_and_Requiremen
            time.sleep(0.5)
            acc_string = ",".join(chunk)

            # if the filename would be longer than Linux allows, simply say "chunk-chunk_num"
            if len(acc_string) + len(output_extension) <= 254:
                output_file_path = os.path.join(output_directory,
                                                acc_string + output_extension)
            else:
                output_file_path = os.path.join(
                    output_directory,
                    "chunk-{}".format(chunk_num) + output_extension)

            if not force_overwrite:
                logger.info("Not overwriting; checking that the output file does not already exist.")
                assert not os.path.exists(output_file_path), (
                    """File %s already exists. Consider removing
                    this file or specifying a different output directory. The files for the specified
                    accessions can be overwritten if you set the force_overwrite flag. Processing aborted."""
                    % output_file_path)

            try_count = 1
            while True:
                try:
                    logger.info("Fetching file {}: {}, try #{}".format(
                        chunk_num + 1, acc_string, try_count))
                    handle = self.entrez.efetch(db=db,
                                                rettype=rettype,
                                                retmode=retmode,
                                                id=acc_string,
                                                **kwargs)

                    with open(output_file_path, "w") as outf:
                        for line in handle:
                            outf.write(line)
                    output_files.append(output_file_path)
                except IOError:
                    logger.warning(
                        "Error fetching file {}: {}, try #{}, probably because NCBI is too busy."
                        .format(chunk_num + 1, acc_string, try_count))

                    try_count += 1
                    if try_count > 4:
                        logger.warning("Tried too many times. Aborting.")
                        raise

                    # if the fetch failed, wait a few seconds and try again.
                    logger.info("Waiting and retrying...")
                    time.sleep(2)

                    continue
                break

        # assert that we are not trying to remove the intermediate files without writing a combined file
        if remove_separate_files:
            assert combined_file_prefix, """The intermediate files
                can only be removed if a combined file is written via combined_file_prefix"""

        # build a path to the combined genome file
        if combined_file_prefix:
            concatenated_genome_file_path = os.path.join(
                output_directory, combined_file_prefix + output_extension)

            if not force_overwrite:
                assert not os.path.exists(concatenated_genome_file_path), (
                    """File %s already exists. Consider removing
                    this file or specifying a different output directory. The files for the specified
                    accessions can be overwritten if you set the force_overwrite flag. Processing aborted."""
                    % concatenated_genome_file_path)

            # concatenate the files together into one genome file
            with open(concatenated_genome_file_path, "w") as outfile:
                for file_path in output_files:
                    with open(file_path) as infile:
                        for line in infile:
                            outfile.write(line)

            # if the option is specified, remove the intermediate fasta files
            if remove_separate_files:
                while len(output_files) > 0:
                    os.unlink(output_files.pop())

            # add the combined file to the list of files returned
            output_files.append(concatenated_genome_file_path)

        # return list of files
        return output_files
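
The helper self._seq_chunks used above is not shown in this snippet; a plausible minimal version, for illustration only, simply slices the accession list into chunks of at most chunk_size entries:

def seq_chunks(accessions, chunk_size):
    """Yield successive chunks of at most chunk_size accessions."""
    for i in range(0, len(accessions), chunk_size):
        yield accessions[i:i + chunk_size]

accessions = ["ACC{:04d}".format(i) for i in range(1, 1203)]
print([len(c) for c in seq_chunks(accessions, 500)])  # [500, 500, 202]
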
コード例 #24
0
    def create(self, dryrun=False):
        """Create the conda environment."""
        from snakemake.shell import shell

        self.check_is_file_based()

        # Read env file and create hash.
        env_file = self.file
        deploy_file = None
        pin_file = None
        tmp_env_file = None
        tmp_deploy_file = None
        tmp_pin_file = None

        if not isinstance(env_file, LocalSourceFile) or isinstance(
                env_file, LocalGitFile):
            with tempfile.NamedTemporaryFile(delete=False,
                                             suffix=".yaml") as tmp:
                # write to temp file such that conda can open it
                tmp.write(self.content)
                env_file = tmp.name
                tmp_env_file = tmp.name
            if self.post_deploy_file:
                with tempfile.NamedTemporaryFile(
                        delete=False, suffix=".post-deploy.sh") as tmp:
                    # write to temp file such that conda can open it
                    tmp.write(self.content_deploy)
                    deploy_file = tmp.name
                    tmp_deploy_file = tmp.name
            if self.pin_file:
                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix="pin.txt") as tmp:
                    tmp.write(self.content_pin)
                    pin_file = tmp.name
                    tmp_pin_file = tmp.name
        else:
            env_file = env_file.get_path_or_uri()
            deploy_file = self.post_deploy_file
            pin_file = self.pin_file

        env_path = self.address

        if self.is_containerized:
            if not dryrun:
                try:
                    shell.check_output(
                        singularity.shellcmd(
                            self._container_img.path,
                            "[ -d '{}' ]".format(env_path),
                            args=self._singularity_args,
                            envvars=self.get_singularity_envvars(),
                            quiet=True,
                        ),
                        stderr=subprocess.PIPE,
                    )
                except subprocess.CalledProcessError as e:
                    raise WorkflowError(
                        "Unable to find environment in container image. "
                        "Maybe a conda environment was modified without containerizing again "
                        "(see snakemake --containerize)?\nDetails:\n{}\n{}".
                        format(e, e.stderr.decode()))
                return env_path
            else:
                # env should be present in the container
                return env_path

        # Check for broken environment
        if os.path.exists(os.path.join(
                env_path, "env_setup_start")) and not os.path.exists(
                    os.path.join(env_path, "env_setup_done")):
            if dryrun:
                logger.info(
                    "Incomplete Conda environment {} will be recreated.".
                    format(self.file.simplify_path()))
            else:
                logger.info(
                    "Removing incomplete Conda environment {}...".format(
                        self.file.simplify_path()))
                shutil.rmtree(env_path, ignore_errors=True)

        # Create environment if not already present.
        if not os.path.exists(env_path):
            if dryrun:
                logger.info("Conda environment {} will be created.".format(
                    self.file.simplify_path()))
                return env_path
            logger.info("Creating conda environment {}...".format(
                self.file.simplify_path()))
            env_archive = self.archive_file
            try:
                # Touch "start" flag file
                os.makedirs(env_path, exist_ok=True)
                with open(os.path.join(env_path, "env_setup_start"), "a") as f:
                    pass

                # Check if env archive exists. Use that if present.
                if os.path.exists(env_archive):
                    logger.info("Installing archived conda packages.")
                    pkg_list = os.path.join(env_archive, "packages.txt")
                    if os.path.exists(pkg_list):
                        # read packages in the correct order
                        # this is for newer env archives where the package list
                        # was stored
                        packages = [
                            os.path.join(env_archive, pkg.rstrip())
                            for pkg in open(pkg_list)
                        ]
                    else:
                        # guess order
                        packages = glob(os.path.join(env_archive, "*.tar.bz2"))

                    # install packages manually from env archive
                    cmd = " ".join([
                        "conda",
                        "create",
                        "--quiet",
                        "--yes",
                        "--prefix '{}'".format(env_path),
                    ] + packages)
                    if self._container_img:
                        cmd = singularity.shellcmd(
                            self._container_img.path,
                            cmd,
                            args=self._singularity_args,
                            envvars=self.get_singularity_envvars(),
                        )
                    out = shell.check_output(cmd,
                                             stderr=subprocess.STDOUT,
                                             universal_newlines=True)
                else:

                    def create_env(env_file, filetype="yaml"):
                        # Copy env file to env_path (because they can be on
                        # different volumes and singularity should only mount one).
                        # In addition, this allows to immediately see what an
                        # environment in .snakemake/conda contains.
                        target_env_file = env_path + f".{filetype}"
                        shutil.copy(env_file, target_env_file)

                        logger.info(
                            "Downloading and installing remote packages.")

                        strict_priority = ([
                            "conda config --set channel_priority strict &&"
                        ] if self._container_img else [])

                        subcommand = [self.frontend]
                        yes_flag = ["--yes"]
                        if filetype == "yaml":
                            subcommand.append("env")
                            yes_flag = []

                        cmd = (strict_priority + subcommand + [
                            "create",
                            "--quiet",
                            '--file "{}"'.format(target_env_file),
                            '--prefix "{}"'.format(env_path),
                        ] + yes_flag)
                        cmd = " ".join(cmd)
                        if self._container_img:
                            cmd = singularity.shellcmd(
                                self._container_img.path,
                                cmd,
                                args=self._singularity_args,
                                envvars=self.get_singularity_envvars(),
                            )
                        out = shell.check_output(cmd,
                                                 stderr=subprocess.STDOUT,
                                                 universal_newlines=True)

                        # cleanup if requested
                        if self._cleanup is CondaCleanupMode.tarballs:
                            logger.info("Cleaning up conda package tarballs.")
                            shell.check_output("conda clean -y --tarballs")
                        elif self._cleanup is CondaCleanupMode.cache:
                            logger.info(
                                "Cleaning up conda package tarballs and package cache."
                            )
                            shell.check_output(
                                "conda clean -y --tarballs --packages")
                        return out

                    if pin_file is not None:
                        try:
                            logger.info(
                                f"Using pinnings from {self.pin_file.get_path_or_uri()}."
                            )
                            out = create_env(pin_file, filetype="pin.txt")
                        except subprocess.CalledProcessError as e:
                            # remove potential partially installed environment
                            shutil.rmtree(env_path, ignore_errors=True)
                            advice = ""
                            if isinstance(self.file, LocalSourceFile):
                                advice = (
                                    " If that works, make sure to update the pin file with "
                                    f"'snakedeploy pin-conda-env {self.file.get_path_or_uri()}'."
                                )
                            logger.warning(
                                f"Failed to install conda environment from pin file ({self.pin_file.get_path_or_uri()}). "
                                f"Trying regular environment definition file.{advice}"
                            )
                            out = create_env(env_file, filetype="yaml")
                    else:
                        out = create_env(env_file, filetype="yaml")

                # Execute post-deploy script if present
                if deploy_file:
                    target_deploy_file = env_path + ".post-deploy.sh"
                    shutil.copy(deploy_file, target_deploy_file)
                    self.execute_deployment_script(env_file,
                                                   target_deploy_file)

                # Touch "done" flag file
                with open(os.path.join(env_path, "env_setup_done"), "a") as f:
                    pass

                logger.debug(out)
                logger.info(
                    f"Environment for {self.file.get_path_or_uri()} created (location: {os.path.relpath(env_path)})"
                )
            except subprocess.CalledProcessError as e:
                # remove potential partially installed environment
                shutil.rmtree(env_path, ignore_errors=True)
                raise CreateCondaEnvironmentException(
                    f"Could not create conda environment from {env_file}:\nCommand:\n{e.cmd}\nOutput:\n{e.output}"
                )

        if tmp_env_file:
            # temporary file was created
            os.remove(tmp_env_file)
        if tmp_deploy_file:
            os.remove(tmp_deploy_file)
        if tmp_pin_file:
            os.remove(tmp_pin_file)

        return env_path
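
For reference, a sketch of the command string that create_env() above assembles for a YAML environment file, assuming the frontend is conda and using placeholder paths:

env_path = ".snakemake/conda/0ab1c2d3"        # placeholder environment prefix
target_env_file = env_path + ".yaml"
cmd = " ".join([
    "conda", "env", "create", "--quiet",
    '--file "{}"'.format(target_env_file),
    '--prefix "{}"'.format(env_path),
])
print(cmd)
# conda env create --quiet --file ".snakemake/conda/0ab1c2d3.yaml" --prefix ".snakemake/conda/0ab1c2d3"
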
コード例 #25
0
ファイル: shell.py プロジェクト: yh154/snakemake
        if jobid is not None:
            with cls._lock:
                del cls._processes[jobid]

        if retcode:
            raise sp.CalledProcessError(retcode, cmd)
        return ret

    @staticmethod
    def iter_stdout(proc, cmd):
        for l in proc.stdout:
            yield l[:-1]
        retcode = proc.wait()
        if retcode:
            raise sp.CalledProcessError(retcode, cmd)


# set bash as default shell on posix compatible OS
if os.name == "posix":
    if not shutil.which("bash"):
        logger.warning("Cannot set bash as default shell because it is not "
                       "available in your PATH. Falling back to sh.")
        if not shutil.which("sh"):
            logger.warning("Cannot fall back to sh since it seems to be not "
                           "available on this system. Using whatever is "
                           "defined as default.")
        else:
            shell.executable("sh")
    else:
        shell.executable("bash")
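
Once the default executable is configured as above, the helper can be called directly. A minimal usage sketch, assuming snakemake is installed and that read=True captures stdout as in recent versions:

from snakemake.shell import shell

shell("echo hello > greeting.txt")               # runs via the configured bash/sh
greeting = shell("cat greeting.txt", read=True)  # capture the command's stdout
print(greeting)
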
コード例 #26
0
ファイル: rules.py プロジェクト: epruesse/snakemake
    def _set_inoutput_item(self, item, output=False, name=None):
        """
        Set an item to be input or output.

        Arguments
        item     -- the item
        output   -- whether the item is an output (True) or an input (False)
        name     -- an optional name for the item
        """
        inoutput = self.output if output else self.input

        # Check to see if the item is a path, if so, just make it a string
        if isinstance(item, Path):
            item = str(item)
        if isinstance(item, str):
            if ON_WINDOWS:
                if isinstance(item, (_IOFile, AnnotatedString)):
                    item = item.new_from(item.replace(os.sep, os.altsep))
                else:
                    item = item.replace(os.sep, os.altsep)

            rule_dependency = None
            if isinstance(item, _IOFile) and item.rule and item in item.rule.output:
                rule_dependency = item.rule

            item = self.apply_path_modifier(
                item, property="output" if output else "input"
            )

            # Check to see that all flags are valid
            # Note that "remote", "dynamic", and "expand" are valid for both inputs and outputs.
            if isinstance(item, AnnotatedString):
                for flag in item.flags:
                    if not output and flag in [
                        "protected",
                        "temp",
                        "temporary",
                        "directory",
                        "touch",
                        "pipe",
                    ]:
                        logger.warning(
                            "The flag '{}' used in rule {} is only valid for outputs, not inputs.".format(
                                flag, self
                            )
                        )
                    if output and flag in ["ancient"]:
                        logger.warning(
                            "The flag '{}' used in rule {} is only valid for inputs, not outputs.".format(
                                flag, self
                            )
                        )

            # add the rule to the dependencies
            if rule_dependency is not None:
                self.dependencies[item] = rule_dependency
            if output:
                item = self._update_item_wildcard_constraints(item)
            else:
                if (
                    contains_wildcard_constraints(item)
                    and self.workflow.mode != Mode.subprocess
                ):
                    logger.warning(
                        "Wildcard constraints in inputs are ignored. (rule: {})".format(
                            self
                        )
                    )

            if self.workflow.all_temp and output:
                # mark as temp if all output files shall be marked as temp
                item = snakemake.io.flag(item, "temp")

            # record the rule on the IOFile (relevant if this is an output file)
            _item = IOFile(item, rule=self)

            if is_flagged(item, "temp"):
                if output:
                    self.temp_output.add(_item)
            if is_flagged(item, "protected"):
                if output:
                    self.protected_output.add(_item)
            if is_flagged(item, "touch"):
                if output:
                    self.touch_output.add(_item)
            if is_flagged(item, "dynamic"):
                if output:
                    self.dynamic_output.add(_item)
                else:
                    self.dynamic_input.add(_item)
            if is_flagged(item, "report"):
                report_obj = item.flags["report"]
                if report_obj.caption is not None:
                    r = ReportObject(
                        self.workflow.current_basedir.join(report_obj.caption),
                        report_obj.category,
                        report_obj.subcategory,
                        report_obj.patterns,
                        report_obj.htmlindex,
                    )
                    item.flags["report"] = r
            if is_flagged(item, "subworkflow"):
                if output:
                    raise SyntaxError("Only input files may refer to a subworkflow")
                else:
                    # record the workflow this item comes from
                    sub = item.flags["subworkflow"]
                    if _item in self.subworkflow_input:
                        other = self.subworkflow_input[_item]
                        if sub != other:
                            raise WorkflowError(
                                "The input file {} is ambiguously "
                                "associated with two subworkflows "
                                "{} and {}.".format(item, sub, other),
                                rule=self,
                            )
                    self.subworkflow_input[_item] = sub
            inoutput.append(_item)
            if name:
                inoutput._add_name(name)
        elif callable(item):
            if output:
                raise SyntaxError("Only input files can be specified as functions")
            inoutput.append(item)
            if name:
                inoutput._add_name(name)
        else:
            try:
                start = len(inoutput)
                for i in item:
                    self._set_inoutput_item(i, output=output)
                if name:
                    # if the list was named, make it accessible
                    inoutput._set_name(name, start, end=len(inoutput))
            except TypeError:
                raise SyntaxError(
                    "Input and output files have to be specified as strings or lists of strings."
                )
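
The input/output flag checks above can be exercised directly with the flag helpers from snakemake.io; a short hedged sketch (assuming these helpers keep their usual names):

from snakemake.io import temp, ancient, is_flagged

bam = temp("results/sorted.bam")      # output-only flag
ref = ancient("data/reference.fa")    # input-only flag

assert is_flagged(bam, "temp")
assert is_flagged(ref, "ancient")
# Using temp() on an input or ancient() on an output is exactly what triggers
# the warnings emitted by _set_inoutput_item above.
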
コード例 #27
0
# library preparation kit specific configuration
libprep_fn = srcdir("libprep.config")
with open(libprep_fn) as fh:
    LIBPREP_CONF = yaml.load(fh, Loader=Loader) or {}
kit = config.get("libprepkit")
if kit is not None:
    if len(config["read_geometry"]) > 1:
        kit += " PE"
    else:
        kit += " SE"
if kit in LIBPREP_CONF:
    # overwrite default config
    update_config(CONF, LIBPREP_CONF[kit])
else:
    if kit is None:
        logger.warning("Running without libprepkit defined!")
    else:
        logger.warning("`{}` is not a valid libprepkit name".format(kit))
        sys.exit(1)

# update config (config.yaml). Does not update if key exists
update_config2(config, CONF)


# update read geometry with delta_readlen
if 'delta_readlen' in config and 'read_geometry' in config:
    read_geometry = config["read_geometry"]
    for i, val in enumerate(config['delta_readlen']):
        read_geometry[i] = int(read_geometry[i]) + int(val)
    config["read_geometry"] = read_geometry 
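
The kit-specific overrides above rely on snakemake.utils.update_config, which merges nested dictionaries recursively, with values from the second argument winning. A minimal illustration (the values are made up):

from snakemake.utils import update_config

CONF = {"adapter": {"r1": "AGATCGGAAGAGC"}, "min_len": 20}
update_config(CONF, {"adapter": {"r2": "CTGTCTCTTATACACATCT"}, "min_len": 31})
print(CONF)
# {'adapter': {'r1': 'AGATCGGAAGAGC', 'r2': 'CTGTCTCTTATACACATCT'}, 'min_len': 31}
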
コード例 #28
0
    def job_selector_ilp(self, jobs):
        """
        Job scheduling by optimization of resource usage by solving ILP using pulp
        """
        import pulp
        from pulp import lpSum

        logger.info("Select jobs to execute...")

        # assert self.resources["_cores"] > 0
        scheduled_jobs = {
            job: pulp.LpVariable(
                "job_{}".format(idx),
                lowBound=0,
                upBound=1,
                cat=pulp.LpInteger,
            )
            for idx, job in enumerate(jobs)
        }

        size_gb = lambda f: f.size / 1e9

        temp_files = {
            temp_file for job in jobs for temp_file in self.dag.temp_input(job)
        }

        temp_job_improvement = {
            temp_file: pulp.LpVariable(
                "temp_file_{}".format(idx), lowBound=0, upBound=1, cat="Continuous"
            )
            for idx, temp_file in enumerate(temp_files)
        }

        temp_file_deletable = {
            temp_file: pulp.LpVariable(
                "deletable_{}".format(idx),
                lowBound=0,
                upBound=1,
                cat=pulp.LpInteger,
            )
            for idx, temp_file in enumerate(temp_files)
        }
        prob = pulp.LpProblem("JobScheduler", pulp.LpMaximize)

        total_temp_size = max(sum([size_gb(temp_file) for temp_file in temp_files]), 1)
        total_core_requirement = sum(
            [max(job.resources.get("_cores", 1), 1) for job in jobs]
        )
        # Objective function
        # Job priority > Core load
        # Core load > temp file removal
        # Instant removal > temp size
        prob += (
            2
            * total_core_requirement
            * 2
            * total_temp_size
            * lpSum([job.priority * scheduled_jobs[job] for job in jobs])
            + 2
            * total_temp_size
            * lpSum(
                [
                    max(job.resources.get("_cores", 1), 1) * scheduled_jobs[job]
                    for job in jobs
                ]
            )
            + total_temp_size
            * lpSum(
                [
                    temp_file_deletable[temp_file] * size_gb(temp_file)
                    for temp_file in temp_files
                ]
            )
            + lpSum(
                [
                    temp_job_improvement[temp_file] * size_gb(temp_file)
                    for temp_file in temp_files
                ]
            )
        )

        # Constraints:
        for name in self.workflow.global_resources:
            prob += (
                lpSum(
                    [scheduled_jobs[job] * job.resources.get(name, 0) for job in jobs]
                )
                <= self.resources[name]
            )

        # Choose jobs that lead to "fastest" (minimum steps) removal of existing temp file
        remaining_jobs = self.remaining_jobs
        for temp_file in temp_files:
            prob += temp_job_improvement[temp_file] <= lpSum(
                [
                    scheduled_jobs[job] * self.required_by_job(temp_file, job)
                    for job in jobs
                ]
            ) / lpSum([self.required_by_job(temp_file, job) for job in remaining_jobs])

            prob += temp_file_deletable[temp_file] <= temp_job_improvement[temp_file]

        solver = (
            pulp.get_solver(self.scheduler_ilp_solver)
            if self.scheduler_ilp_solver
            else pulp.apis.LpSolverDefault
        )
        solver.msg = self.workflow.verbose
        # disable extensive logging
        try:
            prob.solve(solver)
        except pulp.apis.core.PulpSolverError as e:
            logger.warning(
                "Failed to solve scheduling problem with ILP solver. Falling back to greedy solver. "
                "Run Snakemake with --verbose to see the full solver output for debugging the problem."
            )
            return self.job_selector_greedy(jobs)

        selected_jobs = set(
            job for job, variable in scheduled_jobs.items() if variable.value() == 1.0
        )

        if not selected_jobs:
            logger.warning(
                "Failed to solve scheduling problem with ILP solver. Falling back to greedy solver. "
                "Run Snakemake with --verbose to see the full solver output for debugging the problem."
            )
            return self.job_selector_greedy(jobs)

        for name in self.workflow.global_resources:
            self.resources[name] -= sum(
                [job.resources.get(name, 0) for job in selected_jobs]
            )
        return selected_jobs
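
The ILP above weighs priorities, core usage, and temp-file removal against each other. A toy version with pulp shows the core idea: pick jobs that maximize total priority without exceeding the available cores (job names and numbers are made up):

import pulp

jobs = {"map_a": (4, 1), "map_b": (4, 1), "plot": (1, 5)}   # name: (cores, priority)
available_cores = 5

x = {name: pulp.LpVariable(name, cat=pulp.LpBinary) for name in jobs}
prob = pulp.LpProblem("ToyScheduler", pulp.LpMaximize)
# objective: total priority of the selected jobs
prob += pulp.lpSum(prio * x[name] for name, (cores, prio) in jobs.items())
# constraint: the selected jobs must fit into the available cores
prob += pulp.lpSum(cores * x[name] for name, (cores, prio) in jobs.items()) <= available_cores

prob.solve(pulp.PULP_CBC_CMD(msg=False))
print([name for name, var in x.items() if var.value() == 1.0])  # e.g. ['map_a', 'plot']
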
コード例 #29
0
 def check_localrules(self):
     undefined = self._localrules - set(rule.name for rule in self.rules)
     if undefined:
         logger.warning("localrules directive specifies rules that are not "
                        "present in the Snakefile:\n{}\n".format(
                            "\n".join(map("\t{}".format, undefined))))
コード例 #30
0
    def prepare(self):
        """
        Prepare execution of job.
        This includes creation of directories and deletion of previously
        created dynamic files.
        Creates a shadow directory for the job if specified.
        """

        self.check_protected_output()

        unexpected_output = self.dag.reason(self).missing_output.intersection(
            self.existing_output)
        if unexpected_output:
            logger.warning(
                "Warning: the following output files of rule {} were not "
                "present when the DAG was created:\n{}".format(
                    self.rule, unexpected_output))

        self.remove_existing_output()

        for f, f_ in zip(self.output, self.rule.output):
            f.prepare()

        self.download_remote_input()

        for f in self.log:
            f.prepare()
        if self.benchmark:
            self.benchmark.prepare()

        if not self.is_shadow:
            return

        # Create shadow directory structure
        self.shadow_dir = tempfile.mkdtemp(
            dir=self.rule.workflow.persistence.shadow_path)
        cwd = os.getcwd()

        if self.rule.shadow_depth == "minimal":
            # Re-create the directory structure in the shadow directory
            for (f,
                 d) in set([(item, os.path.dirname(item))
                            for sublist in [self.input, self.output, self.log]
                            if sublist is not None for item in sublist]):
                if d and not os.path.isabs(d):
                    rel_path = os.path.relpath(d)
                    # Only create subdirectories
                    if not rel_path.split(os.path.sep)[0] == "..":
                        os.makedirs(os.path.join(self.shadow_dir, rel_path),
                                    exist_ok=True)
                    else:
                        raise RuleException(
                            "The following file name references a parent directory relative to your workdir.\n"
                            "This isn't supported for shadow: \"minimal\". Consider using an absolute path instead.\n{}"
                            .format(f),
                            rule=self.rule)

            # Symlink the input files
            for rel_path in set([
                    os.path.relpath(f) for f in self.input
                    if not os.path.isabs(f)
            ]):
                link = os.path.join(self.shadow_dir, rel_path)
                original = os.path.relpath(rel_path, os.path.dirname(link))
                os.symlink(original, link)

        # Shallow: simply symlink everything in the working directory.
        elif self.rule.shadow_depth == "shallow":
            for source in os.listdir(cwd):
                link = os.path.join(self.shadow_dir, source)
                os.symlink(os.path.abspath(source), link)
        elif self.rule.shadow_depth == "full":
            snakemake_dir = os.path.join(cwd, ".snakemake")
            for dirpath, dirnames, filenames in os.walk(cwd):
                # Must exclude .snakemake and its children to avoid infinite
                # loop of symlinks.
                if os.path.commonprefix([snakemake_dir,
                                         dirpath]) == snakemake_dir:
                    continue
                for dirname in dirnames:
                    if dirname == ".snakemake":
                        continue
                    relative_source = os.path.relpath(
                        os.path.join(dirpath, dirname))
                    shadow = os.path.join(self.shadow_dir, relative_source)
                    os.mkdir(shadow)

                for filename in filenames:
                    source = os.path.join(dirpath, filename)
                    relative_source = os.path.relpath(source)
                    link = os.path.join(self.shadow_dir, relative_source)
                    os.symlink(source, link)
コード例 #31
0
def script(
    path,
    basedir,
    input,
    output,
    params,
    wildcards,
    threads,
    resources,
    log,
    config,
    rulename,
    conda_env,
    singularity_img,
    singularity_args,
    bench_record,
    jobid,
    bench_iteration,
    shadow_dir,
):
    """
    Load a script from the given basedir + path and execute it.
    Supports Python 3 and R.
    """

    f = None
    try:
        path, source, language = get_source(path, basedir)
        if language == "python":
            wrapper_path = path[7:] if path.startswith("file://") else path
            snakemake = Snakemake(
                input,
                output,
                params,
                wildcards,
                threads,
                resources,
                log,
                config,
                rulename,
                bench_iteration,
                os.path.dirname(wrapper_path),
            )
            snakemake = pickle.dumps(snakemake)
            # Obtain search path for current snakemake module.
            # The module is needed for unpickling in the script.
            # We append it at the end (as a fallback).
            searchpath = SNAKEMAKE_SEARCHPATH
            if singularity_img is not None:
                searchpath = singularity.SNAKEMAKE_MOUNTPOINT
            searchpath = '"{}"'.format(searchpath)
            # For local scripts, add their location to the path in case they use path-based imports
            if path.startswith("file://"):
                searchpath += ', "{}"'.format(os.path.dirname(path[7:]))
            preamble = textwrap.dedent("""
            ######## Snakemake header ########
            import sys; sys.path.extend([{searchpath}]); import pickle; snakemake = pickle.loads({snakemake}); from snakemake.logging import logger; logger.printshellcmds = {printshellcmds}; __real_file__ = __file__; __file__ = {file_override};
            ######## Original script #########
            """).format(
                searchpath=escape_backslash(searchpath),
                snakemake=snakemake,
                printshellcmds=logger.printshellcmds,
                file_override=repr(os.path.realpath(wrapper_path)),
            )
        elif language == "r" or language == "rmarkdown":
            preamble = textwrap.dedent("""
            ######## Snakemake header ########
            library(methods)
            Snakemake <- setClass(
                "Snakemake",
                slots = c(
                    input = "list",
                    output = "list",
                    params = "list",
                    wildcards = "list",
                    threads = "numeric",
                    log = "list",
                    resources = "list",
                    config = "list",
                    rule = "character",
                    bench_iteration = "numeric",
                    scriptdir = "character",
                    source = "function"
                )
            )
            snakemake <- Snakemake(
                input = {},
                output = {},
                params = {},
                wildcards = {},
                threads = {},
                log = {},
                resources = {},
                config = {},
                rule = {},
                bench_iteration = {},
                scriptdir = {},
                source = function(...){{
                    wd <- getwd()
                    setwd(snakemake@scriptdir)
                    source(...)
                    setwd(wd)
                }}
            )

            ######## Original script #########
            """).format(
                REncoder.encode_namedlist(input),
                REncoder.encode_namedlist(output),
                REncoder.encode_namedlist(params),
                REncoder.encode_namedlist(wildcards),
                threads,
                REncoder.encode_namedlist(log),
                REncoder.encode_namedlist({
                    name: value
                    for name, value in resources.items()
                    if name != "_cores" and name != "_nodes"
                }),
                REncoder.encode_dict(config),
                REncoder.encode_value(rulename),
                REncoder.encode_numeric(bench_iteration),
                REncoder.encode_value(
                    os.path.dirname(path[7:]) if path.
                    startswith("file://") else os.path.dirname(path)),
            )
        elif language == "julia":
            preamble = textwrap.dedent("""
                    ######## Snakemake header ########
                    struct Snakemake
                        input::Dict
                        output::Dict
                        params::Dict
                        wildcards::Dict
                        threads::Int64
                        log::Dict
                        resources::Dict
                        config::Dict
                        rule::String
                        bench_iteration
                        scriptdir::String
                        #source::Any
                    end
                    snakemake = Snakemake(
                        {}, #input::Dict
                        {}, #output::Dict
                        {}, #params::Dict
                        {}, #wildcards::Dict
                        {}, #threads::Int64
                        {}, #log::Dict
                        {}, #resources::Dict
                        {}, #config::Dict
                        {}, #rule::String
                        {}, #bench_iteration::Int64
                        {}, #scriptdir::String
                        #, #source::Any
                    )
                    ######## Original script #########
                    """.format(
                JuliaEncoder.encode_namedlist(input),
                JuliaEncoder.encode_namedlist(output),
                JuliaEncoder.encode_namedlist(params),
                JuliaEncoder.encode_namedlist(wildcards),
                JuliaEncoder.encode_value(threads),
                JuliaEncoder.encode_namedlist(log),
                JuliaEncoder.encode_namedlist({
                    name: value
                    for name, value in resources.items()
                    if name != "_cores" and name != "_nodes"
                }),
                JuliaEncoder.encode_dict(config),
                JuliaEncoder.encode_value(rulename),
                JuliaEncoder.encode_value(bench_iteration),
                JuliaEncoder.encode_value(
                    os.path.dirname(path[7:]) if path.
                    startswith("file://") else os.path.dirname(path)),
            ).replace("'", '"'))
        else:
            raise ValueError(
                "Unsupported script: Expecting either Python (.py), R (.R), RMarkdown (.Rmd) or Julia (.jl) script."
            )

        dir = ".snakemake/scripts"
        os.makedirs(dir, exist_ok=True)

        with tempfile.NamedTemporaryFile(suffix="." + os.path.basename(path),
                                         dir=dir,
                                         delete=False) as f:
            if not language == "rmarkdown":
                f.write(preamble.encode())
                f.write(source)
            else:
                # Insert Snakemake object after the RMarkdown header
                code = source.decode()
                pos = next(islice(re.finditer(r"---\n", code), 1,
                                  2)).start() + 3
                f.write(str.encode(code[:pos]))
                preamble = textwrap.dedent("""
                    ```{r, echo=FALSE, message=FALSE, warning=FALSE}
                    %s
                    ```
                    """ % preamble)
                f.write(preamble.encode())
                f.write(str.encode(code[pos:]))

        if language == "python":
            py_exec = sys.executable
            if conda_env is not None:
                py = os.path.join(conda_env, "bin", "python")
                if os.path.exists(py):
                    out = subprocess.check_output(
                        [py, "--version"],
                        stderr=subprocess.STDOUT,
                        universal_newlines=True,
                    )
                    ver = tuple(
                        map(int,
                            PY_VER_RE.match(out).group("ver_min").split(".")))
                    if ver >= MIN_PY_VERSION:
                        # Python version is new enough, make use of environment
                        # to execute script
                        py_exec = "python"
                    else:
                        logger.warning(
                            "Conda environment defines Python "
                            "version < {0}.{1}. Using Python of the "
                            "master process to execute "
                            "script. Note that this cannot be avoided, "
                            "because the script uses data structures from "
                            "Snakemake which are Python >={0}.{1} "
                            "only.".format(*MIN_PY_VERSION))
            if singularity_img is not None:
                # use python from image
                py_exec = "python"
            # use the same Python as the running process or the one from the environment
            shell("{py_exec} {f.name:q}", bench_record=bench_record)
        elif language == "r":
            if conda_env is not None and "R_LIBS" in os.environ:
                logger.warning("R script job uses conda environment but "
                               "R_LIBS environment variable is set. This "
                               "is likely not intended, as R_LIBS can "
                               "interfere with R packages deployed via "
                               "conda. Consider running `unset R_LIBS` or "
                               "remove it entirely before executing "
                               "Snakemake.")
            shell("Rscript --vanilla {f.name:q}", bench_record=bench_record)
        elif language == "rmarkdown":
            if len(output) != 1:
                raise WorkflowError(
                    "RMarkdown scripts (.Rmd) may only have a single output file."
                )
            out = os.path.abspath(output[0])
            shell(
                'Rscript --vanilla -e \'rmarkdown::render("{f.name}", output_file="{out}", quiet=TRUE, knit_root_dir = "{workdir}", params = list(rmd="{f.name}"))\'',
                bench_record=bench_record,
                workdir=os.getcwd(),
            )
        elif language == "julia":
            shell("julia {f.name:q}", bench_record=bench_record)

    except URLError as e:
        raise WorkflowError(e)
    finally:
        if f:
            os.remove(f.name)
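
The RMarkdown branch above splices the generated preamble in right after the YAML front matter: it locates the second `---` delimiter and inserts an R chunk at that position. A minimal sketch of that splicing step, assuming a document with a standard YAML header (the document content and chunk body here are illustrative):

import re
import textwrap
from itertools import islice

code = textwrap.dedent("""\
    ---
    title: "Example report"
    output: html_document
    ---

    # Results
    """)

# Position just after the second '---' line, i.e. the end of the YAML header.
pos = next(islice(re.finditer(r"---\n", code), 1, 2)).start() + 3

preamble = textwrap.dedent("""
    ```{r, echo=FALSE, message=FALSE, warning=FALSE}
    # the snakemake object would be deserialized here
    ```
    """)

patched = code[:pos] + preamble + code[pos:]
print(patched)
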
Code example #32
File: utils.py Project: ginger0106/snakemake
def report(text,
           path,
           stylesheet=os.path.join(os.path.dirname(__file__), "report.css"),
           defaultenc="utf8",
           template=None,
           metadata=None,
           **files):
    """
    Create an HTML report using python docutils.
    Attention: This function needs Python docutils to be installed for the
    python installation you use with Snakemake.

    Arguments
    text -- The "restructured text" as it is expected by python docutils.
    path -- The path to the desired output file
    stylesheet -- An optional path to a css file that defines the style of the
        document. This defaults to <your snakemake install>/report.css.
        Use the default to get a hint how to create your own.
    defaultenc -- The encoding that is reported to the browser for embedded
        text files, defaults to utf8.
    template -- An optional path to a docutils HTML template.
    metadata -- E.g. an optional author name or email address.

    All other keyword args are interpreted as paths to files that shall be
    embedded into the document. The keywords will be available as link
    targets in the text. E.g. append a file as keyword arg via F1=input[0]
    and put a download link in the text like this:

    report('''
    ==============
    Report for ...
    ==============

    Some text. A link to an embedded file: F1_.

    Further text.
    ''', outputpath, F1=input[0])

    Instead of specifying each file as a keyword arg, you can also expand
    the input of your rule if it is completely named, e.g.:

    report('''
    Some text...
    ''', outputpath, **input)

    """
    outmime, _ = mimetypes.guess_type(path)
    if outmime != "text/html":
        raise ValueError("Path to report output has to be an HTML file.")
    from docutils.core import publish_file
    definitions = textwrap.dedent("""
    .. role:: raw-html(raw)
       :format: html

    """)

    metadata = textwrap.dedent("""

    .. container::
       :name: metadata

       {metadata} {date}

    """).format(metadata=metadata, date=datetime.date.today().isoformat())

    text = format(textwrap.dedent(text), stepout=2)

    attachments = [
        textwrap.dedent("""
        .. container::
           :name: attachments
           
        """)
    ]
    for name, file in sorted(files.items()):
        mime, encoding = mimetypes.guess_type(file)
        if mime is None:
            mime = "text/plain"
            logger.warning("Could not detect mimetype for {}, assuming "
                           "text/plain.".format(file))
        if encoding is None:
            encoding = defaultenc
        with open(file, "rb") as f:
            data = base64.b64encode(f.read())
        attachments.append('''
   .. container::
      :name: {name}

      [{name}] :raw-html:`<a href="data:{mime};charset={charset};filename={filename};base64,{data}" download="{filename}" draggable="true">{filename}</a>`
            '''.format(name=name,
                       filename=os.path.basename(file),
                       mime=mime,
                       charset=encoding,
                       data=data.decode()))

    text = definitions + text + "\n\n" + "\n\n".join(attachments) + metadata

    overrides = dict()
    if template is not None:
        overrides["template"] = template
    if stylesheet is not None:
        overrides["stylesheet_path"] = stylesheet
    with open(path, "w") as html:
        publish_file(source=io.StringIO(text),
                     destination=html,
                     writer_name="html",
                     settings_overrides=overrides)
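
report() makes the resulting HTML self-contained by embedding every attachment as a base64 data URI rather than linking to the file on disk. A minimal sketch of that embedding step for a single attachment, assuming the file exists (the path is illustrative):

import base64
import mimetypes
import os

file = "results/summary.txt"  # hypothetical attachment path
mime, encoding = mimetypes.guess_type(file)
if mime is None:
    mime = "text/plain"
if encoding is None:
    encoding = "utf8"

# Read the file and encode it so it can live inside the HTML document itself.
with open(file, "rb") as fh:
    data = base64.b64encode(fh.read()).decode()

link = ('<a href="data:{mime};charset={charset};filename={filename};base64,{data}" '
        'download="{filename}">{filename}</a>').format(
            mime=mime, charset=encoding,
            filename=os.path.basename(file), data=data)
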
Code example #33
        def decorate(ruleinfo):
            if ruleinfo.wildcard_constraints:
                rule.set_wildcard_constraints(
                    *ruleinfo.wildcard_constraints[0],
                    **ruleinfo.wildcard_constraints[1])
            if ruleinfo.input:
                rule.set_input(*ruleinfo.input[0], **ruleinfo.input[1])
            if ruleinfo.output:
                rule.set_output(*ruleinfo.output[0], **ruleinfo.output[1])
            if ruleinfo.params:
                rule.set_params(*ruleinfo.params[0], **ruleinfo.params[1])
            if ruleinfo.threads:
                if not isinstance(ruleinfo.threads, int) and not callable(
                        ruleinfo.threads):
                    raise RuleException(
                        "Threads value has to be an integer or a callable.",
                        rule=rule)
                rule.resources["_cores"] = ruleinfo.threads
            if ruleinfo.shadow_depth:
                if ruleinfo.shadow_depth not in (True, "shallow", "full",
                                                 "minimal"):
                    raise RuleException(
                        "Shadow must either be 'minimal', 'shallow', 'full', "
                        "or True (equivalent to 'full')",
                        rule=rule)
                if ruleinfo.shadow_depth is True:
                    rule.shadow_depth = 'full'
                    logger.warning(
                        "Shadow is set to True in rule {} (equivalent to 'full'). It's encouraged to use the more explicit options 'minimal|shallow|full' instead."
                        .format(rule))
                else:
                    rule.shadow_depth = ruleinfo.shadow_depth
            if ruleinfo.resources:
                args, resources = ruleinfo.resources
                if args:
                    raise RuleException("Resources have to be named.")
                if not all(
                        map(lambda r: isinstance(r, int) or callable(r),
                            resources.values())):
                    raise RuleException(
                        "Resources values have to be integers or callables",
                        rule=rule)
                rule.resources.update(resources)
            if ruleinfo.priority:
                if (not isinstance(ruleinfo.priority, int)
                        and not isinstance(ruleinfo.priority, float)):
                    raise RuleException("Priority values have to be numeric.",
                                        rule=rule)
                rule.priority = ruleinfo.priority
            if ruleinfo.version:
                rule.version = ruleinfo.version
            if ruleinfo.log:
                rule.set_log(*ruleinfo.log[0], **ruleinfo.log[1])
            if ruleinfo.message:
                rule.message = ruleinfo.message
            if ruleinfo.benchmark:
                rule.benchmark = ruleinfo.benchmark
            if not self.run_local and ruleinfo.group is not None:
                rule.group = ruleinfo.group
            if ruleinfo.wrapper:
                if self.use_conda:
                    rule.conda_env = snakemake.wrapper.get_conda_env(
                        ruleinfo.wrapper, prefix=self.wrapper_prefix)
                # TODO retrieve suitable singularity image

            if ruleinfo.conda_env and self.use_conda:
                if not (ruleinfo.script or ruleinfo.wrapper
                        or ruleinfo.shellcmd):
                    raise RuleException(
                        "Conda environments are only allowed "
                        "with shell, script, or wrapper directives "
                        "(not with run).",
                        rule=rule)
                if not (urllib.parse.urlparse(ruleinfo.conda_env).scheme
                        or os.path.isabs(ruleinfo.conda_env)):
                    ruleinfo.conda_env = os.path.join(self.current_basedir,
                                                      ruleinfo.conda_env)
                rule.conda_env = ruleinfo.conda_env

            if self.use_singularity:
                invalid_rule = not (ruleinfo.script or ruleinfo.wrapper
                                    or ruleinfo.shellcmd)
                if ruleinfo.singularity_img:
                    if invalid_rule:
                        raise RuleException(
                            "Singularity directive is only allowed "
                            "with shell, script or wrapper directives "
                            "(not with run).",
                            rule=rule)
                    rule.singularity_img = ruleinfo.singularity_img
                elif self.global_singularity_img:
                    if not invalid_rule:
                        # skip rules with run directive
                        rule.singularity_img = self.global_singularity_img

            rule.norun = ruleinfo.norun
            rule.docstring = ruleinfo.docstring
            rule.run_func = ruleinfo.func
            rule.shellcmd = ruleinfo.shellcmd
            rule.script = ruleinfo.script
            rule.wrapper = ruleinfo.wrapper
            rule.cwl = ruleinfo.cwl
            rule.restart_times = self.restart_times

            ruleinfo.func.__name__ = "__{}".format(rule.name)
            self.globals[ruleinfo.func.__name__] = ruleinfo.func
            setattr(rules, rule.name, RuleProxy(rule))
            if checkpoint:
                checkpoints.register(rule)
            return ruleinfo.func
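
decorate() validates directive values before attaching them to the rule, e.g. threads must be an integer or a callable and resources must be named integers or callables. A standalone sketch of those checks, using a plain function in place of the ruleinfo object (the names are illustrative, not part of the original API):

def validate_ruleinfo(threads, resources):
    """Mimic the checks decorate() applies to the threads and resources directives."""
    if threads is not None:
        if not isinstance(threads, int) and not callable(threads):
            raise ValueError("Threads value has to be an integer or a callable.")
    if resources:
        if not all(isinstance(r, int) or callable(r) for r in resources.values()):
            raise ValueError("Resources values have to be integers or callables")


validate_ruleinfo(threads=4, resources={"mem_mb": 8000})   # ok
validate_ruleinfo(threads=lambda wc: 8, resources={})      # ok, callable is allowed
# validate_ruleinfo(threads="4", resources={})             # would raise ValueError
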
Code example #34
File: jobs.py Project: tianyabeef/gutMicrobiome
    def prepare(self):
        """
        Prepare execution of job.
        This includes creation of directories and deletion of previously
        created dynamic files.
        Creates a shadow directory for the job if specified.
        """

        self.check_protected_output()

        unexpected_output = self.dag.reason(self).missing_output.intersection(
            self.existing_output)
        if unexpected_output:
            logger.warning(
                "Warning: the following output files of rule {} were not "
                "present when the DAG was created:\n{}".format(
                    self.rule, unexpected_output))

        for f, f_ in zip(self.output, self.rule.output):
            f.prepare()

        for f in self.files_to_download:
            f.download_from_remote()

        for f in self.log:
            f.prepare()
        if self.benchmark:
            self.benchmark.prepare()

        self.remove_existing_output()

        if not self.is_shadow:
            return
        # Create shadow directory structure
        self.shadow_dir = tempfile.mkdtemp(
            dir=self.rule.workflow.persistence.shadow_path)
        cwd = os.getcwd()
        # Shallow: simply symlink everything in the working directory.
        if self.rule.shadow_depth == "shallow":
            for source in os.listdir(cwd):
                link = os.path.join(self.shadow_dir, source)
                os.symlink(os.path.abspath(source), link)
        elif self.rule.shadow_depth == "full":
            snakemake_dir = os.path.join(cwd, ".snakemake")
            for dirpath, dirnames, filenames in os.walk(cwd):
                # Must exclude .snakemake and its children to avoid infinite
                # loop of symlinks.
                if os.path.commonprefix([snakemake_dir, dirpath
                                         ]) == snakemake_dir:
                    continue
                for dirname in dirnames:
                    if dirname == ".snakemake":
                        continue
                    relative_source = os.path.relpath(os.path.join(dirpath,
                                                                   dirname))
                    shadow = os.path.join(self.shadow_dir, relative_source)
                    os.mkdir(shadow)

                for filename in filenames:
                    source = os.path.join(dirpath, filename)
                    relative_source = os.path.relpath(source)
                    link = os.path.join(self.shadow_dir, relative_source)
                    os.symlink(source, link)
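
For shadow_depth == "shallow", the shadow directory mirrors only the top level of the working directory via symlinks, so the job sees the same files but writes into a disposable location. A minimal sketch of that idea in isolation (paths are illustrative and the sketch assumes a POSIX filesystem):

import os
import tempfile

cwd = os.getcwd()
shadow_dir = tempfile.mkdtemp(prefix="shadow_")

# Symlink every top-level entry of the working directory into the shadow dir.
for source in os.listdir(cwd):
    link = os.path.join(shadow_dir, source)
    os.symlink(os.path.abspath(source), link)

print("The job would now run with its working directory set to", shadow_dir)
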
Code example #35
File: scheduler.py Project: epruesse/snakemake
    def job_selector_ilp(self, jobs):
        """
        Job scheduling by optimizing resource usage, formulated as an ILP and solved with pulp.
        """
        import pulp
        from pulp import lpSum
        from stopit import ThreadingTimeout as Timeout, TimeoutException

        if len(jobs) == 1:
            logger.debug(
                "Using greedy selector because only single job has to be scheduled."
            )
            return self.job_selector_greedy(jobs)

        with self._lock:
            if not self.resources["_cores"]:
                return set()

            # assert self.resources["_cores"] > 0
            scheduled_jobs = {
                job: pulp.LpVariable(
                    "job_{}".format(idx),
                    lowBound=0,
                    upBound=1,
                    cat=pulp.LpInteger,
                )
                for idx, job in enumerate(jobs)
            }

            def size_gb(f):
                if self.touch:
                    # In case of touch mode, there is no need to prioritize based on size.
                    # We cannot access it anyway, because the files might be temporary and
                    # not present.
                    return 0
                else:
                    return f.size / 1e9

            temp_files = {
                temp_file
                for job in jobs for temp_file in self.dag.temp_input(job)
            }

            temp_job_improvement = {
                temp_file: pulp.LpVariable("temp_file_{}".format(idx),
                                           lowBound=0,
                                           upBound=1,
                                           cat="Continuous")
                for idx, temp_file in enumerate(temp_files)
            }

            temp_file_deletable = {
                temp_file: pulp.LpVariable(
                    "deletable_{}".format(idx),
                    lowBound=0,
                    upBound=1,
                    cat=pulp.LpInteger,
                )
                for idx, temp_file in enumerate(temp_files)
            }
            prob = pulp.LpProblem("JobScheduler", pulp.LpMaximize)

            total_temp_size = max(
                sum([size_gb(temp_file) for temp_file in temp_files]), 1)
            total_core_requirement = sum(
                [max(job.resources.get("_cores", 1), 1) for job in jobs])
            # Objective function
            # Job priority > Core load
            # Core load > temp file removal
            # Instant removal > temp size
            prob += (2 * total_core_requirement * 2 * total_temp_size * lpSum([
                job.priority * scheduled_jobs[job] for job in jobs
            ]) + 2 * total_temp_size * lpSum([
                max(job.resources.get("_cores", 1), 1) * scheduled_jobs[job]
                for job in jobs
            ]) + total_temp_size * lpSum([
                temp_file_deletable[temp_file] * size_gb(temp_file)
                for temp_file in temp_files
            ]) + lpSum([
                temp_job_improvement[temp_file] * size_gb(temp_file)
                for temp_file in temp_files
            ]))

            # Constraints:
            for name in self.workflow.global_resources:
                prob += (lpSum([
                    scheduled_jobs[job] * job.resources.get(name, 0)
                    for job in jobs
                ]) <= self.resources[name])

            # Choose jobs that lead to "fastest" (minimum steps) removal of existing temp file
            remaining_jobs = self.remaining_jobs
            for temp_file in temp_files:
                prob += temp_job_improvement[temp_file] <= lpSum([
                    scheduled_jobs[job] * self.required_by_job(temp_file, job)
                    for job in jobs
                ]) / lpSum([
                    self.required_by_job(temp_file, job)
                    for job in remaining_jobs
                ])

                prob += (temp_file_deletable[temp_file] <=
                         temp_job_improvement[temp_file])

        try:
            with Timeout(10, swallow_exc=False):
                self._solve_ilp(prob)
        except TimeoutException as e:
            logger.warning(
                "Failed to solve scheduling problem with ILP solver in time (10s). "
                "Falling back to greedy solver.")
            return self.job_selector_greedy(jobs)
        except pulp.apis.core.PulpSolverError as e:
            logger.warning(
                "Failed to solve scheduling problem with ILP solver. Falling back to greedy solver. "
                "Run Snakemake with --verbose to see the full solver output for debugging the problem."
            )
            return self.job_selector_greedy(jobs)

        selected_jobs = set(job for job, variable in scheduled_jobs.items()
                            if variable.value() == 1.0)

        if not selected_jobs:
            # No selected jobs. This could be due to insufficient resources or a failure in the ILP solver
            # Hence, we silently fall back to the greedy solver to make sure that we don't miss anything.
            return self.job_selector_greedy(jobs)

        for name in self.workflow.global_resources:
            self.resources[name] -= sum(
                [job.resources.get(name, 0) for job in selected_jobs])
        return selected_jobs
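
The objective above encodes a strict priority order (job priority over core load, core load over temp-file removal, removal over temp size) by scaling each term so that it dominates the terms below it. A toy pulp sketch of the same weighting pattern with two binary job variables and a single core constraint, assuming pulp is installed (the numbers are illustrative):

import pulp
from pulp import lpSum

# Two candidate jobs competing for 4 available cores.
jobs = {"a": {"cores": 4, "priority": 1}, "b": {"cores": 2, "priority": 10}}
x = {name: pulp.LpVariable("job_{}".format(name), cat=pulp.LpBinary)
     for name in jobs}

prob = pulp.LpProblem("ToyScheduler", pulp.LpMaximize)

total_cores = sum(j["cores"] for j in jobs.values())
# The priority term is scaled by 2 * total_cores so it always outweighs the core-load term,
# mirroring the layered weighting used in job_selector_ilp above.
prob += (2 * total_cores * lpSum(jobs[n]["priority"] * x[n] for n in jobs)
         + lpSum(jobs[n]["cores"] * x[n] for n in jobs))

# Resource constraint: at most 4 cores may be used in total.
prob += lpSum(jobs[n]["cores"] * x[n] for n in jobs) <= 4

prob.solve(pulp.PULP_CBC_CMD(msg=False))
selected = [n for n in jobs if x[n].value() == 1.0]
print(selected)  # job "b" wins on priority despite using fewer cores
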
Code example #36
File: dag.py Project: Kirill84/snakemake
 def handle_protected(self, job):
     """ Write-protect output files that are marked with protected(). """
     for f in job.expanded_output:
         if f in job.protected_output:
             logger.warning("Write-protecting output file {}".format(f))
             f.protect()
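
handle_protected() delegates the actual write protection to IOFile.protect(), whose implementation is not shown here. A plausible sketch of what such protection can look like, assuming it amounts to clearing the write bits with os.chmod (this is an assumption, not the original implementation):

import os
import stat

def write_protect(path):
    """Remove write permission for user, group and other (assumed analogue of f.protect())."""
    mode = os.stat(path).st_mode
    os.chmod(path, mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH))
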
Code example #37
File: utils.py Project: Kirill84/snakemake
def report(
    text, path,
    stylesheet=os.path.join(os.path.dirname(__file__), "report.css"),
    defaultenc="utf8", template=None, metadata=None, **files):
    """
    Create an HTML report using python docutils.
    Attention: This function needs Python docutils to be installed for the
    python installation you use with Snakemake.

    Arguments
    text -- The "restructured text" as it is expected by python docutils.
    path -- The path to the desired output file
    stylesheet -- An optional path to a css file that defines the style of the
        document. This defaults to <your snakemake install>/report.css.
        Use the default to get a hint how to create your own.
    defaultenc -- The encoding that is reported to the browser for embedded
        text files, defaults to utf8.
    template -- An optional path to a docutils HTML template.
    metadata -- E.g. an optional author name or email address.

    All other keyword args are interpreted as paths to files that shall be
    embedded into the document. The keywords will be available as link
    targets in the text. E.g. append a file as keyword arg via F1=input[0]
    and put a download link in the text like this:

    report('''
    ==============
    Report for ...
    ==============

    Some text. A link to an embedded file: F1_.

    Further text.
    ''', outputpath, F1=input[0])

    Instead of specifying each file as a keyword arg, you can also expand
    the input of your rule if it is completely named, e.g.:

    report('''
    Some text...
    ''', outputpath, **input)

    """
    outmime, _ = mimetypes.guess_type(path)
    if outmime != "text/html":
        raise ValueError("Path to report output has to be an HTML file.")
    from docutils.core import publish_file
    definitions = textwrap.dedent("""
    .. role:: raw-html(raw)
       :format: html

    """)

    metadata = textwrap.dedent("""

    .. container::
       :name: metadata

       {metadata} {date}

    """).format(metadata=metadata, date=datetime.date.today().isoformat())

    text = format(textwrap.dedent(text), stepout=2)

    attachments = [textwrap.dedent("""
        .. container::
           :name: attachments
           
        """)]
    for name, file in sorted(files.items()):
        mime, encoding = mimetypes.guess_type(file)
        if mime is None:
            mime = "text/plain"
            logger.warning("Could not detect mimetype for {}, assuming "
            "text/plain.".format(file))
        if encoding is None:
            encoding = defaultenc
        with open(file, "rb") as f:
            data = base64.b64encode(f.read())
        attachments.append(
            '''
   .. container::
      :name: {name}

      [{name}] :raw-html:`<a href="data:{mime};charset={charset};filename={filename};base64,{data}" download="{filename}" draggable="true">{filename}</a>`
            '''.format(
                name=name,
                filename=os.path.basename(file),
                mime=mime,
                charset=encoding,
                data=data.decode()))

    text = definitions + text + "\n\n" + "\n\n".join(attachments) + metadata

    overrides = dict()
    if template is not None:
        overrides["template"] = template
    if stylesheet is not None:
        overrides["stylesheet_path"] = stylesheet
    with open(path, "w") as html:
        publish_file(
            source=io.StringIO(text), destination=html,
            writer_name="html", settings_overrides=overrides)
Code example #38
File: __init__.py Project: Kirill84/snakemake
def snakemake(snakefile,
    listrules=False,
    cores=1,
    resources=None,
    workdir=None,
    targets=None,
    dryrun=False,
    touch=False,
    forcetargets=False,
    forceall=False,
    forcerun=None,
    prioritytargets=None,
    stats=None,
    printreason=False,
    printshellcmds=False,
    printdag=False,
    printrulegraph=False,
    nocolor=False,
    quiet=False,
    keepgoing=False,
    cluster=None,
    immediate_submit=False,
    standalone=False,
    ignore_ambiguity=False,
    snakemakepath=None,
    lock=True,
    unlock=False,
    cleanup_metadata=None,
    force_incomplete=False,
    ignore_incomplete=False,
    list_version_changes=False,
    list_code_changes=False,
    list_input_changes=False,
    list_params_changes=False,
    summary=False,
    output_wait=3,
    print_compilation=False,
    debug=False,
    notemp=False,
    nodeps=False,
    jobscript=None,
    timestamp=False):
    """
    Run snakemake on a given snakefile.
    Note: at the moment, this function is not thread-safe!

    Arguments
    snakefile     -- the snakefile.
    listrules     -- list rules instead of executing the workflow.
    cores         -- maximum number of parallel jobs (default: 1).
    workdir       -- working directory (default: current directory).
    targets       -- targets to build (default: first rule in the snakefile).
    dryrun        -- print the rules that would be executed,
        but do not execute them.
    forcetargets  -- force the given targets to be executed.
    forceall      -- force all rules to be executed.
    stats         -- if given, write runtime statistics to this file.
    lock          -- lock the working directory.
    """

    init_logger(nocolor=nocolor, stdout=dryrun, debug=debug, timestamp=timestamp)

    if not os.path.exists(snakefile):
        logger.error("Error: Snakefile \"{}\" not present.".format(snakefile))
        return False

    if workdir:
        olddir = os.getcwd()
    workflow = Workflow(
        snakefile=snakefile, snakemakepath=snakemakepath,
        jobscript=jobscript)

    if standalone:
        try:
            # set the process group
            os.setpgrp()
        except:
            # ignore: if it does not work we can still work without it
            pass

    success = True
    try:
        workflow.include(snakefile, workdir=workdir,
            overwrite_first_rule=True, print_compilation=print_compilation)
        workflow.check()

        if not print_compilation:
            if listrules:
                workflow.list_rules()
            else:
                if not printdag and not printrulegraph:
                    # handle subworkflows
                    subsnakemake = partial(
                        snakemake,
                        cores=cores,
                        resources=resources,
                        dryrun=dryrun,
                        touch=touch,
                        printreason=printreason,
                        printshellcmds=printshellcmds,
                        nocolor=nocolor,
                        quiet=quiet,
                        keepgoing=keepgoing,
                        cluster=cluster,
                        immediate_submit=immediate_submit,
                        standalone=standalone,
                        ignore_ambiguity=ignore_ambiguity,
                        snakemakepath=snakemakepath,
                        lock=lock,
                        unlock=unlock,
                        cleanup_metadata=cleanup_metadata,
                        force_incomplete=force_incomplete,
                        ignore_incomplete=ignore_incomplete,
                        output_wait=output_wait,
                        debug=debug,
                        notemp=notemp,
                        nodeps=nodeps,
                        jobscript=jobscript,
                        timestamp=timestamp)
                    for subworkflow in workflow.subworkflows:
                        logger.warning("Executing subworkflow {}.".format(subworkflow.name))
                        if not subsnakemake(subworkflow.snakefile, workdir=subworkflow.workdir, targets=subworkflow.targets):
                            success = False
                    if workflow.subworkflows:
                        logger.warning("Executing main workflow.")
                if success:
                    success = workflow.execute(
                        targets=targets, dryrun=dryrun, touch=touch,
                        cores=cores, forcetargets=forcetargets,
                        forceall=forceall, forcerun=forcerun,
                        prioritytargets=prioritytargets, quiet=quiet,
                        keepgoing=keepgoing, printshellcmds=printshellcmds,
                        printreason=printreason, printrulegraph=printrulegraph,
                        printdag=printdag, cluster=cluster,
                        immediate_submit=immediate_submit,
                        ignore_ambiguity=ignore_ambiguity,
                        workdir=workdir, stats=stats,
                        force_incomplete=force_incomplete,
                        ignore_incomplete=ignore_incomplete,
                        list_version_changes=list_version_changes,
                        list_code_changes=list_code_changes,
                        list_input_changes=list_input_changes,
                        list_params_changes=list_params_changes,
                        summary=summary,
                        output_wait=output_wait,
                        nolock=not lock,
                        unlock=unlock,
                        resources=resources,
                        notemp=notemp,
                        nodeps=nodeps,
                        cleanup_metadata=cleanup_metadata
                        )

    except BaseException as ex:
        print_exception(ex, workflow.linemaps)
        success = False
    if workdir:
        os.chdir(olddir)
    if workflow.persistence:
        workflow.persistence.unlock()
    return success
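
Used programmatically, this entry point mirrors the command line interface and returns True on success and False on failure. A minimal sketch of calling it from another script, assuming the signature shown above:

# Hypothetical driver script using the API shown above.
from snakemake import snakemake

ok = snakemake("Snakefile", cores=4, dryrun=True, printshellcmds=True)
if not ok:
    raise SystemExit(1)
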
Code example #39
File: utils.py Project: gcfntnu/gcfdb
# load function for statistical models
def load_model(model_yaml_file):
    with open(model_yaml_file) as fh:
        MODELS = yaml.load(fh, Loader=Loader) or {}
        config['models'] = MODELS
        config['model_names'] = list(MODELS.keys())


# library preparation kit specific configuration
libprep_fn = srcdir('libprep.config')
with open(libprep_fn) as fh:
    LIBPREP_CONF = yaml.load(fh, Loader=Loader) or {}
kit = config.get('libprepkit')
if kit in LIBPREP_CONF:
    LIBPREP = LIBPREP_CONF[kit]
    if 'reference_db' in LIBPREP:
        config['db']['reference_db'] = LIBPREP['reference_db']
else:
    if kit is None:
        logger.warning('Running without libprepkit defined!')
    else:
        logger.warning('`{}` is not a valid libprepkit name'.format(kit))
        sys.exit()

# docker images
docker_fn = srcdir('docker.config')
with open(docker_fn) as fh:
    dck = yaml.load(fh, Loader=Loader) or {}
    update_config2(config, dck)
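
The libprep lookup above expects the YAML file to map each kit name to a mapping that may carry a reference_db entry. A minimal sketch of that assumed shape and the corresponding lookup (the kit name and value are illustrative, not taken from the original config):

import yaml

libprep_yaml = """
TruSeq Stranded mRNA:
  reference_db: ensembl
"""

LIBPREP_CONF = yaml.safe_load(libprep_yaml) or {}
config = {"libprepkit": "TruSeq Stranded mRNA", "db": {}}

kit = config.get("libprepkit")
if kit in LIBPREP_CONF:
    LIBPREP = LIBPREP_CONF[kit]
    if "reference_db" in LIBPREP:
        config["db"]["reference_db"] = LIBPREP["reference_db"]

print(config["db"])  # {'reference_db': 'ensembl'}
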