Example #1
import os
import re
import shutil
import subprocess as sp
from datetime import datetime
import time

from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider
from snakemake.exceptions import WorkflowError
from snakemake.common import lazy_property
from snakemake.logging import logger


if not shutil.which("gfal-copy"):
    raise WorkflowError(
        "The gfal-* commands need to be available for " "gfal remote support."
    )


class RemoteProvider(AbstractRemoteProvider):

    supports_default = True
    allows_directories = True

    def __init__(
        self,
        *args,
        keep_local=False,
        stay_on_remote=False,
        is_default=False,
        retry=5,
Example #2
    def job_selector_ilp(self, jobs):
        """
        Job scheduling by optimization of resource usage by solving ILP using pulp
        """
        import pulp
        from pulp import lpSum

        # assert self.resources["_cores"] > 0
        scheduled_jobs = {
            job: pulp.LpVariable(
                "job_{}".format(idx),
                lowBound=0,
                upBound=1,
                cat=pulp.LpInteger,
            )
            for idx, job in enumerate(jobs)
        }

        temp_files = {
            temp_file
            for job in jobs for temp_file in self.dag.temp_input(job)
        }

        temp_job_improvement = {
            temp_file: pulp.LpVariable("temp_file_{}".format(idx),
                                       lowBound=0,
                                       upBound=1,
                                       cat="Continuous")
            for idx, temp_file in enumerate(temp_files)
        }

        temp_file_deletable = {
            temp_file: pulp.LpVariable(
                "deletable_{}".format(idx),
                lowBound=0,
                upBound=1,
                cat=pulp.LpInteger,
            )
            for idx, temp_file in enumerate(temp_files)
        }
        prob = pulp.LpProblem("JobScheduler", pulp.LpMaximize)

        total_temp_size = max(
            sum([temp_file.size for temp_file in temp_files]), 1)
        total_core_requirement = sum(
            [job.resources.get("_cores", 1) + 1 for job in jobs])
        # Objective function
        # Job priority > Core load
        # Core load > temp file removal
        # Instant removal > temp size
        prob += (
            2 * total_core_requirement * 2 * total_temp_size *
            lpSum([job.priority * scheduled_jobs[job]
                   for job in jobs]) + 2 * total_temp_size *
            lpSum([(job.resources.get("_cores", 1) + 1) * scheduled_jobs[job]
                   for job in jobs]) + total_temp_size * lpSum([
                       temp_file_deletable[temp_file] * temp_file.size
                       for temp_file in temp_files
                   ]) + lpSum([
                       temp_job_improvement[temp_file] * temp_file.size
                       for temp_file in temp_files
                   ]))

        # Constraints:
        for name in self.workflow.global_resources:
            prob += (lpSum([
                scheduled_jobs[job] * job.resources.get(name, 0)
                for job in jobs
            ]) <= self.resources[name])

        # Choose jobs that lead to "fastest" (minimum steps) removal of existing temp file
        for temp_file in temp_files:
            prob += temp_job_improvement[temp_file] <= lpSum([
                scheduled_jobs[job] * self.required_by_job(temp_file, job)
                for job in jobs
            ]) / lpSum([self.required_by_job(temp_file, job) for job in jobs])

            prob += temp_file_deletable[temp_file] <= temp_job_improvement[
                temp_file]

        # disable extensive logging
        pulp.apis.LpSolverDefault.msg = False
        try:
            if self.scheduler_ilp_solver:
                prob.solve(pulp.get_solver(self.scheduler_ilp_solver))
            else:
                prob.solve()
        except pulp.apis.core.PulpSolverError as e:
            raise WorkflowError(
                "Failed to solve the job scheduling problem with pulp. "
                "Please report a bug and use --scheduler greedy as a workaround:\n{}"
                .format(e))

        selected_jobs = [
            job for job, variable in scheduled_jobs.items()
            if variable.value() == 1.0
        ]
        for name in self.workflow.global_resources:
            self.resources[name] -= sum(
                [job.resources.get(name, 0) for job in selected_jobs])
        return selected_jobs
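
A minimal, stand-alone sketch of the pulp pattern used above (binary job variables, an lpSum objective, a shared-resource constraint, then reading var.value() after solve()). The job names, priorities, and core counts below are made up for illustration.

import pulp
from pulp import lpSum

# toy data: name -> (priority, cores); not part of the scheduler above
jobs = {"align": (2, 4), "sort": (1, 2), "plot": (3, 1)}
available_cores = 4

x = {
    name: pulp.LpVariable("job_{}".format(i), lowBound=0, upBound=1, cat=pulp.LpInteger)
    for i, name in enumerate(jobs)
}

prob = pulp.LpProblem("ToyScheduler", pulp.LpMaximize)
# objective: prefer high-priority jobs
prob += lpSum([jobs[name][0] * x[name] for name in jobs])
# constraint: do not exceed the available cores
prob += lpSum([jobs[name][1] * x[name] for name in jobs]) <= available_cores

pulp.apis.LpSolverDefault.msg = False  # disable extensive solver logging
prob.solve()
print([name for name, var in x.items() if var.value() == 1.0])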
Example #3
def run_wrapper(run,
                input,
                output,
                params,
                wildcards,
                threads,
                resources,
                log,
                version,
                benchmark,
                benchmark_repeats,
                linemaps,
                debug=False):
    """
    Wrapper around the run method that handles directory creation and
    output file deletion on error.

    Arguments
    run       -- the run method
    input     -- list of input files
    output    -- list of output files
    wildcards -- so far processed wildcards
    threads   -- usable threads
    log       -- list of log files
    """
    if os.name == "posix" and debug:
        sys.stdin = open('/dev/stdin')

    try:
        runs = 1 if benchmark is None else benchmark_repeats
        wallclock = []
        for i in range(runs):
            w = time.time()
            # execute the actual run method.
            run(input, output, params, wildcards, threads, resources, log,
                version)
            w = time.time() - w
            wallclock.append(w)

    except (KeyboardInterrupt, SystemExit) as e:
        # re-raise the keyboard interrupt in order to record an error in the scheduler but ignore it
        raise e
    except (Exception, BaseException) as ex:
        # this ensures that exception can be re-raised in the parent thread
        lineno, file = get_exception_origin(ex, linemaps)
        raise RuleException(
            format_error(ex,
                         lineno,
                         linemaps=linemaps,
                         snakefile=file,
                         show_traceback=True))

    if benchmark is not None:
        try:
            with open(benchmark, "w") as f:
                print("s", "h:m:s", sep="\t", file=f)
                for t in wallclock:
                    print(t,
                          str(datetime.timedelta(seconds=t)),
                          sep="\t",
                          file=f)
        except (Exception, BaseException) as ex:
            raise WorkflowError(ex)
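
The benchmark branch above times the run method the requested number of times and writes a tab-separated file with seconds and h:m:s columns. Below is a stand-alone sketch of that loop; the callable and the output path are placeholders, not part of the original code.

import datetime
import time

def run_once():
    time.sleep(0.1)  # stands in for the rule's run method

repeats = 3
wallclock = []
for _ in range(repeats):
    start = time.time()
    run_once()
    wallclock.append(time.time() - start)

with open("benchmark.tsv", "w") as f:
    print("s", "h:m:s", sep="\t", file=f)
    for t in wallclock:
        print(t, str(datetime.timedelta(seconds=t)), sep="\t", file=f)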
Example #4
import functools

# intra-module
from snakemake.remote.S3 import RemoteObject as S3RemoteObject, RemoteProvider as S3RemoteProvider
from snakemake.remote.S3 import S3Helper
from snakemake.decorators import dec_all_methods
from snakemake.exceptions import WorkflowError
from snakemake.logging import logger

try:
    # third-party
    import boto3
    from moto import mock_s3
    import filechunkio
except ImportError as e:
    raise WorkflowError("The Python 3 packages 'moto', boto' and 'filechunkio' " + 
        "need to be installed to use S3Mocked remote() file functionality. %s" % e.msg)

def noop():
    pass

def pickled_moto_wrapper(func):
    """
        This is a class decorator that in turn decorates all methods within
        a class to mock out boto calls with moto-simulated ones.
        Since the moto backends are not presistent across calls by default, 
        the wrapper also pickles the bucket state after each function call,
        and restores it before execution. This way uploaded files are available
        for follow-on tasks. Since snakemake may execute with multiple threads
        it also waits for the pickled bucket state file to be available before
        loading it in. This is a hackey alternative to using proper locks,
        but works ok in practice.
Example #5
__email__ = "*****@*****.**"
__license__ = "MIT"

import os

# module-specific
from snakemake.remote import AbstractRemoteProvider, AbstractRemoteObject
from snakemake.exceptions import DropboxFileException, WorkflowError
from snakemake.utils import os_sync

try:
    # third-party modules
    import dropbox  # The official Dropbox API library
except ImportError as e:
    raise WorkflowError("The Python 3 package 'dropbox' "
                        "must be installed to use Dropbox remote() file "
                        "functionality. %s" % e.msg)


class RemoteProvider(AbstractRemoteProvider):
    def __init__(self,
                 *args,
                 keep_local=False,
                 stay_on_remote=False,
                 is_default=False,
                 **kwargs):
        super(RemoteProvider, self).__init__(*args,
                                             keep_local=keep_local,
                                             stay_on_remote=stay_on_remote,
                                             is_default=is_default,
                                             **kwargs)
Example #6
import os
from datetime import datetime

from pytz import timezone

# module-specific
from snakemake.remote import AbstractRemoteProvider, AbstractRemoteObject
from snakemake.exceptions import WorkflowError

try:
    # third-party modules
    from irods.session import iRODSSession
    from irods.meta import iRODSMeta
    from irods.models import DataObject
    from irods.exception import CollectionDoesNotExist, DataObjectDoesNotExist
    import irods.keywords as kw
except ImportError as e:
    raise WorkflowError(
        "The Python 3 package 'python-irodsclient' " +
        "must be installed to use iRODS remote() file functionality. %s" %
        e.msg)

utc = datetime.utcfromtimestamp(0)


def _irods_session(*args, **kwargs):
    try:
        irods_env_file = os.environ["IRODS_ENVIRONMENT_FILE"]
    except KeyError:
        irods_env_file = kwargs.get(
            "irods_env_file",
            os.path.expanduser("~/.irods/irods_environment.json"))

    if not os.path.isfile(irods_env_file):
        raise WorkflowError(
Example #7
def auto_report(dag, path, stylesheet=None):
    try:
        from jinja2 import Template, Environment, PackageLoader
    except ImportError as e:
        raise WorkflowError(
            "Python package jinja2 must be installed to create reports.")

    mode_embedded = True
    if path.endswith(".zip"):
        mode_embedded = False
    elif not path.endswith(".html"):
        raise WorkflowError("Report file does not end with .html or .zip")

    custom_stylesheet = None
    if stylesheet is not None:
        try:
            with open(stylesheet) as s:
                custom_stylesheet = s.read()
        except (Exception, BaseException) as e:
            raise WorkflowError("Unable to read custom report stylesheet.", e)

    logger.info("Creating report...")

    env = Environment(
        loader=PackageLoader("snakemake", "report"),
        trim_blocks=True,
        lstrip_blocks=True,
    )
    env.filters["get_resource_as_string"] = get_resource_as_string

    persistence = dag.workflow.persistence
    results = defaultdict(lambda: defaultdict(list))
    records = defaultdict(JobRecord)
    recorded_files = set()
    for job in dag.jobs:
        for f in itertools.chain(job.expanded_output, job.input):
            if is_flagged(f, "report") and f not in recorded_files:
                if not f.exists:
                    raise WorkflowError("File {} marked for report but does "
                                        "not exist.".format(f))
                report_obj = get_flag_value(f, "report")

                def register_file(f, wildcards_overwrite=None):
                    wildcards = wildcards_overwrite or job.wildcards
                    category = Category(report_obj.category,
                                        wildcards=wildcards,
                                        job=job)
                    subcategory = Category(report_obj.subcategory,
                                           wildcards=wildcards,
                                           job=job)

                    results[category][subcategory].append(
                        FileRecord(
                            f,
                            job,
                            report_obj.caption,
                            env,
                            category,
                            wildcards_overwrite=wildcards_overwrite,
                            mode_embedded=mode_embedded,
                        ))
                    recorded_files.add(f)

                if os.path.isfile(f):
                    register_file(f)
                if os.path.isdir(f):
                    if not isinstance(report_obj.patterns, list):
                        raise WorkflowError(
                            "Invalid patterns given for report. Must be list.",
                            rule=job.rule,
                        )
                    if not report_obj.patterns:
                        raise WorkflowError(
                            "Directory marked for report but no file patterns given via patterns=[...]. "
                            "See report documentation.",
                            rule=job.rule,
                        )
                    for pattern in report_obj.patterns:
                        pattern = os.path.join(f, pattern)
                        wildcards = glob_wildcards(pattern)._asdict()
                        names = wildcards.keys()
                        for w in zip(*wildcards.values()):
                            w = dict(zip(names, w))
                            w.update(job.wildcards_dict)
                            w = Wildcards(fromdict=w)
                            f = apply_wildcards(pattern, w)
                            register_file(f, wildcards_overwrite=w)

        for f in job.expanded_output:
            meta = persistence.metadata(f)
            if not meta:
                logger.warning("Missing metadata for file {}. Maybe metadata "
                               "was deleted or it was created using an older "
                               "version of Snakemake. This is a non critical "
                               "warning.".format(f))
                continue
            try:
                job_hash = meta["job_hash"]
                rule = meta["rule"]
                rec = records[(job_hash, rule)]
                rec.rule = rule
                rec.job = job
                rec.starttime = min(rec.starttime, meta["starttime"])
                rec.endtime = max(rec.endtime, meta["endtime"])
                rec.conda_env_file = None
                rec.conda_env = meta["conda_env"]
                rec.container_img_url = meta["container_img_url"]
                rec.output.append(f)
            except KeyError as e:
                print(e)
                logger.warning("Metadata for file {} was created with a too "
                               "old Snakemake version.".format(f))

    for subcats in results.values():
        for catresults in subcats.values():
            catresults.sort(key=lambda res: res.name)

    # prepare runtimes
    runtimes = [{
        "rule": rec.rule,
        "runtime": rec.endtime - rec.starttime
    } for rec in sorted(records.values(), key=lambda rec: rec.rule)]

    # prepare end times
    timeline = [{
        "rule":
        rec.rule,
        "starttime":
        datetime.datetime.fromtimestamp(rec.starttime).isoformat(),
        "endtime":
        datetime.datetime.fromtimestamp(rec.endtime).isoformat(),
    } for rec in sorted(records.values(), key=lambda rec: rec.rule)]

    # prepare per-rule information
    rules = defaultdict(list)
    for rec in records.values():
        rule = RuleRecord(rec.job, rec)
        if rec.rule not in rules:
            rules[rec.rule].append(rule)
        else:
            merged = False
            for other in rules[rec.rule]:
                if rule == other:
                    other.add(rec)
                    merged = True
                    break
            if not merged:
                rules[rec.rule].append(rule)

    # rulegraph
    rulegraph, xmax, ymax = rulegraph_d3_spec(dag)

    # configfiles
    configfiles = [ConfigfileRecord(f) for f in dag.workflow.configfiles]

    seen = set()
    files = [
        seen.add(res.target) or res for cat in results.values()
        for subcat in cat.values() for res in subcat if res.target not in seen
    ]

    rst_links = textwrap.dedent("""

    .. _Workflow: javascript:show_panel('workflow')
    .. _Statistics: javascript:show_panel('statistics')
    {% for cat, catresults in categories|dictsort %}
    .. _{{ cat.name }}: javascript:show_panel("{{ cat.id }}")
    {% endfor %}
    {% for res in files %}
    .. _{{ res.target }}: javascript:show_panel("{{ res.category.id }}")
    {% endfor %}
    """)
    for cat, subcats in results.items():
        for subcat, catresults in subcats.items():
            for res in catresults:
                res.render(env, rst_links, results, files)

    # global description
    text = ""
    if dag.workflow.report_text:
        with open(dag.workflow.report_text) as f:

            class Snakemake:
                config = dag.workflow.config

            text = f.read() + rst_links
            text = publish_parts(
                env.from_string(text).render(snakemake=Snakemake,
                                             categories=results,
                                             files=files),
                writer_name="html",
            )["body"]

    # record time
    now = "{} {}".format(datetime.datetime.now().ctime(), time.tzname[0])
    results_size = sum(res.size for cat in results.values()
                       for subcat in cat.values() for res in subcat)

    try:
        from pygments.formatters import HtmlFormatter
    except ImportError:
        raise WorkflowError(
            "Python package pygments must be installed to create reports.")

    template = env.get_template("report.html")

    logger.info("Downloading resources and rendering HTML.")

    rendered = template.render(
        results=results,
        results_size=results_size,
        configfiles=configfiles,
        text=text,
        rulegraph_nodes=rulegraph["nodes"],
        rulegraph_links=rulegraph["links"],
        rulegraph_width=xmax + 20,
        rulegraph_height=ymax + 20,
        runtimes=runtimes,
        timeline=timeline,
        rules=[rec for recs in rules.values() for rec in recs],
        version=__version__,
        now=now,
        pygments_css=HtmlFormatter(style="trac").get_style_defs(".source"),
        custom_stylesheet=custom_stylesheet,
        mode_embedded=mode_embedded,
    )

    # TODO look into supporting .WARC format, also see (https://webrecorder.io)

    if not mode_embedded:
        with ZipFile(path, mode="w") as zipout:
            folder = Path(Path(path).stem)
            # store results in data folder
            for subcats in results.values():
                for catresults in subcats.values():
                    for result in catresults:
                        # write raw data
                        if result.table_content is not None:
                            zipout.writestr(
                                str(folder.joinpath(result.data_uri)),
                                result.table_content,
                            )
                        else:
                            zipout.write(result.path,
                                         str(folder.joinpath(result.data_uri)))
                        # write thumbnail
                        if result.is_img and result.png_content:
                            zipout.writestr(
                                str(folder.joinpath(result.png_uri)),
                                result.png_content)

            # write report html
            zipout.writestr(str(folder.joinpath("report.html")), rendered)
    else:
        with open(path, "w", encoding="utf-8") as htmlout:
            htmlout.write(rendered)

    logger.info("Report created: {}.".format(path))
Example #8
import time

from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider
from snakemake.exceptions import WorkflowError, CheckSumMismatchException
from snakemake.common import lazy_property
import snakemake.io
from snakemake.utils import os_sync

try:
    import google.cloud
    from google.cloud import storage
    from google.api_core import retry
    from google_crc32c import Checksum
except ImportError as e:
    raise WorkflowError(
        "The Python 3 packages 'google-cloud-sdk' and `google-crc32c` "
        "need to be installed to use GS remote() file functionality. %s" %
        e.msg)


def google_cloud_retry_predicate(ex):
    """Given an exception from Google Cloud, determine if it's one in the
    listing of transient errors (determined by function
    google.api_core.retry.if_transient_error(exception)) or determine if
    triggered by a hash mismatch due to a bad download. This function will
    return a boolean to indicate if retry should be done, and is typically
    used with the google.api_core.retry.Retry as a decorator (predicate).

    Arguments:
      ex (Exception) : the exception passed from the decorated function
    Returns: boolean to indicate doing retry (True) or not (False)
    """
Example #9
    def parse(self):
        m = re.search("(?P<bucket>[^/]*)/(?P<key>.*)", self.local_file())
        if m is None or len(m.groups()) != 2:
            raise WorkflowError("GS remote file {} does not have the form "
                                "<bucket>/<key>.".format(self.local_file()))
        return m
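
A quick demonstration of the named groups used above; the path is illustrative.

import re

m = re.search(r"(?P<bucket>[^/]*)/(?P<key>.*)", "my-bucket/data/sample.bam")
print(m.group("bucket"))  # my-bucket
print(m.group("key"))     # data/sample.bam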
Example #10
    def _apply_wildcards(
        self,
        newitems,
        olditems,
        wildcards,
        concretize=None,
        check_return_type=True,
        omit_callable=False,
        mapping=None,
        no_flattening=False,
        aux_params=None,
        apply_path_modifier=True,
        property=None,
        incomplete_checkpoint_func=lambda e: None,
        allow_unpack=True,
    ):
        if aux_params is None:
            aux_params = dict()
        for name, item in olditems._allitems():
            start = len(newitems)
            is_unpack = is_flagged(item, "unpack")
            _is_callable = is_callable(item)
            # ensure 'incomplete' is defined even if the item is not callable
            incomplete = False

            if _is_callable:
                if omit_callable:
                    continue
                item, incomplete = self.apply_input_function(
                    item,
                    wildcards,
                    incomplete_checkpoint_func=incomplete_checkpoint_func,
                    is_unpack=is_unpack,
                    **aux_params
                )
                if apply_path_modifier:
                    item = self.apply_path_modifier(item, property=property)

            if is_unpack and not incomplete:
                if not allow_unpack:
                    raise WorkflowError(
                        "unpack() is not allowed with params. "
                        "Simply return a dictionary which can be directly ."
                        "used, e.g. via {params[mykey]}."
                    )
                # Sanity checks before interpreting unpack()
                if not isinstance(item, (list, dict)):
                    raise WorkflowError(
                        "Can only use unpack() on list and dict", rule=self
                    )
                if name:
                    raise WorkflowError(
                        "Cannot combine named input file with unpack()", rule=self
                    )
                # Allow streamlined code with/without unpack
                if isinstance(item, list):
                    pairs = zip([None] * len(item), item)
                else:
                    assert isinstance(item, dict)
                    pairs = item.items()
            else:
                pairs = [(name, item)]

            for name, item in pairs:
                is_iterable = True
                if not_iterable(item) or no_flattening:
                    item = [item]
                    is_iterable = False
                for item_ in item:
                    if (
                        check_return_type
                        and not isinstance(item_, str)
                        and not isinstance(item_, Path)
                    ):
                        raise WorkflowError(
                            "Function did not return str or list " "of str.", rule=self
                        )
                    concrete = concretize(item_, wildcards, _is_callable)
                    newitems.append(concrete)
                    if mapping is not None:
                        mapping[concrete] = item_

                if name:
                    newitems._set_name(
                        name, start, end=len(newitems) if is_iterable else None
                    )
                    start = len(newitems)
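
The unpack() branch above normalizes lists and dicts into (name, item) pairs so the following loop can treat both uniformly. A stand-alone sketch with made-up values:

def to_pairs(item):
    # list -> unnamed pairs, dict -> named pairs (mirrors the branch above)
    if isinstance(item, list):
        return list(zip([None] * len(item), item))
    assert isinstance(item, dict)
    return list(item.items())

print(to_pairs(["a.txt", "b.txt"]))                 # [(None, 'a.txt'), (None, 'b.txt')]
print(to_pairs({"reads": "a.fq", "ref": "ref.fa"})) # [('reads', 'a.fq'), ('ref', 'ref.fa')]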
Example #11
def job_to_cwl(job, dag, outputs, inputs):
    """Convert a job with its dependencies to a CWL workflow step."""

    if job.dynamic_output:
        raise WorkflowError(
            "Dynamic output is not supported by CWL conversion.")
    for f in job.output:
        if os.path.isabs(f):
            raise WorkflowError("All output files have to be relative to the "
                                "working directory.")

    get_output_id = lambda job, i: "#main/job-{}/{}".format(job.jobid, i)

    dep_ids = {
        o: get_output_id(dep, i)
        for dep, files in dag.dependencies[job].items()
        for i, o in enumerate(dep.output) if o in files
    }
    files = [f for f in job.input if f not in dep_ids]
    if job.conda_env_file:
        files.append(os.path.relpath(job.conda_env_file))

    out = [get_output_id(job, i) for i, _ in enumerate(job.output)]

    def workdir_entry(i, f):
        location = "??inputs.input_files[{}].location??".format(i)
        if f.is_directory:
            entry = {
                "class": "Directory",
                "basename": os.path.basename(f),
                "location": location,
            }
        else:
            entry = {
                "class": "File",
                "basename": os.path.basename(f),
                "location": location,
            }
        return "$({})".format(
            json.dumps(outer_entry(f, entry)).replace('"??', "").replace(
                '??"', "")).replace('"', "'")

    def outer_entry(f, entry):
        parent = os.path.dirname(f)
        if parent:
            return outer_entry(
                parent,
                {
                    "class": "Directory",
                    "basename": os.path.basename(parent),
                    "listing": [entry],
                },
            )
        else:
            return entry

    if job in dag.targetjobs:
        # TODO this maps output files into the cwd after the workflow is complete.
        # We need to find a way to define subdirectories though. Otherwise,
        # there can be name clashes, and it will also become very crowded.
        outputs.append({
            "type": {
                "type": "array",
                "items": "File"
            },
            "outputSource":
            "#main/job-{}/output_files".format(job.jobid),
            "id":
            "#main/output/job-{}".format(job.jobid),
        })

    cwl = {
        "run": "#snakemake-job",
        "requirements": {
            "InitialWorkDirRequirement": {
                "listing": [{
                    "writable": True,
                    "entry": workdir_entry(i, f)
                } for i, f in enumerate(
                    chain(
                        files,
                        (f for dep in dag.dependencies[job]
                         for f in dep.output),
                    ))]
            }
        },
        "in": {
            "cores": {
                "default": job.threads
            },
            "target_files": {
                "default": job.output._plainstrings()
            },
            "rules": {
                "default": [job.rule.name]
            },
        },
        "out": ["output_files"],
        "id": "#main/job-{}".format(job.jobid),
    }
    if files:
        inputs.append({
            "type": {
                "type": "array",
                "items": "File"
            },
            "default": [{
                "class": "File",
                "location": f
            } for f in files],
            "id":
            "#main/input/job-{}".format(job.jobid),
        })

    input_files = []
    if files:
        input_files.append("#main/input/job-{}".format(job.jobid))
    input_files.extend("#main/job-{}/output_files".format(dep.jobid)
                       for dep in dag.dependencies[job])

    cwl["in"]["input_files"] = {
        "source": input_files,
        "linkMerge": "merge_flattened"
    }

    return cwl
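
outer_entry() above wraps a file entry in one nested CWL Directory listing per parent directory. A stand-alone version with an illustrative path:

import json
import os

def outer_entry(f, entry):
    parent = os.path.dirname(f)
    if parent:
        return outer_entry(
            parent,
            {"class": "Directory", "basename": os.path.basename(parent), "listing": [entry]},
        )
    return entry

entry = {"class": "File", "basename": "sample.bam", "location": "??location??"}
print(json.dumps(outer_entry("results/mapped/sample.bam", entry), indent=2))
# -> Directory "results" containing Directory "mapped" containing the File entry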
Example #12
    def _set_inoutput_item(self, item, output=False, name=None):
        """
        Set an item to be input or output.

        Arguments
        item     -- the item
        inoutput -- a Namedlist of either input or output items
        name     -- an optional name for the item
        """
        inoutput = self.output if output else self.input

        # Check to see if the item is a path, if so, just make it a string
        if isinstance(item, Path):
            item = str(item)
        if isinstance(item, str):
            if ON_WINDOWS:
                if isinstance(item, (_IOFile, AnnotatedString)):
                    item = item.new_from(item.replace(os.sep, os.altsep))
                else:
                    item = item.replace(os.sep, os.altsep)

            rule_dependency = None
            if isinstance(item, _IOFile) and item.rule and item in item.rule.output:
                rule_dependency = item.rule

            item = self.apply_path_modifier(
                item, property="output" if output else "input"
            )

            # Check to see that all flags are valid
            # Note that "remote", "dynamic", and "expand" are valid for both inputs and outputs.
            if isinstance(item, AnnotatedString):
                for flag in item.flags:
                    if not output and flag in [
                        "protected",
                        "temp",
                        "temporary",
                        "directory",
                        "touch",
                        "pipe",
                    ]:
                        logger.warning(
                            "The flag '{}' used in rule {} is only valid for outputs, not inputs.".format(
                                flag, self
                            )
                        )
                    if output and flag in ["ancient"]:
                        logger.warning(
                            "The flag '{}' used in rule {} is only valid for inputs, not outputs.".format(
                                flag, self
                            )
                        )

            # add the rule to the dependencies
            if rule_dependency is not None:
                self.dependencies[item] = rule_dependency
            if output:
                item = self._update_item_wildcard_constraints(item)
            else:
                if (
                    contains_wildcard_constraints(item)
                    and self.workflow.mode != Mode.subprocess
                ):
                    logger.warning(
                        "Wildcard constraints in inputs are ignored. (rule: {})".format(
                            self
                        )
                    )

            if self.workflow.all_temp and output:
                # mark as temp if all output files shall be marked as temp
                item = snakemake.io.flag(item, "temp")

            # record rule if this is an output file
            _item = IOFile(item, rule=self)

            if is_flagged(item, "temp"):
                if output:
                    self.temp_output.add(_item)
            if is_flagged(item, "protected"):
                if output:
                    self.protected_output.add(_item)
            if is_flagged(item, "touch"):
                if output:
                    self.touch_output.add(_item)
            if is_flagged(item, "dynamic"):
                if output:
                    self.dynamic_output.add(_item)
                else:
                    self.dynamic_input.add(_item)
            if is_flagged(item, "report"):
                report_obj = item.flags["report"]
                if report_obj.caption is not None:
                    r = ReportObject(
                        self.workflow.current_basedir.join(report_obj.caption),
                        report_obj.category,
                        report_obj.subcategory,
                        report_obj.patterns,
                        report_obj.htmlindex,
                    )
                    item.flags["report"] = r
            if is_flagged(item, "subworkflow"):
                if output:
                    raise SyntaxError("Only input files may refer to a subworkflow")
                else:
                    # record the workflow this item comes from
                    sub = item.flags["subworkflow"]
                    if _item in self.subworkflow_input:
                        other = self.subworkflow_input[_item]
                        if sub != other:
                            raise WorkflowError(
                                "The input file {} is ambiguously "
                                "associated with two subworkflows "
                                "{} and {}.".format(item, sub, other),
                                rule=self,
                            )
                    self.subworkflow_input[_item] = sub
            inoutput.append(_item)
            if name:
                inoutput._add_name(name)
        elif callable(item):
            if output:
                raise SyntaxError("Only input files can be specified as functions")
            inoutput.append(item)
            if name:
                inoutput._add_name(name)
        else:
            try:
                start = len(inoutput)
                for i in item:
                    self._set_inoutput_item(i, output=output)
                if name:
                    # if the list was named, make it accessible
                    inoutput._set_name(name, start, end=len(inoutput))
            except TypeError:
                raise SyntaxError(
                    "Input and output files have to be specified as strings or lists of strings."
                )
Example #13
    def version(self, version):
        if isinstance(version, str) and "\n" in version:
            raise WorkflowError(
                "Version string may not contain line breaks.", rule=self
            )
        self._version = version
Example #14
    def create_archive(self):
        """Create self-contained archive of environment."""
        from snakemake.shell import shell

        try:
            import yaml
        except ImportError:
            raise WorkflowError("Error importing PyYAML. "
                                "Please install PyYAML to archive workflows.")
        # importing requests locally because it interferes with instantiating conda environments
        import requests

        env_archive = self.archive_file
        if os.path.exists(env_archive):
            return env_archive

        try:
            # Download
            logger.info(
                "Downloading packages for conda environment {}...".format(
                    self.file))
            os.makedirs(env_archive, exist_ok=True)
            try:
                out = shell.check_output(
                    "conda list --explicit --prefix '{}'".format(self.path),
                    stderr=subprocess.STDOUT,
                )
                logger.debug(out.decode())
            except subprocess.CalledProcessError as e:
                raise WorkflowError("Error exporting conda packages:\n" +
                                    e.output.decode())
            with open(os.path.join(env_archive, "packages.txt"),
                      "w") as pkg_list:
                for l in out.decode().split("\n"):
                    if l and not l.startswith("#") and not l.startswith("@"):
                        pkg_url = l
                        logger.info(pkg_url)
                        parsed = urlparse(pkg_url)
                        pkg_name = os.path.basename(parsed.path)
                        # write package name to list
                        print(pkg_name, file=pkg_list)
                        # download package
                        pkg_path = os.path.join(env_archive, pkg_name)
                        with open(pkg_path, "wb") as copy:
                            r = requests.get(pkg_url)
                            r.raise_for_status()
                            copy.write(r.content)
                        try:
                            tarfile.open(pkg_path)
                        except:
                            raise WorkflowError(
                                "Package is invalid tar archive: {}".format(
                                    pkg_url))
        except (
                requests.exceptions.ChunkedEncodingError,
                requests.exceptions.HTTPError,
        ) as e:
            shutil.rmtree(env_archive)
            raise WorkflowError(
                "Error downloading conda package {}.".format(pkg_url))
        except (Exception, BaseException) as e:
            shutil.rmtree(env_archive)
            raise e
        return env_archive
Example #15
__email__ = "*****@*****.**"
__license__ = "MIT"

import os
from contextlib import contextmanager

# module-specific
from snakemake.remote import AbstractRemoteProvider, DomainObject
from snakemake.exceptions import SFTPFileException, WorkflowError

try:
    # third-party modules
    import pysftp
except ImportError as e:
    raise WorkflowError(
        "The Python 3 package 'pysftp' " +
        "must be installed to use SFTP remote() file functionality. %s" %
        e.msg)


class RemoteProvider(AbstractRemoteProvider):

    supports_default = True
    allows_directories = True

    def __init__(self,
                 *args,
                 keep_local=False,
                 stay_on_remote=False,
                 is_default=False,
                 mkdir_remote=True,
                 **kwargs):
Example #16
def expand(*args, **wildcards):
    """
    Expand wildcards in given filepatterns.

    Arguments
    *args -- first arg: filepatterns as list or one single filepattern,
        second arg (optional): a function to combine wildcard values
        (itertools.product by default)
    **wildcards -- the wildcards as keyword arguments
        with their values as lists. If allow_missing=True is included,
        wildcards in the filepattern without values will stay unformatted.
    """
    filepatterns = args[0]
    if len(args) == 1:
        combinator = product
    elif len(args) == 2:
        combinator = args[1]
    if isinstance(filepatterns, str) or isinstance(filepatterns, Path):
        filepatterns = [filepatterns]

    def path_to_str(f):
        if isinstance(f, Path):
            return str(f)
        return f

    filepatterns = list(map(path_to_str, filepatterns))

    if any(map(lambda f: getattr(f, "flags", {}), filepatterns)):
        raise WorkflowError(
            "Flags in file patterns given to expand() are invalid. "
            "Flags (e.g. temp(), directory()) have to be applied outside "
            "of expand (e.g. 'temp(expand(\"plots/{sample}.pdf\", sample=SAMPLES))')."
        )

    # check if remove missing is provided
    format_dict = dict
    if "allow_missing" in wildcards and wildcards["allow_missing"] is True:

        class FormatDict(dict):
            def __missing__(self, key):
                return "{" + key + "}"

        format_dict = FormatDict
        # check that remove missing is not a wildcard in the filepatterns
        for filepattern in filepatterns:
            if "allow_missing" in re.findall(r"{([^}\.[!:]+)", filepattern):
                format_dict = dict
                break

    # remove unused wildcards to avoid duplicate filepatterns
    wildcards = {
        filepattern: {
            k: v
            for k, v in wildcards.items()
            if k in re.findall(r"{([^}\.[!:]+)", filepattern)
        }
        for filepattern in filepatterns
    }

    def flatten(wildcards):
        for wildcard, values in wildcards.items():
            if isinstance(
                    values,
                    str) or not isinstance(values, collections.abc.Iterable):
                values = [values]
            yield [(wildcard, value) for value in values]

    formatter = string.Formatter()
    try:
        return [
            formatter.vformat(filepattern, (), comb)
            for filepattern in filepatterns for comb in map(
                format_dict, combinator(*flatten(wildcards[filepattern])))
        ]
    except KeyError as e:
        raise WildcardError("No values given for wildcard {}.".format(e))
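
Quick usage examples; the expected outputs follow directly from the implementation above (assuming expand() is in scope).

print(expand("plots/{sample}.{ext}", sample=["a", "b"], ext=["pdf", "png"]))
# ['plots/a.pdf', 'plots/a.png', 'plots/b.pdf', 'plots/b.png']

# a custom combinator (second positional argument) pairs values instead of
# taking the product
print(expand("{sample}.{ext}", zip, sample=["a", "b"], ext=["pdf", "png"]))
# ['a.pdf', 'b.png']

# allow_missing=True keeps wildcards without values unformatted
print(expand("{sample}.{ext}", sample=["a", "b"], allow_missing=True))
# ['a.{ext}', 'b.{ext}']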
Example #17
import concurrent.futures

# snakemake specific
from snakemake.common import lazy_property

# module specific
from snakemake.exceptions import WorkflowError, AzureFileException
from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider

# service provider support
try:
    from azure.storage.common.cloudstorageaccount import (
        CloudStorageAccount as AzureStorageAccount, )
except ImportError as e:
    raise WorkflowError(
        "The Python 3 packages 'azure-storage' and 'azure-storage-common' "
        "need to be installed to use Azure Storage remote() file functionality. %s"
        % e.msg)


class RemoteProvider(AbstractRemoteProvider):

    supports_default = True

    def __init__(self,
                 *args,
                 keep_local=False,
                 stay_on_remote=False,
                 is_default=False,
                 **kwargs):
        super(RemoteProvider, self).__init__(*args,
                                             keep_local=keep_local,
Example #18
def report(text,
           path,
           stylesheet=None,
           defaultenc="utf8",
           template=None,
           metadata=None,
           **files):
    """Create an HTML report using python docutils.

    This is deprecated in favor of the --report flag.

    Attention: This function needs Python docutils to be installed for the
    python installation you use with Snakemake.

    All keywords not listed below are interpreted as paths to files that shall
    be embedded into the document. The keywords will be available as link
    targets in the text. E.g. append a file as keyword arg via F1=input[0]
    and put a download link in the text like this:

    .. code:: python

        report('''
        ==============
        Report for ...
        ==============

        Some text. A link to an embedded file: F1_.

        Further text.
        ''', outputpath, F1=input[0])

        Instead of specifying each file as a keyword arg, you can also expand
        the input of your rule if it is completely named, e.g.:

        report('''
        Some text...
        ''', outputpath, **input)

    Args:
        text (str):         The "restructured text" as it is expected by python docutils.
        path (str):         The path to the desired output file
        stylesheet (str):   An optional path to a css file that defines the style of the document. This defaults to <your snakemake install>/report.css. Use the default to get a hint how to create your own.
        defaultenc (str):   The encoding that is reported to the browser for embedded text files, defaults to utf8.
        template (str):     An optional path to a docutils HTML template.
        metadata (str):     E.g. an optional author name or email address.

    """
    if stylesheet is None:
        stylesheet = os.path.join(os.path.dirname(__file__), "report.css")
    try:
        import snakemake.report
    except ImportError:
        raise WorkflowError(
            "Python 3 package docutils needs to be installed to use the report function."
        )
    snakemake.report.report(text,
                            path,
                            stylesheet=stylesheet,
                            defaultenc=defaultenc,
                            template=template,
                            metadata=metadata,
                            **files)
Example #19
def get_resource_as_string(url):
    r = requests.get(url)
    if r.status_code == requests.codes.ok:
        return r.text
    raise WorkflowError("Failed to download resource needed for "
                        "report: {}".format(url))
Example #20
    def _get_provenance_hash(self, job: Job):
        """
        Recursively calculate hash for the output of the given job
        and all upstream jobs in a blockchain fashion.

        This is based on an idea of Sven Nahnsen.
        Fails if job has more than one output file. The reason is that there
        is no way to generate a per-output file hash without generating the files.
        This hash, however, shall work without having to generate the files,
        just by describing all steps down to a given job.
        """
        if job in self._hashes:
            return self._hashes[job]

        workflow = job.dag.workflow
        h = hashlib.sha256()

        # Hash shell command or script.
        if job.is_shell:
            # We cannot use the formatted shell command, because it also contains threads,
            # resources, and filenames (which shall be irrelevant for the hash).
            h.update(job.rule.shellcmd.encode())
        elif job.is_script:
            _, source, _ = script.get_source(
                job.rule.script,
                basedir=job.rule.basedir,
                wildcards=job.wildcards,
                params=job.params,
            )
            h.update(source)
        elif job.is_notebook:
            _, source, _ = script.get_source(
                job.rule.notebook,
                basedir=job.rule.basedir,
                wildcards=job.wildcards,
                params=job.params,
            )
            h.update(source)
        elif job.is_wrapper:
            _, source, _ = script.get_source(
                wrapper.get_script(job.rule.wrapper,
                                   prefix=workflow.wrapper_prefix),
                basedir=job.rule.basedir,
                wildcards=job.wildcards,
                params=job.params,
            )
            h.update(source)

        # Hash params.
        for key, value in sorted(job.params._allitems()):
            if key is not None:
                h.update(key.encode())
            # If this raises a TypeError, we cannot calculate a reliable hash.
            try:
                h.update(json.dumps(value, sort_keys=True).encode())
            except TypeError as e:
                raise WorkflowError(
                    "Rule {} cannot be cached, because params "
                    "are not JSON serializable. "
                    "Consider converting them into a suitable format "
                    "if you are sure that caching is necessary. "
                    "Otherwise, deactivate caching for this rule "
                    "by removing it from the --cache command line argument "
                    "or removing the cache: true directive from the rule itself."
                    .format(job.rule.name),
                    e,
                )

        # Hash input files that are not generated by other jobs (sorted by hash value).
        for file_hash in sorted(
                hash_file(f) for f in job.input if not any(
                    f in depfiles
                    for depfiles in job.dag.dependencies[job].values())):
            h.update(file_hash.encode())

        # Hash used containers or conda environments.
        if workflow.use_conda and job.conda_env:
            if workflow.use_singularity and job.conda_env.container_img_url:
                h.update(job.conda_env.container_img_url.encode())
            h.update(job.conda_env.content)
        elif workflow.use_singularity and job.container_img_url:
            h.update(job.container_img_url.encode())

        # Generate hashes of dependencies, and add them in a blockchain fashion (as input to the current hash, sorted by hash value).
        for dep_hash in sorted(
                self._get_provenance_hash(dep)
                for dep in set(job.dag.dependencies[job].keys())):
            h.update(dep_hash.encode())

        provenance_hash = h.hexdigest()

        # Store for re-use.
        self._hashes[job] = provenance_hash

        return provenance_hash
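
A stand-alone sketch of the blockchain-style idea above: hash a node's own content, then fold in the sorted hashes of its dependencies. The toy job graph is made up for illustration.

import hashlib

jobs = {
    "map": {"cmd": "bwa mem ...", "deps": ["trim"]},
    "trim": {"cmd": "cutadapt ...", "deps": []},
}
_hashes = {}

def provenance_hash(name):
    if name in _hashes:
        return _hashes[name]
    h = hashlib.sha256()
    h.update(jobs[name]["cmd"].encode())
    # fold in dependency hashes, sorted for determinism
    for dep_hash in sorted(provenance_hash(dep) for dep in jobs[name]["deps"]):
        h.update(dep_hash.encode())
    _hashes[name] = h.hexdigest()
    return _hashes[name]

print(provenance_hash("map"))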
Example #21
    def __init__(self,
                 rule,
                 dag,
                 wildcards_dict=None,
                 format_wildcards=None,
                 targetfile=None):
        self.rule = rule
        self.dag = dag

        # the targetfile that led to the job
        # it is important to record this, since we need it to submit the
        # job on a cluster. In contrast, an arbitrary targetfile could
        # lead to a different composition of wildcard values (in case of
        # ambiguity in matching).
        self.targetfile = targetfile
        self.wildcards_dict = wildcards_dict
        self.wildcards = Wildcards(fromdict=self.wildcards_dict)
        self._format_wildcards = (self.wildcards if format_wildcards is None
                                  else Wildcards(fromdict=format_wildcards))

        self.input, input_mapping, self.dependencies = self.rule.expand_input(
            self.wildcards_dict)
        self.output, output_mapping = self.rule.expand_output(
            self.wildcards_dict)
        # other properties are lazy to be able to use additional parameters and check already existing files
        self._params = None
        self._log = None
        self._benchmark = None
        self._resources = None
        self._conda_env_file = None
        self._conda_env = None
        self._group = None

        self.shadow_dir = None
        self._inputsize = None
        self.is_updated = False

        self._attempt = self.dag.workflow.attempt

        # TODO get rid of these
        self.dynamic_output, self.dynamic_input = set(), set()
        self.temp_output, self.protected_output = set(), set()
        self.touch_output = set()
        self.subworkflow_input = dict()
        for f in self.output:
            f_ = output_mapping[f]
            if f_ in self.rule.dynamic_output:
                self.dynamic_output.add(f)
            if f_ in self.rule.temp_output:
                self.temp_output.add(f)
            if f_ in self.rule.protected_output:
                self.protected_output.add(f)
            if f_ in self.rule.touch_output:
                self.touch_output.add(f)
        for f in self.input:
            f_ = input_mapping[f]
            if f_ in self.rule.dynamic_input:
                self.dynamic_input.add(f)
            if f_ in self.rule.subworkflow_input:
                self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
            elif "subworkflow" in f.flags:
                sub = f.flags["subworkflow"]
                if f in self.subworkflow_input:
                    other = self.subworkflow_input[f]
                    if sub != other:
                        raise WorkflowError(
                            "The input file {} is ambiguously "
                            "associated with two subworkflows {} "
                            "and {}.".format(f, sub, other),
                            rule=self.rule)
                self.subworkflow_input[f] = sub
        self._hash = self.rule.__hash__()
        for wildcard_value in self.wildcards_dict.values():
            self._hash ^= wildcard_value.__hash__()
Example #22
    def _set_location(self, location=None):
        """The location is where the Google Life Sciences API is located.
        This can be meaningful if the requester has data residency
        requirements or multi-zone needs. To determine this value,
        we first use the locations API to determine locations available,
        and then compare them against:

        1. user specified location or prefix
        2. regions having the same prefix
        3. if neither can be satisfied, we throw an error.
        """
        # Derive available locations
        # See https://cloud.google.com/life-sciences/docs/concepts/locations
        locations = (self._api.projects().locations().list(
            name="projects/{}".format(self.project)).execute())

        locations = {
            x["locationId"]: x["name"]
            for x in locations.get("locations", [])
        }

        # Alert the user about locations available
        logger.debug("locations-available:\n%s" % "\n".join(locations))

        # If no locations, there is something wrong
        if not locations:
            raise WorkflowError(
                "No locations found for Google Life Sciences API.")

        # First pass, attempt to match the user-specified location (or prefix)
        if location:
            if location in locations:
                self.location = locations[location]
                return

            # It could be that a prefix was provided
            for contender in locations:
                if contender.startswith(location):
                    self.location = locations[contender]
                    return

            # If we get here and no match, alert user.
            raise WorkflowError(
                "Location or prefix requested %s is not available." % location)

        # If we get here, we need to select location from regions
        for region in self.regions:
            if region in locations:
                self.location = locations[region]
                return

        # If we get here, choose based on prefix
        prefixes = set([r.split("-")[0] for r in self.regions])
        regexp = "^(%s)" % "|".join(prefixes)
        for location in locations:
            if re.search(regexp, location):
                self.location = locations[location]
                return

        # If we get here, total failure of finding location
        raise WorkflowError(
            "No locations available for regions! "
            "Please specify a location with --google-lifesciences-location "
            "or extend --google-lifesciences-regions to find a Life Sciences location."
        )
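
A tiny demonstration of the prefix fallback at the end of the method, with made-up regions and locations:

import re

regions = ["us-west1", "us-central1"]
locations = ["europe-west2", "us-central1"]

prefixes = set(r.split("-")[0] for r in regions)
regexp = "^(%s)" % "|".join(prefixes)
print(next((loc for loc in locations if re.search(regexp, loc)), None))  # us-central1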
Example #23
def validate(data, schema, set_default=True):
    """Validate data with JSON schema at given path.

    Args:
        data (object): data to validate. Can be a config dict or a pandas data frame.
        schema (str): Path to JSON schema used for validation. The schema can also be
            in YAML format. If validating a pandas data frame, the schema has to
            describe a row record (i.e., a dict with column names as keys pointing
            to row values). See http://json-schema.org. The path is interpreted
            relative to the Snakefile when this function is called.
        set_default (bool): set default values defined in schema. See
            http://python-jsonschema.readthedocs.io/en/latest/faq/ for more
            information
    """
    try:
        import jsonschema
        from jsonschema import validators, RefResolver
    except ImportError:
        raise WorkflowError(
            "The Python 3 package jsonschema must be installed "
            "in order to use the validate directive.")

    if not os.path.isabs(schema):
        frame = inspect.currentframe().f_back
        # if workflow object is not available this has not been started from a workflow
        if "workflow" in frame.f_globals:
            workflow = frame.f_globals["workflow"]
            schema = os.path.join(workflow.current_basedir, schema)

    schemafile = schema
    schema = _load_configfile(schema, filetype="Schema")
    resolver = RefResolver(
        urljoin("file:", schemafile),
        schema,
        handlers={
            "file": lambda uri: _load_configfile(re.sub("^file://", "", uri))
        },
    )

    # Taken from http://python-jsonschema.readthedocs.io/en/latest/faq/
    def extend_with_default(validator_class):
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for property, subschema in properties.items():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])

            for error in validate_properties(validator, properties, instance,
                                             schema):
                yield error

        return validators.extend(validator_class, {"properties": set_defaults})

    Validator = validators.validator_for(schema)
    if Validator.META_SCHEMA["$schema"] != schema["$schema"]:
        logger.warning(
            "No validator found for JSON Schema version identifier '{}'".
            format(schema["$schema"]))
        logger.warning(
            "Defaulting to validator for JSON Schema version '{}'".format(
                Validator.META_SCHEMA["$schema"]))
        logger.warning("Note that schema file may not be validated correctly.")
    DefaultValidator = extend_with_default(Validator)

    if not isinstance(data, dict):
        try:
            import pandas as pd

            recordlist = []
            if isinstance(data, pd.DataFrame):
                for i, record in enumerate(data.to_dict("records")):
                    record = {
                        k: v
                        for k, v in record.items() if not pd.isnull(v)
                    }
                    try:
                        if set_default:
                            DefaultValidator(
                                schema, resolver=resolver).validate(record)
                            recordlist.append(record)
                        else:
                            jsonschema.validate(record,
                                                schema,
                                                resolver=resolver)
                    except jsonschema.exceptions.ValidationError as e:
                        raise WorkflowError(
                            "Error validating row {} of data frame.".format(i),
                            e)
                if set_default:
                    newdata = pd.DataFrame(recordlist, data.index)
                    newcol = ~newdata.columns.isin(data.columns)
                    n = len(data.columns)
                    for col in newdata.loc[:, newcol].columns:
                        data.insert(n, col, newdata.loc[:, col])
                        n = n + 1
                return
        except ImportError:
            pass
        raise WorkflowError("Unsupported data type for validation.")
    else:
        try:
            if set_default:
                DefaultValidator(schema, resolver=resolver).validate(data)
            else:
                jsonschema.validate(data, schema, resolver=resolver)
        except jsonschema.exceptions.ValidationError as e:
            raise WorkflowError("Error validating config file.", e)
Example #24
0
    def _generate_job_resources(self, job):
        """given a particular job, generate the resources that it needs,
        including default regions and the virtual machine configuration
        """
        # Right now, do a best effort mapping of resources to instance types
        cores = job.resources.get("_cores", 1)
        mem_mb = job.resources.get("mem_mb", 15360)

        # IOPS performance proportional to disk size
        disk_mb = job.resources.get("disk_mb", 512000)

        # Convert mb to gb
        disk_gb = math.ceil(disk_mb / 1024)

        # Check whether the user wants an nvidia gpu
        gpu_count = job.resources.get("nvidia_gpu") or job.resources.get("gpu")
        gpu_model = job.resources.get("gpu_model")

        # If a gpu model is specified without a count, we assume 1
        if gpu_model and not gpu_count:
            gpu_count = 1

        # Update default resources using decided memory and disk
        self.workflow.default_resources = self.default_resources
        self.workflow.default_resources.args = [
            "mem_mb=%s" % mem_mb,
            "disk_mb=%s" % disk_mb,
        ]
        self.workflow.default_resources.parsed["mem_mb"] = mem_mb
        self.workflow.default_resources.parsed["disk_mb"] = disk_mb

        # Job resource specification can be overridden by gpu preferences
        self.machine_type_prefix = job.resources.get("machine_type")

        # If gpu wanted, limit to N1 general family, and update arguments
        if gpu_count:
            self._add_gpu(gpu_count)

        machine_types = self.get_available_machine_types()

        # Alert the user of machine_types available before filtering
        # https://cloud.google.com/compute/docs/machine-types
        logger.debug(
            "found {} machine types across regions {} before filtering "
            "to increase selection, define fewer regions".format(
                len(machine_types), self.regions))

        # First pass - eliminate anything that is too low in cpu/memory
        keepers = dict()

        # Also keep track of max cpus and memory, in case none available
        max_cpu = 1
        max_mem = 15360

        for name, machine_type in machine_types.items():
            max_cpu = max(max_cpu, machine_type["guestCpus"])
            max_mem = max(max_mem, machine_type["memoryMb"])
            if machine_type["guestCpus"] < cores or machine_type[
                    "memoryMb"] < mem_mb:
                continue
            keepers[name] = machine_type

        # If a prefix is set, filter down to it
        if self.machine_type_prefix:
            machine_types = keepers
            keepers = dict()
            for name, machine_type in machine_types.items():
                if name.startswith(self.machine_type_prefix):
                    keepers[name] = machine_type

        # If we don't have any contenders, raise a workflow error
        if not keepers:
            if self.machine_type_prefix:
                raise WorkflowError(
                    "Machine prefix {prefix} is too strict, or the resources cannot "
                    " be satisfied, so there are no options "
                    "available.".format(prefix=self.machine_type_prefix))
            else:
                raise WorkflowError(
                    "You requested {requestMemory} MB memory, {requestCpu} cores. "
                    "The maximum available are {availableMemory} MB memory and "
                    "{availableCpu} cores. These resources cannot be satisfied. "
                    "Please consider reducing the resource requirements of the "
                    "corresponding rule.".format(
                        requestMemory=mem_mb,
                        requestCpu=cores,
                        availableCpu=max_cpu,
                        availableMemory=max_mem,
                    ))

        # Now find (quasi) minimal to satisfy constraints
        machine_types = keepers

        # Select the first as the "smallest"
        smallest = list(machine_types.keys())[0]
        min_cores = machine_types[smallest]["guestCpus"]
        min_mem = machine_types[smallest]["memoryMb"]

        for name, machine_type in machine_types.items():
            if (machine_type["guestCpus"] < min_cores
                    and machine_type["memoryMb"] < min_mem):
                smallest = name
                min_cores = machine_type["guestCpus"]
                min_mem = machine_type["memoryMb"]

        selected = machine_types[smallest]
        logger.debug("Selected machine type {}:{}".format(
            smallest, selected["description"]))

        virtual_machine = {
            "machineType": smallest,
            "labels": {
                "app": "snakemake"
            },
            "bootDiskSizeGb": disk_gb,
            "preemptible": job.rule.name in self.preemptible_rules,
        }

        # If the user wants gpus, add accelerators here
        if gpu_count:
            accelerator = self._get_accelerator(gpu_count,
                                                zone=selected["zone"],
                                                gpu_model=gpu_model)
            virtual_machine["accelerators"] = [{
                "type": accelerator["name"],
                "count": gpu_count
            }]

        resources = {
            "regions": self.regions,
            "virtualMachine": virtual_machine
        }
        return resources
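
# A toy reproduction of the machine type selection above, assuming a small
# hand-written `machine_types` dict (the real one comes from
# get_available_machine_types()): keep every type that satisfies the requested
# cores/memory, then walk the survivors for a (quasi) minimal one.
machine_types = {
    "n1-standard-1": {"guestCpus": 1, "memoryMb": 3840},
    "n1-standard-4": {"guestCpus": 4, "memoryMb": 15360},
    "n1-standard-8": {"guestCpus": 8, "memoryMb": 30720},
}
cores, mem_mb = 2, 8000

keepers = {
    name: mt
    for name, mt in machine_types.items()
    if mt["guestCpus"] >= cores and mt["memoryMb"] >= mem_mb
}

smallest = next(iter(keepers))
for name, mt in keepers.items():
    if (mt["guestCpus"] < keepers[smallest]["guestCpus"]
            and mt["memoryMb"] < keepers[smallest]["memoryMb"]):
        smallest = name
# smallest == "n1-standard-4"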
Example #25
0
def notebook(
    path,
    basedir,
    input,
    output,
    params,
    wildcards,
    threads,
    resources,
    log,
    config,
    rulename,
    conda_env,
    container_img,
    singularity_args,
    env_modules,
    bench_record,
    jobid,
    bench_iteration,
    cleanup_scripts,
    shadow_dir,
    edit=None,
):
    """
    Load a notebook from the given basedir + path and execute it (or draft/edit it in edit mode).
    """
    draft = False
    if edit is not None:
        if urlparse(path).scheme == "":
            if not os.path.isabs(path):
                local_path = os.path.join(basedir, path)
            else:
                local_path = path
            if not os.path.exists(local_path):
                # draft the notebook, it does not exist yet
                language = None
                draft = True
                path = "file://{}".format(os.path.abspath(local_path))
                if path.endswith(".py.ipynb"):
                    language = "jupyter_python"
                elif path.endswith(".r.ipynb"):
                    language = "jupyter_r"
                else:
                    raise WorkflowError(
                        "Notebook to edit has to end on .py.ipynb or .r.ipynb in order "
                        "to decide which programming language shall be used.")
        else:
            raise WorkflowError(
                "Notebook {} is not local, but edit mode is only allowed for "
                "local notebooks.".format(path))

    if not draft:
        path, source, language = get_source(path, basedir)
    else:
        source = None

    exec_class = get_exec_class(language)

    executor = exec_class(
        path,
        source,
        basedir,
        input,
        output,
        params,
        wildcards,
        threads,
        resources,
        log,
        config,
        rulename,
        conda_env,
        container_img,
        singularity_args,
        env_modules,
        bench_record,
        jobid,
        bench_iteration,
        cleanup_scripts,
        shadow_dir,
    )

    if draft:
        executor.draft(listen=edit)
    else:
        executor.evaluate(edit=edit)
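
# A hedged sketch of how this entry point is reached from a workflow: a rule's
# `notebook` directive points at a .py.ipynb or .r.ipynb file, and running
# `snakemake --edit-notebook <target>` takes the draft/edit path above
# (rule and file names here are illustrative).
#
#     rule plot:
#         input:
#             "results/data.csv"
#         output:
#             "results/plot.png"
#         notebook:
#             "notebooks/plot.py.ipynb"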
Example #26
0
import re
import math
import functools
import concurrent.futures

# module-specific
from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider
from snakemake.exceptions import WorkflowError, S3FileException

try:
    # third-party modules
    import boto3
    import botocore
except ImportError as e:
    raise WorkflowError(
        "The Python 3 package 'boto3' "
        "needs to be installed to use S3 remote() file functionality. %s" %
        e.msg)


class RemoteProvider(AbstractRemoteProvider):

    supports_default = True

    def __init__(self,
                 *args,
                 stay_on_remote=False,
                 keep_local=False,
                 is_default=False,
                 **kwargs):
        super(RemoteProvider, self).__init__(*args,
                                             stay_on_remote=stay_on_remote,
Example #27
0
    def __init__(self,
                 workflow,
                 dag,
                 cores,
                 jobname="snakejob.{rulename}.{jobid}.sh",
                 printreason=False,
                 quiet=False,
                 printshellcmds=False,
                 latency_wait=3,
                 benchmark_repeats=1,
                 cluster_config=None):
        super().__init__(workflow,
                         dag,
                         printreason=printreason,
                         quiet=quiet,
                         printshellcmds=printshellcmds,
                         latency_wait=latency_wait,
                         benchmark_repeats=benchmark_repeats)
        if workflow.snakemakepath is None:
            raise ValueError("Cluster executor needs to know the path "
                             "to the snakemake binary.")

        jobscript = workflow.jobscript
        if jobscript is None:
            jobscript = os.path.join(os.path.dirname(__file__),
                                     self.default_jobscript)
        try:
            with open(jobscript) as f:
                self.jobscript = f.read()
        except IOError as e:
            raise WorkflowError(e)

        if not "jobid" in get_wildcard_names(jobname):
            raise WorkflowError(
                "Defined jobname (\"{}\") has to contain the wildcard {jobid}."
            )

        self.exec_job = (
            'cd {workflow.workdir_init} && '
            '{workflow.snakemakepath} --snakefile {workflow.snakefile} '
            '--force -j{cores} --keep-target-files '
            '--wait-for-files {job.input} --latency-wait {latency_wait} '
            '--benchmark-repeats {benchmark_repeats} '
            '{overwrite_workdir} {overwrite_config} --nocolor '
            '--notemp --quiet --no-hooks --nolock {target}')

        if printshellcmds:
            self.exec_job += " --printshellcmds "

        if not any(dag.dynamic_output_jobs):
            # disable restriction to target rule in case of dynamic rules!
            self.exec_job += " --allowed-rules {job.rule.name} "
        self.jobname = jobname
        self._tmpdir = None
        self.cores = cores if cores else ""
        self.cluster_config = cluster_config if cluster_config else dict()

        self.active_jobs = list()
        self.lock = threading.Lock()
        self.wait = True
        self.wait_thread = threading.Thread(target=self._wait_for_jobs)
        self.wait_thread.daemon = True
        self.wait_thread.start()
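
# A minimal sketch of why the {jobid} wildcard check above matters: the jobname
# template is rendered per job, so every submitted job script gets a unique,
# identifiable name (values below are illustrative).
jobname = "snakejob.{rulename}.{jobid}.sh"
print(jobname.format(rulename="map_reads", jobid=42))
# -> snakejob.map_reads.42.sh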
Example #28
0
    def check_broken_symlink(self):
        """Raise WorkflowError if file is a broken symlink."""
        if not self.exists_local and lstat(self.file):
            raise WorkflowError("File {} seems to be a broken symlink.".format(
                self.file))
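
# A standalone sketch of the same check using plain os functions (sketch only,
# not the snakemake.io helper): a broken symlink is one that lstat()/islink()
# can still see although its target no longer exists.
import os


def is_broken_symlink(path):
    return os.path.islink(path) and not os.path.exists(path)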
Example #29
0
def get_packages():
    try:
        import pygments
    except ImportError:
        raise WorkflowError(
            "Python package pygments must be installed to create reports.")

    return Packages({
        "snakemake":
        Package(
            version=snakemake.__version__.split("+")[0],
            license_url=
            "https://raw.githubusercontent.com/snakemake/snakemake/main/LICENSE.md",
        ),
        "pygments":
        Package(
            version=pygments.__version__,
            license_url=
            "https://raw.githubusercontent.com/pygments/pygments/master/LICENSE",
        ),
        "tailwindcss":
        Package(
            version="3.0",
            license_url=
            "https://raw.githubusercontent.com/tailwindlabs/tailwindcss/master/LICENSE",
            url=
            "https://cdn.tailwindcss.com/[email protected],[email protected]",
        ),
        "react":
        Package(
            version="17",
            license_url=
            "https://raw.githubusercontent.com/facebook/react/main/LICENSE",
            main="https://unpkg.com/react@17/umd/react.development.js",
            dom="https://unpkg.com/react-dom@17/umd/react-dom.development.js",
        ),
        "vega":
        Package(
            version="5.21",
            url="https://cdnjs.cloudflare.com/ajax/libs/vega/5.21.0/vega.js",
            license_url=
            "https://raw.githubusercontent.com/vega/vega/main/LICENSE",
        ),
        "vega-lite":
        Package(
            version="5.2",
            url=
            "https://cdnjs.cloudflare.com/ajax/libs/vega-lite/5.2.0/vega-lite.js",
            license_url=
            "https://raw.githubusercontent.com/vega/vega-lite/next/LICENSE",
        ),
        "vega-embed":
        Package(
            version="6.20",
            url=
            "https://cdnjs.cloudflare.com/ajax/libs/vega-embed/6.20.8/vega-embed.js",
            license_url=
            "https://raw.githubusercontent.com/vega/vega-embed/next/LICENSE",
        ),
        "heroicons":
        Package(
            version="1.0.6",
            license_url=
            "https://raw.githubusercontent.com/tailwindlabs/heroicons/master/LICENSE",
        ),
    })
Example #30
0
import re
import json
import logging
import xml.etree.ElementTree as ET

# module-specific
from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider
from snakemake.exceptions import WorkflowError, NCBIFileException
from snakemake.logging import logger

try:
    # third-party modules
    from Bio import Entrez
except ImportError as e:
    raise WorkflowError(
        "The Python package 'biopython' needs to be installed to use NCBI Entrez remote() file functionality. %s"
        % e.msg)


class RemoteProvider(AbstractRemoteProvider):
    def __init__(self,
                 *args,
                 keep_local=False,
                 stay_on_remote=False,
                 is_default=False,
                 email=None,
                 **kwargs):
        super(RemoteProvider, self).__init__(*args,
                                             keep_local=keep_local,
                                             stay_on_remote=stay_on_remote,
                                             is_default=is_default,