import os
import re
import shutil
import subprocess as sp
from datetime import datetime
import time

from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider
from snakemake.exceptions import WorkflowError
from snakemake.common import lazy_property
from snakemake.logging import logger

if not shutil.which("gfal-copy"):
    raise WorkflowError(
        "The gfal-* commands need to be available for gfal remote support."
    )


class RemoteProvider(AbstractRemoteProvider):
    supports_default = True
    allows_directories = True

    def __init__(
        self,
        *args,
        keep_local=False,
        stay_on_remote=False,
        is_default=False,
        retry=5,
def job_selector_ilp(self, jobs):
    """
    Job scheduling by optimization of resource usage by solving ILP using pulp
    """
    import pulp
    from pulp import lpSum

    # assert self.resources["_cores"] > 0
    scheduled_jobs = {
        job: pulp.LpVariable(
            "job_{}".format(idx), lowBound=0, upBound=1, cat=pulp.LpInteger)
        for idx, job in enumerate(jobs)
    }

    temp_files = {
        temp_file for job in jobs for temp_file in self.dag.temp_input(job)
    }

    temp_job_improvement = {
        temp_file: pulp.LpVariable(
            "temp_file_{}".format(idx), lowBound=0, upBound=1, cat="Continuous")
        for idx, temp_file in enumerate(temp_files)
    }

    temp_file_deletable = {
        temp_file: pulp.LpVariable(
            "deletable_{}".format(idx), lowBound=0, upBound=1, cat=pulp.LpInteger)
        for idx, temp_file in enumerate(temp_files)
    }

    prob = pulp.LpProblem("JobScheduler", pulp.LpMaximize)

    total_temp_size = max(sum([temp_file.size for temp_file in temp_files]), 1)
    total_core_requirement = sum(
        [job.resources.get("_cores", 1) + 1 for job in jobs])

    # Objective function
    # Job priority > Core load
    # Core load > temp file removal
    # Instant removal > temp size
    prob += (
        2 * total_core_requirement * 2 * total_temp_size
        * lpSum([job.priority * scheduled_jobs[job] for job in jobs])
        + 2 * total_temp_size
        * lpSum([(job.resources.get("_cores", 1) + 1) * scheduled_jobs[job]
                 for job in jobs])
        + total_temp_size
        * lpSum([temp_file_deletable[temp_file] * temp_file.size
                 for temp_file in temp_files])
        + lpSum([temp_job_improvement[temp_file] * temp_file.size
                 for temp_file in temp_files]))

    # Constraints:
    for name in self.workflow.global_resources:
        prob += (lpSum([
            scheduled_jobs[job] * job.resources.get(name, 0) for job in jobs
        ]) <= self.resources[name])

    # Choose jobs that lead to "fastest" (minimum steps) removal of existing temp file
    for temp_file in temp_files:
        prob += temp_job_improvement[temp_file] <= lpSum([
            scheduled_jobs[job] * self.required_by_job(temp_file, job)
            for job in jobs
        ]) / lpSum([self.required_by_job(temp_file, job) for job in jobs])

        prob += temp_file_deletable[temp_file] <= temp_job_improvement[temp_file]

    # disable extensive logging
    pulp.apis.LpSolverDefault.msg = False

    try:
        if self.scheduler_ilp_solver:
            prob.solve(pulp.get_solver(self.scheduler_ilp_solver))
        else:
            prob.solve()
    except pulp.apis.core.PulpSolverError as e:
        raise WorkflowError(
            "Failed to solve the job scheduling problem with pulp. "
            "Please report a bug and use --scheduler greedy as a workaround:\n{}".format(e))

    selected_jobs = [
        job for job, variable in scheduled_jobs.items() if variable.value() == 1.0
    ]
    for name in self.workflow.global_resources:
        self.resources[name] -= sum(
            [job.resources.get(name, 0) for job in selected_jobs])
    return selected_jobs
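
# Minimal, self-contained sketch (not Snakemake's model) of the same idea:
# select a subset of jobs as 0/1 variables that maximizes total priority while
# fitting into a global core budget. The job names, priorities, and core
# counts are made up for illustration.
import pulp
from pulp import lpSum

toy_jobs = [("map", 5, 4), ("sort", 3, 2), ("index", 1, 1), ("plot", 2, 2)]
available_cores = 6

toy_prob = pulp.LpProblem("ToyJobScheduler", pulp.LpMaximize)
selected = {
    name: pulp.LpVariable(
        "job_{}".format(name), lowBound=0, upBound=1, cat=pulp.LpInteger)
    for name, _, _ in toy_jobs
}

# Objective: maximize the summed priority of scheduled jobs.
toy_prob += lpSum([prio * selected[name] for name, prio, _ in toy_jobs])

# Constraint: the scheduled jobs must fit into the available cores.
toy_prob += lpSum([cores * selected[name] for name, _, cores in toy_jobs]) <= available_cores

pulp.apis.LpSolverDefault.msg = False
toy_prob.solve()
print([name for name in selected if selected[name].value() == 1.0])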
def run_wrapper(run, input, output, params, wildcards, threads, resources, log,
                version, benchmark, benchmark_repeats, linemaps, debug=False):
    """
    Wrapper around the run method that handles directory creation and
    output file deletion on error.

    Arguments
    run       -- the run method
    input     -- list of input files
    output    -- list of output files
    wildcards -- so far processed wildcards
    threads   -- usable threads
    log       -- list of log files
    """
    if os.name == "posix" and debug:
        sys.stdin = open('/dev/stdin')
    try:
        runs = 1 if benchmark is None else benchmark_repeats
        wallclock = []
        for i in range(runs):
            w = time.time()
            # execute the actual run method.
            run(input, output, params, wildcards, threads, resources, log, version)
            w = time.time() - w
            wallclock.append(w)
    except (KeyboardInterrupt, SystemExit) as e:
        # re-raise the keyboard interrupt in order to record an error in the
        # scheduler but ignore it
        raise e
    except (Exception, BaseException) as ex:
        # this ensures that exception can be re-raised in the parent thread
        lineno, file = get_exception_origin(ex, linemaps)
        raise RuleException(
            format_error(ex, lineno, linemaps=linemaps, snakefile=file,
                         show_traceback=True))

    if benchmark is not None:
        try:
            with open(benchmark, "w") as f:
                print("s", "h:m:s", sep="\t", file=f)
                for t in wallclock:
                    print(t, str(datetime.timedelta(seconds=t)), sep="\t", file=f)
        except (Exception, BaseException) as ex:
            raise WorkflowError(ex)
import functools

# intra-module
from snakemake.remote.S3 import RemoteObject as S3RemoteObject, RemoteProvider as S3RemoteProvider
from snakemake.remote.S3 import S3Helper
from snakemake.decorators import dec_all_methods
from snakemake.exceptions import WorkflowError
from snakemake.logging import logger

try:
    # third-party
    import boto3
    from moto import mock_s3
    import filechunkio
except ImportError as e:
    raise WorkflowError(
        "The Python 3 packages 'moto', 'boto' and 'filechunkio' "
        + "need to be installed to use S3Mocked remote() file functionality. %s" % e.msg)


def noop():
    pass


def pickled_moto_wrapper(func):
    """
    This is a class decorator that in turn decorates all methods within a
    class to mock out boto calls with moto-simulated ones.
    Since the moto backends are not persistent across calls by default,
    the wrapper also pickles the bucket state after each function call,
    and restores it before execution. This way uploaded files are available
    for follow-on tasks. Since snakemake may execute with multiple threads,
    it also waits for the pickled bucket state file to be available before
    loading it in. This is a hacky alternative to using proper locks,
    but works ok in practice.
__email__ = "*****@*****.**"
__license__ = "MIT"

import os

# module-specific
from snakemake.remote import AbstractRemoteProvider, AbstractRemoteObject
from snakemake.exceptions import DropboxFileException, WorkflowError
from snakemake.utils import os_sync

try:
    # third-party modules
    import dropbox  # The official Dropbox API library
except ImportError as e:
    raise WorkflowError(
        "The Python 3 package 'dropbox' "
        "must be installed to use Dropbox remote() file "
        "functionality. %s" % e.msg)


class RemoteProvider(AbstractRemoteProvider):
    def __init__(self, *args, keep_local=False, stay_on_remote=False,
                 is_default=False, **kwargs):
        super(RemoteProvider, self).__init__(
            *args,
            keep_local=keep_local,
            stay_on_remote=stay_on_remote,
            is_default=is_default,
            **kwargs)
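
# Hedged usage sketch (not part of this module): how a provider like the one
# above is typically used from a Snakefile. The import path and the
# oauth2_access_token keyword follow the Snakemake remote-provider docs; the
# token and file path are placeholders. Inside a rule, the value returned by
# remote() goes into input: or output:.
from snakemake.remote.dropbox import RemoteProvider as DropboxRemoteProvider

DBox = DropboxRemoteProvider(oauth2_access_token="PLACEHOLDER_TOKEN")
inputs = DBox.remote("path/to/input.txt")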
from pytz import timezone

# module-specific
from snakemake.remote import AbstractRemoteProvider, AbstractRemoteObject
from snakemake.exceptions import WorkflowError

try:
    # third-party modules
    from irods.session import iRODSSession
    from irods.meta import iRODSMeta
    from irods.models import DataObject
    from irods.exception import CollectionDoesNotExist, DataObjectDoesNotExist
    import irods.keywords as kw
except ImportError as e:
    raise WorkflowError(
        "The Python 3 package 'python-irodsclient' "
        + "must be installed to use iRODS remote() file functionality. %s" % e.msg)

utc = datetime.utcfromtimestamp(0)


def _irods_session(*args, **kwargs):
    try:
        irods_env_file = os.environ["IRODS_ENVIRONMENT_FILE"]
    except KeyError:
        irods_env_file = kwargs.get(
            "irods_env_file",
            os.path.expanduser("~/.irods/irods_environment.json"))

    if not os.path.isfile(irods_env_file):
        raise WorkflowError(
def auto_report(dag, path, stylesheet=None):
    try:
        from jinja2 import Template, Environment, PackageLoader
    except ImportError as e:
        raise WorkflowError(
            "Python package jinja2 must be installed to create reports.")

    mode_embedded = True
    if path.endswith(".zip"):
        mode_embedded = False
    elif not path.endswith(".html"):
        raise WorkflowError("Report file does not end with .html or .zip")

    custom_stylesheet = None
    if stylesheet is not None:
        try:
            with open(stylesheet) as s:
                custom_stylesheet = s.read()
        except (Exception, BaseException) as e:
            raise WorkflowError("Unable to read custom report stylesheet.", e)

    logger.info("Creating report...")

    env = Environment(
        loader=PackageLoader("snakemake", "report"),
        trim_blocks=True,
        lstrip_blocks=True,
    )
    env.filters["get_resource_as_string"] = get_resource_as_string

    persistence = dag.workflow.persistence
    results = defaultdict(lambda: defaultdict(list))
    records = defaultdict(JobRecord)
    recorded_files = set()
    for job in dag.jobs:
        for f in itertools.chain(job.expanded_output, job.input):
            if is_flagged(f, "report") and f not in recorded_files:
                if not f.exists:
                    raise WorkflowError("File {} marked for report but does "
                                        "not exist.".format(f))
                report_obj = get_flag_value(f, "report")

                def register_file(f, wildcards_overwrite=None):
                    wildcards = wildcards_overwrite or job.wildcards
                    category = Category(
                        report_obj.category, wildcards=wildcards, job=job)
                    subcategory = Category(
                        report_obj.subcategory, wildcards=wildcards, job=job)

                    results[category][subcategory].append(
                        FileRecord(
                            f,
                            job,
                            report_obj.caption,
                            env,
                            category,
                            wildcards_overwrite=wildcards_overwrite,
                            mode_embedded=mode_embedded,
                        ))
                    recorded_files.add(f)

                if os.path.isfile(f):
                    register_file(f)
                if os.path.isdir(f):
                    if not isinstance(report_obj.patterns, list):
                        raise WorkflowError(
                            "Invalid patterns given for report. Must be list.",
                            rule=job.rule,
                        )
                    if not report_obj.patterns:
                        raise WorkflowError(
                            "Directory marked for report but no file patterns "
                            "given via patterns=[...]. See report documentation.",
                            rule=job.rule,
                        )
                    for pattern in report_obj.patterns:
                        pattern = os.path.join(f, pattern)
                        wildcards = glob_wildcards(pattern)._asdict()
                        names = wildcards.keys()
                        for w in zip(*wildcards.values()):
                            w = dict(zip(names, w))
                            w.update(job.wildcards_dict)
                            w = Wildcards(fromdict=w)
                            f = apply_wildcards(pattern, w)
                            register_file(f, wildcards_overwrite=w)

        for f in job.expanded_output:
            meta = persistence.metadata(f)
            if not meta:
                logger.warning("Missing metadata for file {}. Maybe metadata "
                               "was deleted or it was created using an older "
                               "version of Snakemake. This is a non critical "
                               "warning.".format(f))
                continue
            try:
                job_hash = meta["job_hash"]
                rule = meta["rule"]
                rec = records[(job_hash, rule)]
                rec.rule = rule
                rec.job = job
                rec.starttime = min(rec.starttime, meta["starttime"])
                rec.endtime = max(rec.endtime, meta["endtime"])
                rec.conda_env_file = None
                rec.conda_env = meta["conda_env"]
                rec.container_img_url = meta["container_img_url"]
                rec.output.append(f)
            except KeyError as e:
                print(e)
                logger.warning("Metadata for file {} was created with a too "
                               "old Snakemake version.".format(f))

    for subcats in results.values():
        for catresults in subcats.values():
            catresults.sort(key=lambda res: res.name)

    # prepare runtimes
    runtimes = [{
        "rule": rec.rule,
        "runtime": rec.endtime - rec.starttime
    } for rec in sorted(records.values(), key=lambda rec: rec.rule)]

    # prepare end times
    timeline = [{
        "rule": rec.rule,
        "starttime": datetime.datetime.fromtimestamp(rec.starttime).isoformat(),
        "endtime": datetime.datetime.fromtimestamp(rec.endtime).isoformat(),
    } for rec in sorted(records.values(), key=lambda rec: rec.rule)]

    # prepare per-rule information
    rules = defaultdict(list)
    for rec in records.values():
        rule = RuleRecord(rec.job, rec)
        if rec.rule not in rules:
            rules[rec.rule].append(rule)
        else:
            merged = False
            for other in rules[rec.rule]:
                if rule == other:
                    other.add(rec)
                    merged = True
                    break
            if not merged:
                rules[rec.rule].append(rule)

    # rulegraph
    rulegraph, xmax, ymax = rulegraph_d3_spec(dag)

    # configfiles
    configfiles = [ConfigfileRecord(f) for f in dag.workflow.configfiles]

    seen = set()
    files = [
        seen.add(res.target) or res
        for cat in results.values()
        for subcat in cat.values()
        for res in subcat
        if res.target not in seen
    ]

    rst_links = textwrap.dedent(
        """

        .. _Workflow: javascript:show_panel('workflow')
        .. _Statistics: javascript:show_panel('statistics')
        {% for cat, catresults in categories|dictsort %}
        .. _{{ cat.name }}: javascript:show_panel("{{ cat.id }}")
        {% endfor %}
        {% for res in files %}
        .. _{{ res.target }}: javascript:show_panel("{{ res.category.id }}")
        {% endfor %}
        """
    )

    for cat, subcats in results.items():
        for subcat, catresults in subcats.items():
            for res in catresults:
                res.render(env, rst_links, results, files)

    # global description
    text = ""
    if dag.workflow.report_text:
        with open(dag.workflow.report_text) as f:

            class Snakemake:
                config = dag.workflow.config

            text = f.read() + rst_links
            text = publish_parts(
                env.from_string(text).render(
                    snakemake=Snakemake, categories=results, files=files),
                writer_name="html",
            )["body"]

    # record time
    now = "{} {}".format(datetime.datetime.now().ctime(), time.tzname[0])
    results_size = sum(res.size for cat in results.values()
                       for subcat in cat.values() for res in subcat)

    try:
        from pygments.formatters import HtmlFormatter
    except ImportError:
        raise WorkflowError(
            "Python package pygments must be installed to create reports.")

    template = env.get_template("report.html")

    logger.info("Downloading resources and rendering HTML.")

    rendered = template.render(
        results=results,
        results_size=results_size,
        configfiles=configfiles,
        text=text,
        rulegraph_nodes=rulegraph["nodes"],
        rulegraph_links=rulegraph["links"],
        rulegraph_width=xmax + 20,
        rulegraph_height=ymax + 20,
        runtimes=runtimes,
        timeline=timeline,
        rules=[rec for recs in rules.values() for rec in recs],
        version=__version__,
        now=now,
        pygments_css=HtmlFormatter(style="trac").get_style_defs(".source"),
        custom_stylesheet=custom_stylesheet,
        mode_embedded=mode_embedded,
    )

    # TODO look into supporting .WARC format, also see (https://webrecorder.io)
    if not mode_embedded:
        with ZipFile(path, mode="w") as zipout:
            folder = Path(Path(path).stem)
            # store results in data folder
            for subcats in results.values():
                for catresults in subcats.values():
                    for result in catresults:
                        # write raw data
                        if result.table_content is not None:
                            zipout.writestr(
                                str(folder.joinpath(result.data_uri)),
                                result.table_content,
                            )
                        else:
                            zipout.write(result.path,
                                         str(folder.joinpath(result.data_uri)))
                        # write thumbnail
                        if result.is_img and result.png_content:
                            zipout.writestr(
                                str(folder.joinpath(result.png_uri)),
                                result.png_content)
            # write report html
            zipout.writestr(str(folder.joinpath("report.html")), rendered)
    else:
        with open(path, "w", encoding="utf-8") as htmlout:
            htmlout.write(rendered)
    logger.info("Report created: {}.".format(path))
import time

from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider
from snakemake.exceptions import WorkflowError, CheckSumMismatchException
from snakemake.common import lazy_property
import snakemake.io
from snakemake.utils import os_sync

try:
    import google.cloud
    from google.cloud import storage
    from google.api_core import retry
    from google_crc32c import Checksum
except ImportError as e:
    raise WorkflowError(
        "The Python 3 packages 'google-cloud-sdk' and 'google-crc32c' "
        "need to be installed to use GS remote() file functionality. %s" % e.msg)


def google_cloud_retry_predicate(ex):
    """Given an exception from Google Cloud, determine if it's one in the
    listing of transient errors (determined by function
    google.api_core.retry.if_transient_error(exception)) or determine if
    triggered by a hash mismatch due to a bad download. This function will
    return a boolean to indicate if retry should be done, and is typically
    used with the google.api_core.retry.Retry as a decorator (predicate).

    Arguments:
      ex (Exception) : the exception passed from the decorated function
    Returns: boolean to indicate doing retry (True) or not (False)
    """
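
# Hedged usage sketch (not part of the source module): the docstring above says
# the predicate is meant for google.api_core.retry.Retry; a decorated download
# helper might look like this. The bucket/blob/destination names are
# placeholders, and storage/retry are the imports from the top of this module.
@retry.Retry(predicate=google_cloud_retry_predicate)
def download_blob(bucket_name, blob_name, destination):
    # Any exception accepted by the predicate triggers a retry with backoff.
    client = storage.Client()
    client.bucket(bucket_name).blob(blob_name).download_to_filename(destination)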
def parse(self):
    m = re.search("(?P<bucket>[^/]*)/(?P<key>.*)", self.local_file())
    if len(m.groups()) != 2:
        raise WorkflowError("GS remote file {} does not have the form "
                            "<bucket>/<key>.".format(self.local_file()))
    return m
def _apply_wildcards(
    self,
    newitems,
    olditems,
    wildcards,
    concretize=None,
    check_return_type=True,
    omit_callable=False,
    mapping=None,
    no_flattening=False,
    aux_params=None,
    apply_path_modifier=True,
    property=None,
    incomplete_checkpoint_func=lambda e: None,
    allow_unpack=True,
):
    if aux_params is None:
        aux_params = dict()
    for name, item in olditems._allitems():
        start = len(newitems)
        is_unpack = is_flagged(item, "unpack")
        _is_callable = is_callable(item)

        if _is_callable:
            if omit_callable:
                continue
            item, incomplete = self.apply_input_function(
                item,
                wildcards,
                incomplete_checkpoint_func=incomplete_checkpoint_func,
                is_unpack=is_unpack,
                **aux_params
            )

        if apply_path_modifier:
            item = self.apply_path_modifier(item, property=property)

        if is_unpack and not incomplete:
            if not allow_unpack:
                raise WorkflowError(
                    "unpack() is not allowed with params. "
                    "Simply return a dictionary which can be directly used, "
                    "e.g. via {params[mykey]}."
                )
            # Sanity checks before interpreting unpack()
            if not isinstance(item, (list, dict)):
                raise WorkflowError(
                    "Can only use unpack() on list and dict", rule=self
                )
            if name:
                raise WorkflowError(
                    "Cannot combine named input file with unpack()", rule=self
                )
            # Allow streamlined code with/without unpack
            if isinstance(item, list):
                pairs = zip([None] * len(item), item)
            else:
                assert isinstance(item, dict)
                pairs = item.items()
        else:
            pairs = [(name, item)]

        for name, item in pairs:
            is_iterable = True
            if not_iterable(item) or no_flattening:
                item = [item]
                is_iterable = False
            for item_ in item:
                if (
                    check_return_type
                    and not isinstance(item_, str)
                    and not isinstance(item_, Path)
                ):
                    raise WorkflowError(
                        "Function did not return str or list of str.", rule=self
                    )
                concrete = concretize(item_, wildcards, _is_callable)
                newitems.append(concrete)
                if mapping is not None:
                    mapping[concrete] = item_

            if name:
                newitems._set_name(
                    name, start, end=len(newitems) if is_iterable else None
                )
                start = len(newitems)
def job_to_cwl(job, dag, outputs, inputs):
    """Convert a job with its dependencies to a CWL workflow step."""
    if job.dynamic_output:
        raise WorkflowError(
            "Dynamic output is not supported by CWL conversion.")
    for f in job.output:
        if os.path.isabs(f):
            raise WorkflowError("All output files have to be relative to the "
                                "working directory.")

    get_output_id = lambda job, i: "#main/job-{}/{}".format(job.jobid, i)

    dep_ids = {
        o: get_output_id(dep, i)
        for dep, files in dag.dependencies[job].items()
        for i, o in enumerate(dep.output)
        if o in files
    }
    files = [f for f in job.input if f not in dep_ids]
    if job.conda_env_file:
        files.append(os.path.relpath(job.conda_env_file))

    out = [get_output_id(job, i) for i, _ in enumerate(job.output)]

    def workdir_entry(i, f):
        location = "??inputs.input_files[{}].location??".format(i)
        if f.is_directory:
            entry = {
                "class": "Directory",
                "basename": os.path.basename(f),
                "location": location,
            }
        else:
            entry = {
                "class": "File",
                "basename": os.path.basename(f),
                "location": location,
            }
        return "$({})".format(
            json.dumps(outer_entry(f, entry)).replace('"??', "").replace(
                '??"', "")).replace('"', "'")

    def outer_entry(f, entry):
        parent = os.path.dirname(f)
        if parent:
            return outer_entry(
                parent,
                {
                    "class": "Directory",
                    "basename": os.path.basename(parent),
                    "listing": [entry],
                },
            )
        else:
            return entry

    if job in dag.targetjobs:
        # TODO this maps output files into the cwd after the workflow is complete.
        # We need to find a way to define subdirectories though. Otherwise,
        # there can be name clashes, and it will also become very crowded.
        outputs.append({
            "type": {
                "type": "array",
                "items": "File"
            },
            "outputSource": "#main/job-{}/output_files".format(job.jobid),
            "id": "#main/output/job-{}".format(job.jobid),
        })

    cwl = {
        "run": "#snakemake-job",
        "requirements": {
            "InitialWorkDirRequirement": {
                "listing": [{
                    "writable": True,
                    "entry": workdir_entry(i, f)
                } for i, f in enumerate(
                    chain(
                        files,
                        (f for dep in dag.dependencies[job] for f in dep.output),
                    ))]
            }
        },
        "in": {
            "cores": {
                "default": job.threads
            },
            "target_files": {
                "default": job.output._plainstrings()
            },
            "rules": {
                "default": [job.rule.name]
            },
        },
        "out": ["output_files"],
        "id": "#main/job-{}".format(job.jobid),
    }

    if files:
        inputs.append({
            "type": {
                "type": "array",
                "items": "File"
            },
            "default": [{
                "class": "File",
                "location": f
            } for f in files],
            "id": "#main/input/job-{}".format(job.jobid),
        })

    input_files = []
    if files:
        input_files.append("#main/input/job-{}".format(job.jobid))
    input_files.extend("#main/job-{}/output_files".format(dep.jobid)
                       for dep in dag.dependencies[job])

    cwl["in"]["input_files"] = {
        "source": input_files,
        "linkMerge": "merge_flattened"
    }
    return cwl
def _set_inoutput_item(self, item, output=False, name=None):
    """
    Set an item to be input or output.

    Arguments
    item     -- the item
    inoutput -- a Namedlist of either input or output items
    name     -- an optional name for the item
    """
    inoutput = self.output if output else self.input

    # Check to see if the item is a path, if so, just make it a string
    if isinstance(item, Path):
        item = str(item)
    if isinstance(item, str):
        if ON_WINDOWS:
            if isinstance(item, (_IOFile, AnnotatedString)):
                item = item.new_from(item.replace(os.sep, os.altsep))
            else:
                item = item.replace(os.sep, os.altsep)

        rule_dependency = None
        if isinstance(item, _IOFile) and item.rule and item in item.rule.output:
            rule_dependency = item.rule

        item = self.apply_path_modifier(
            item, property="output" if output else "input"
        )

        # Check to see that all flags are valid
        # Note that "remote", "dynamic", and "expand" are valid for both inputs and outputs.
        if isinstance(item, AnnotatedString):
            for flag in item.flags:
                if not output and flag in [
                    "protected",
                    "temp",
                    "temporary",
                    "directory",
                    "touch",
                    "pipe",
                ]:
                    logger.warning(
                        "The flag '{}' used in rule {} is only valid for outputs, not inputs.".format(
                            flag, self
                        )
                    )
                if output and flag in ["ancient"]:
                    logger.warning(
                        "The flag '{}' used in rule {} is only valid for inputs, not outputs.".format(
                            flag, self
                        )
                    )

        # add the rule to the dependencies
        if rule_dependency is not None:
            self.dependencies[item] = rule_dependency
        if output:
            item = self._update_item_wildcard_constraints(item)
        else:
            if (
                contains_wildcard_constraints(item)
                and self.workflow.mode != Mode.subprocess
            ):
                logger.warning(
                    "Wildcard constraints in inputs are ignored. (rule: {})".format(
                        self
                    )
                )

        if self.workflow.all_temp and output:
            # mark as temp if all output files shall be marked as temp
            item = snakemake.io.flag(item, "temp")

        # record rule if this is an output file
        _item = IOFile(item, rule=self)
        if is_flagged(item, "temp"):
            if output:
                self.temp_output.add(_item)
        if is_flagged(item, "protected"):
            if output:
                self.protected_output.add(_item)
        if is_flagged(item, "touch"):
            if output:
                self.touch_output.add(_item)
        if is_flagged(item, "dynamic"):
            if output:
                self.dynamic_output.add(_item)
            else:
                self.dynamic_input.add(_item)
        if is_flagged(item, "report"):
            report_obj = item.flags["report"]
            if report_obj.caption is not None:
                r = ReportObject(
                    self.workflow.current_basedir.join(report_obj.caption),
                    report_obj.category,
                    report_obj.subcategory,
                    report_obj.patterns,
                    report_obj.htmlindex,
                )
                item.flags["report"] = r
        if is_flagged(item, "subworkflow"):
            if output:
                raise SyntaxError("Only input files may refer to a subworkflow")
            else:
                # record the workflow this item comes from
                sub = item.flags["subworkflow"]
                if _item in self.subworkflow_input:
                    other = self.subworkflow_input[_item]
                    if sub != other:
                        raise WorkflowError(
                            "The input file {} is ambiguously "
                            "associated with two subworkflows "
                            "{} and {}.".format(item, sub, other),
                            rule=self,
                        )
                self.subworkflow_input[_item] = sub
        inoutput.append(_item)
        if name:
            inoutput._add_name(name)
    elif callable(item):
        if output:
            raise SyntaxError("Only input files can be specified as functions")
        inoutput.append(item)
        if name:
            inoutput._add_name(name)
    else:
        try:
            start = len(inoutput)
            for i in item:
                self._set_inoutput_item(i, output=output)
            if name:
                # if the list was named, make it accessible
                inoutput._set_name(name, start, end=len(inoutput))
        except TypeError:
            raise SyntaxError(
                "Input and output files have to be specified as strings or lists of strings."
            )
def version(self, version):
    if isinstance(version, str) and "\n" in version:
        raise WorkflowError(
            "Version string may not contain line breaks.", rule=self
        )
    self._version = version
def create_archive(self):
    """Create self-contained archive of environment."""
    from snakemake.shell import shell

    try:
        import yaml
    except ImportError:
        raise WorkflowError("Error importing PyYAML. "
                            "Please install PyYAML to archive workflows.")
    # importing requests locally because it interferes with instantiating conda environments
    import requests

    env_archive = self.archive_file
    if os.path.exists(env_archive):
        return env_archive

    try:
        # Download
        logger.info(
            "Downloading packages for conda environment {}...".format(self.file))
        os.makedirs(env_archive, exist_ok=True)
        try:
            out = shell.check_output(
                "conda list --explicit --prefix '{}'".format(self.path),
                stderr=subprocess.STDOUT,
            )
            logger.debug(out.decode())
        except subprocess.CalledProcessError as e:
            raise WorkflowError("Error exporting conda packages:\n" +
                                e.output.decode())
        with open(os.path.join(env_archive, "packages.txt"), "w") as pkg_list:
            for l in out.decode().split("\n"):
                if l and not l.startswith("#") and not l.startswith("@"):
                    pkg_url = l
                    logger.info(pkg_url)
                    parsed = urlparse(pkg_url)
                    pkg_name = os.path.basename(parsed.path)
                    # write package name to list
                    print(pkg_name, file=pkg_list)
                    # download package
                    pkg_path = os.path.join(env_archive, pkg_name)
                    with open(pkg_path, "wb") as copy:
                        r = requests.get(pkg_url)
                        r.raise_for_status()
                        copy.write(r.content)
                    try:
                        tarfile.open(pkg_path)
                    except:
                        raise WorkflowError(
                            "Package is invalid tar archive: {}".format(pkg_url))
    except (
        requests.exceptions.ChunkedEncodingError,
        requests.exceptions.HTTPError,
    ) as e:
        shutil.rmtree(env_archive)
        raise WorkflowError(
            "Error downloading conda package {}.".format(pkg_url))
    except (Exception, BaseException) as e:
        shutil.rmtree(env_archive)
        raise e
    return env_archive
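
# Hedged sketch of the input the loop above parses: `conda list --explicit`
# prints one package URL per line, preceded by '#' comments and an '@EXPLICIT'
# marker, which is why those prefixes are skipped. The URLs below are
# illustrative placeholders, not real packages.
example_explicit_spec = """\
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
@EXPLICIT
https://conda.anaconda.org/conda-forge/linux-64/examplepkg-1.0-0.tar.bz2
https://conda.anaconda.org/conda-forge/noarch/otherpkg-2.3-py_0.tar.bz2
"""

pkg_urls = [
    l for l in example_explicit_spec.split("\n")
    if l and not l.startswith("#") and not l.startswith("@")
]
print(pkg_urls)  # only the two package URLs remain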
__email__ = "*****@*****.**"
__license__ = "MIT"

import os
from contextlib import contextmanager

# module-specific
from snakemake.remote import AbstractRemoteProvider, DomainObject
from snakemake.exceptions import SFTPFileException, WorkflowError

try:
    # third-party modules
    import pysftp
except ImportError as e:
    raise WorkflowError(
        "The Python 3 package 'pysftp' "
        + "must be installed to use SFTP remote() file functionality. %s" % e.msg)


class RemoteProvider(AbstractRemoteProvider):
    supports_default = True
    allows_directories = True

    def __init__(self, *args, keep_local=False, stay_on_remote=False,
                 is_default=False, mkdir_remote=True, **kwargs):
def expand(*args, **wildcards):
    """
    Expand wildcards in given filepatterns.

    Arguments
    *args -- first arg: filepatterns as list or one single filepattern,
        second arg (optional): a function to combine wildcard values
        (itertools.product per default)
    **wildcards -- the wildcards as keyword arguments with their values as
        lists. If allow_missing=True is included, wildcards in filepattern
        without values will stay unformatted.
    """
    filepatterns = args[0]
    if len(args) == 1:
        combinator = product
    elif len(args) == 2:
        combinator = args[1]
    if isinstance(filepatterns, str) or isinstance(filepatterns, Path):
        filepatterns = [filepatterns]

    def path_to_str(f):
        if isinstance(f, Path):
            return str(f)
        return f

    filepatterns = list(map(path_to_str, filepatterns))

    if any(map(lambda f: getattr(f, "flags", {}), filepatterns)):
        raise WorkflowError(
            "Flags in file patterns given to expand() are invalid. "
            "Flags (e.g. temp(), directory()) have to be applied outside "
            "of expand (e.g. 'temp(expand(\"plots/{sample}.pdf\", sample=SAMPLES))')."
        )

    # check if remove missing is provided
    format_dict = dict
    if "allow_missing" in wildcards and wildcards["allow_missing"] is True:

        class FormatDict(dict):
            def __missing__(self, key):
                return "{" + key + "}"

        format_dict = FormatDict
        # check that remove missing is not a wildcard in the filepatterns
        for filepattern in filepatterns:
            if "allow_missing" in re.findall(r"{([^}\.[!:]+)", filepattern):
                format_dict = dict
                break

    # remove unused wildcards to avoid duplicate filepatterns
    wildcards = {
        filepattern: {
            k: v
            for k, v in wildcards.items()
            if k in re.findall(r"{([^}\.[!:]+)", filepattern)
        }
        for filepattern in filepatterns
    }

    def flatten(wildcards):
        for wildcard, values in wildcards.items():
            if isinstance(values, str) or not isinstance(
                    values, collections.abc.Iterable):
                values = [values]
            yield [(wildcard, value) for value in values]

    formatter = string.Formatter()
    try:
        return [
            formatter.vformat(filepattern, (), comb)
            for filepattern in filepatterns
            for comb in map(format_dict,
                            combinator(*flatten(wildcards[filepattern])))
        ]
    except KeyError as e:
        raise WildcardError("No values given for wildcard {}.".format(e))
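
# Hedged usage sketch for expand(); the sample/ext values are assumed for
# illustration only.
from snakemake.io import expand

print(expand("plots/{sample}.{ext}", sample=["a", "b"], ext=["png", "svg"]))
# default combinator is itertools.product:
# ['plots/a.png', 'plots/a.svg', 'plots/b.png', 'plots/b.svg']

print(expand("plots/{sample}.{ext}", zip, sample=["a", "b"], ext=["png", "svg"]))
# passing zip as the second positional arg pairs values positionally:
# ['plots/a.png', 'plots/b.svg']

print(expand("{sample}/{batch}.txt", sample=["a"], allow_missing=True))
# wildcards without values stay unformatted:
# ['a/{batch}.txt']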
import concurrent.futures

# snakemake specific
from snakemake.common import lazy_property

# module specific
from snakemake.exceptions import WorkflowError, AzureFileException
from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider

# service provider support
try:
    from azure.storage.common.cloudstorageaccount import (
        CloudStorageAccount as AzureStorageAccount,
    )
except ImportError as e:
    raise WorkflowError(
        "The Python 3 packages 'azure-storage' and 'azure-storage-common' "
        "need to be installed to use Azure Storage remote() file functionality. %s"
        % e.msg)


class RemoteProvider(AbstractRemoteProvider):
    supports_default = True

    def __init__(self, *args, keep_local=False, stay_on_remote=False,
                 is_default=False, **kwargs):
        super(RemoteProvider, self).__init__(*args, keep_local=keep_local,
def report(text, path, stylesheet=None, defaultenc="utf8", template=None,
           metadata=None, **files):
    """Create an HTML report using python docutils.

    This is deprecated in favor of the --report flag.

    Attention: This function needs Python docutils to be installed for the
    python installation you use with Snakemake.

    All keywords not listed below are interpreted as paths to files that shall
    be embedded into the document. The keywords will be available as link
    targets in the text. E.g. append a file as keyword arg via F1=input[0]
    and put a download link in the text like this:

    .. code:: python

        report('''
        ==============
        Report for ...
        ==============

        Some text. A link to an embedded file: F1_.

        Further text.
        ''', outputpath, F1=input[0])

    Instead of specifying each file as a keyword arg, you can also expand
    the input of your rule if it is completely named, e.g.:

        report('''
        Some text...
        ''', outputpath, **input)

    Args:
        text (str):       The "restructured text" as it is expected by
                          python docutils.
        path (str):       The path to the desired output file
        stylesheet (str): An optional path to a css file that defines the
                          style of the document. This defaults to
                          <your snakemake install>/report.css. Use the
                          default to get a hint how to create your own.
        defaultenc (str): The encoding that is reported to the browser for
                          embedded text files, defaults to utf8.
        template (str):   An optional path to a docutils HTML template.
        metadata (str):   E.g. an optional author name or email address.
    """
    if stylesheet is None:
        stylesheet = os.path.join(os.path.dirname(__file__), "report.css")
    try:
        import snakemake.report
    except ImportError:
        raise WorkflowError(
            "Python 3 package docutils needs to be installed to use the report function."
        )
    snakemake.report.report(text, path, stylesheet=stylesheet,
                            defaultenc=defaultenc, template=template,
                            metadata=metadata, **files)
def get_resource_as_string(url):
    r = requests.get(url)
    if r.status_code == requests.codes.ok:
        return r.text
    raise WorkflowError("Failed to download resource needed for "
                        "report: {}".format(url))
def _get_provenance_hash(self, job: Job):
    """
    Recursively calculate hash for the output of the given job and all
    upstream jobs in a blockchain fashion.

    This is based on an idea of Sven Nahnsen.
    Fails if job has more than one output file. The reason is that there
    is no way to generate a per-output file hash without generating the
    files. This hash, however, shall work without having to generate the
    files, just by describing all steps down to a given job.
    """
    if job in self._hashes:
        return self._hashes[job]

    workflow = job.dag.workflow
    h = hashlib.sha256()

    # Hash shell command or script.
    if job.is_shell:
        # We cannot use the formatted shell command, because it also contains
        # threads, resources, and filenames (which shall be irrelevant for the hash).
        h.update(job.rule.shellcmd.encode())
    elif job.is_script:
        _, source, _ = script.get_source(
            job.rule.script,
            basedir=job.rule.basedir,
            wildcards=job.wildcards,
            params=job.params,
        )
        h.update(source)
    elif job.is_notebook:
        _, source, _ = script.get_source(
            job.rule.notebook,
            basedir=job.rule.basedir,
            wildcards=job.wildcards,
            params=job.params,
        )
        h.update(source)
    elif job.is_wrapper:
        _, source, _ = script.get_source(
            wrapper.get_script(job.rule.wrapper, prefix=workflow.wrapper_prefix),
            basedir=job.rule.basedir,
            wildcards=job.wildcards,
            params=job.params,
        )
        h.update(source)

    # Hash params.
    for key, value in sorted(job.params._allitems()):
        if key is not None:
            h.update(key.encode())
        # If this raises a TypeError, we cannot calculate a reliable hash.
        try:
            h.update(json.dumps(value, sort_keys=True).encode())
        except TypeError as e:
            raise WorkflowError(
                "Rule {} cannot be cached, because params "
                "are not JSON serializable. "
                "Consider converting them into a suitable format "
                "if you are sure that caching is necessary. "
                "Otherwise, deactivate caching for this rule "
                "by removing it from the --cache command line argument "
                "or removing the cache: true directive from the rule itself."
                .format(job.rule.name),
                e,
            )

    # Hash input files that are not generated by other jobs (sorted by hash value).
    for file_hash in sorted(
            hash_file(f) for f in job.input
            if not any(f in depfiles
                       for depfiles in job.dag.dependencies[job].values())):
        h.update(file_hash.encode())

    # Hash used containers or conda environments.
    if workflow.use_conda and job.conda_env:
        if workflow.use_singularity and job.conda_env.container_img_url:
            h.update(job.conda_env.container_img_url.encode())
        h.update(job.conda_env.content)
    elif workflow.use_singularity and job.container_img_url:
        h.update(job.container_img_url.encode())

    # Generate hashes of dependencies, and add them in a blockchain fashion
    # (as input to the current hash, sorted by hash value).
    for dep_hash in sorted(
            self._get_provenance_hash(dep)
            for dep in set(job.dag.dependencies[job].keys())):
        h.update(dep_hash.encode())

    provenance_hash = h.hexdigest()

    # Store for re-use.
    self._hashes[job] = provenance_hash

    return provenance_hash
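
# Minimal, self-contained sketch (not Snakemake's implementation) of the
# chaining idea described above: each step's hash covers its own command plus
# the sorted hashes of its dependencies, so any upstream change changes all
# downstream hashes. The step names and commands are made up for illustration.
import hashlib

toy_steps = {
    "trim": ("trim --q 20", []),
    "map": ("map --genome hg38", ["trim"]),
    "call": ("call --min-depth 10", ["map"]),
}

_toy_hashes = {}


def toy_provenance_hash(step):
    if step in _toy_hashes:
        return _toy_hashes[step]
    cmd, deps = toy_steps[step]
    h = hashlib.sha256()
    h.update(cmd.encode())
    # Fold in dependency hashes, sorted so the result is deterministic.
    for dep_hash in sorted(toy_provenance_hash(dep) for dep in deps):
        h.update(dep_hash.encode())
    _toy_hashes[step] = h.hexdigest()
    return _toy_hashes[step]


print(toy_provenance_hash("call"))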
def __init__(self, rule, dag, wildcards_dict=None, format_wildcards=None,
             targetfile=None):
    self.rule = rule
    self.dag = dag

    # the targetfile that led to the job
    # it is important to record this, since we need it to submit the
    # job on a cluster. In contrast, an arbitrary targetfile could
    # lead to a different composition of wildcard values (in case of
    # ambiguity in matching).
    self.targetfile = targetfile
    self.wildcards_dict = wildcards_dict
    self.wildcards = Wildcards(fromdict=self.wildcards_dict)
    self._format_wildcards = (self.wildcards if format_wildcards is None
                              else Wildcards(fromdict=format_wildcards))

    self.input, input_mapping, self.dependencies = self.rule.expand_input(
        self.wildcards_dict)
    self.output, output_mapping = self.rule.expand_output(self.wildcards_dict)
    # other properties are lazy to be able to use additional parameters and
    # check already existing files
    self._params = None
    self._log = None
    self._benchmark = None
    self._resources = None
    self._conda_env_file = None
    self._conda_env = None
    self._group = None

    self.shadow_dir = None
    self._inputsize = None
    self.is_updated = False

    self._attempt = self.dag.workflow.attempt

    # TODO get rid of these
    self.dynamic_output, self.dynamic_input = set(), set()
    self.temp_output, self.protected_output = set(), set()
    self.touch_output = set()
    self.subworkflow_input = dict()
    for f in self.output:
        f_ = output_mapping[f]
        if f_ in self.rule.dynamic_output:
            self.dynamic_output.add(f)
        if f_ in self.rule.temp_output:
            self.temp_output.add(f)
        if f_ in self.rule.protected_output:
            self.protected_output.add(f)
        if f_ in self.rule.touch_output:
            self.touch_output.add(f)
    for f in self.input:
        f_ = input_mapping[f]
        if f_ in self.rule.dynamic_input:
            self.dynamic_input.add(f)
        if f_ in self.rule.subworkflow_input:
            self.subworkflow_input[f] = self.rule.subworkflow_input[f_]
        elif "subworkflow" in f.flags:
            sub = f.flags["subworkflow"]
            if f in self.subworkflow_input:
                other = self.subworkflow_input[f]
                if sub != other:
                    raise WorkflowError(
                        "The input file {} is ambiguously "
                        "associated with two subworkflows {} "
                        "and {}.".format(f, sub, other),
                        rule=self.rule)
            self.subworkflow_input[f] = sub
    self._hash = self.rule.__hash__()
    for wildcard_value in self.wildcards_dict.values():
        self._hash ^= wildcard_value.__hash__()
def _set_location(self, location=None):
    """The location is where the Google Life Sciences API is located.

    This can be meaningful if the requester has data residency requirements
    or multi-zone needs. To determine this value, we first use the locations
    API to determine locations available, and then compare them against:

    1. user specified location or prefix
    2. regions having the same prefix
    3. if cannot be satisfied, we throw an error.
    """
    # Derive available locations
    # See https://cloud.google.com/life-sciences/docs/concepts/locations
    locations = (self._api.projects().locations().list(
        name="projects/{}".format(self.project)).execute())

    locations = {
        x["locationId"]: x["name"] for x in locations.get("locations", [])
    }

    # Alert the user about locations available
    logger.debug("locations-available:\n%s" % "\n".join(locations))

    # If no locations, there is something wrong
    if not locations:
        raise WorkflowError("No locations found for Google Life Sciences API.")

    # First pass, attempt to match the user-specified location (or prefix)
    if location:
        if location in locations:
            self.location = locations[location]
            return

        # It could be that a prefix was provided
        for contender in locations:
            if contender.startswith(location):
                self.location = locations[contender]
                return

        # If we get here and no match, alert user.
        raise WorkflowError(
            "Location or prefix requested %s is not available." % location)

    # If we get here, we need to select location from regions
    for region in self.regions:
        if region in locations:
            self.location = locations[region]
            return

    # If we get here, choose based on prefix
    prefixes = set([r.split("-")[0] for r in self.regions])
    regexp = "^(%s)" % "|".join(prefixes)
    for location in locations:
        if re.search(regexp, location):
            self.location = locations[location]
            return

    # If we get here, total failure of finding location
    raise WorkflowError(
        "No locations available for regions! Please specify a location with "
        "--google-lifesciences-location or extend --google-lifesciences-regions "
        "to find a Life Sciences location.")
def validate(data, schema, set_default=True):
    """Validate data with JSON schema at given path.

    Args:
        data (object): data to validate. Can be a config dict or a pandas
            data frame.
        schema (str): Path to JSON schema used for validation. The schema
            can also be in YAML format. If validating a pandas data frame,
            the schema has to describe a row record (i.e., a dict with
            column names as keys pointing to row values). See
            http://json-schema.org. The path is interpreted relative to the
            Snakefile when this function is called.
        set_default (bool): set default values defined in schema. See
            http://python-jsonschema.readthedocs.io/en/latest/faq/ for more
            information
    """
    try:
        import jsonschema
        from jsonschema import validators, RefResolver
    except ImportError:
        raise WorkflowError(
            "The Python 3 package jsonschema must be installed "
            "in order to use the validate directive.")

    if not os.path.isabs(schema):
        frame = inspect.currentframe().f_back
        # if workflow object is not available this has not been started from a workflow
        if "workflow" in frame.f_globals:
            workflow = frame.f_globals["workflow"]
            schema = os.path.join(workflow.current_basedir, schema)

    schemafile = schema
    schema = _load_configfile(schema, filetype="Schema")
    resolver = RefResolver(
        urljoin("file:", schemafile),
        schema,
        handlers={
            "file": lambda uri: _load_configfile(re.sub("^file://", "", uri))
        },
    )

    # Taken from http://python-jsonschema.readthedocs.io/en/latest/faq/
    def extend_with_default(validator_class):
        validate_properties = validator_class.VALIDATORS["properties"]

        def set_defaults(validator, properties, instance, schema):
            for property, subschema in properties.items():
                if "default" in subschema:
                    instance.setdefault(property, subschema["default"])
            for error in validate_properties(validator, properties, instance,
                                             schema):
                yield error

        return validators.extend(validator_class, {"properties": set_defaults})

    Validator = validators.validator_for(schema)
    if Validator.META_SCHEMA["$schema"] != schema["$schema"]:
        logger.warning(
            "No validator found for JSON Schema version identifier '{}'".format(
                schema["$schema"]))
        logger.warning(
            "Defaulting to validator for JSON Schema version '{}'".format(
                Validator.META_SCHEMA["$schema"]))
        logger.warning("Note that schema file may not be validated correctly.")
    DefaultValidator = extend_with_default(Validator)

    if not isinstance(data, dict):
        try:
            import pandas as pd

            recordlist = []
            if isinstance(data, pd.DataFrame):
                for i, record in enumerate(data.to_dict("records")):
                    record = {
                        k: v for k, v in record.items() if not pd.isnull(v)
                    }
                    try:
                        if set_default:
                            DefaultValidator(
                                schema, resolver=resolver).validate(record)
                            recordlist.append(record)
                        else:
                            jsonschema.validate(record, schema, resolver=resolver)
                    except jsonschema.exceptions.ValidationError as e:
                        raise WorkflowError(
                            "Error validating row {} of data frame.".format(i), e)
                if set_default:
                    newdata = pd.DataFrame(recordlist, data.index)
                    newcol = ~newdata.columns.isin(data.columns)
                    n = len(data.columns)
                    for col in newdata.loc[:, newcol].columns:
                        data.insert(n, col, newdata.loc[:, col])
                        n = n + 1
                return
        except ImportError:
            pass
        raise WorkflowError("Unsupported data type for validation.")
    else:
        try:
            if set_default:
                DefaultValidator(schema, resolver=resolver).validate(data)
            else:
                jsonschema.validate(data, schema, resolver=resolver)
        except jsonschema.exceptions.ValidationError as e:
            raise WorkflowError("Error validating config file.", e)
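
# Hedged usage sketch; the file and schema paths are placeholders. In a
# Snakefile one would typically write
#
#     configfile: "config.yaml"
#     validate(config, schema="schemas/config.schema.yaml")
#
# and the same function can validate a pandas sample sheet row by row:
import pandas as pd

from snakemake.utils import validate

samples = pd.read_csv("samples.tsv", sep="\t").set_index("sample", drop=False)
validate(samples, schema="schemas/samples.schema.yaml")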
def _generate_job_resources(self, job):
    """given a particular job, generate the resources that it needs,
    including default regions and the virtual machine configuration
    """
    # Right now, do a best effort mapping of resources to instance types
    cores = job.resources.get("_cores", 1)
    mem_mb = job.resources.get("mem_mb", 15360)

    # IOPS performance proportional to disk size
    disk_mb = job.resources.get("disk_mb", 512000)

    # Convert mb to gb
    disk_gb = math.ceil(disk_mb / 1024)

    # Look for if the user wants an nvidia gpu
    gpu_count = job.resources.get("nvidia_gpu") or job.resources.get("gpu")
    gpu_model = job.resources.get("gpu_model")

    # If a gpu model is specified without a count, we assume 1
    if gpu_model and not gpu_count:
        gpu_count = 1

    # Update default resources using decided memory and disk
    self.workflow.default_resources = self.default_resources
    self.workflow.default_resources.args = [
        "mem_mb=%s" % mem_mb,
        "disk_mb=%s" % disk_mb,
    ]
    self.workflow.default_resources.parsed["mem_mb"] = mem_mb
    self.workflow.default_resources.parsed["disk_mb"] = disk_mb

    # Job resource specification can be overridden by gpu preferences
    self.machine_type_prefix = job.resources.get("machine_type")

    # If gpu wanted, limit to N1 general family, and update arguments
    if gpu_count:
        self._add_gpu(gpu_count)

    machine_types = self.get_available_machine_types()

    # Alert the user of machine_types available before filtering
    # https://cloud.google.com/compute/docs/machine-types
    logger.debug(
        "found {} machine types across regions {} before filtering; "
        "to increase selection, define fewer regions".format(
            len(machine_types), self.regions))

    # First pass - eliminate anything that is too low in cpu/memory
    keepers = dict()

    # Also keep track of max cpus and memory, in case none available
    max_cpu = 1
    max_mem = 15360

    for name, machine_type in machine_types.items():
        max_cpu = max(max_cpu, machine_type["guestCpus"])
        max_mem = max(max_mem, machine_type["memoryMb"])
        if machine_type["guestCpus"] < cores or machine_type["memoryMb"] < mem_mb:
            continue
        keepers[name] = machine_type

    # If a prefix is set, filter down to it
    if self.machine_type_prefix:
        machine_types = keepers
        keepers = dict()
        for name, machine_type in machine_types.items():
            if name.startswith(self.machine_type_prefix):
                keepers[name] = machine_type

    # If we don't have any contenders, workflow error
    if not keepers:
        if self.machine_type_prefix:
            raise WorkflowError(
                "Machine prefix {prefix} is too strict, or the resources cannot "
                "be satisfied, so there are no options "
                "available.".format(prefix=self.machine_type_prefix))
        else:
            raise WorkflowError(
                "You requested {requestMemory} MB memory, {requestCpu} cores. "
                "The maximum available are {availableMemory} MB memory and "
                "{availableCpu} cores. These resources cannot be satisfied. "
                "Please consider reducing the resource requirements of the "
                "corresponding rule.".format(
                    requestMemory=mem_mb,
                    requestCpu=cores,
                    availableCpu=max_cpu,
                    availableMemory=max_mem,
                ))

    # Now find (quasi) minimal to satisfy constraints
    machine_types = keepers

    # Select the first as the "smallest"
    smallest = list(machine_types.keys())[0]
    min_cores = machine_types[smallest]["guestCpus"]
    min_mem = machine_types[smallest]["memoryMb"]

    for name, machine_type in machine_types.items():
        if (machine_type["guestCpus"] < min_cores
                and machine_type["memoryMb"] < min_mem):
            smallest = name
            min_cores = machine_type["guestCpus"]
            min_mem = machine_type["memoryMb"]

    selected = machine_types[smallest]
    logger.debug("Selected machine type {}:{}".format(
        smallest, selected["description"]))

    virtual_machine = {
        "machineType": smallest,
        "labels": {"app": "snakemake"},
        "bootDiskSizeGb": disk_gb,
        "preemptible": job.rule.name in self.preemptible_rules,
    }

    # If the user wants gpus, add accelerators here
    if gpu_count:
        accelerator = self._get_accelerator(
            gpu_count, zone=selected["zone"], gpu_model=gpu_model)
        virtual_machine["accelerators"] = [{
            "type": accelerator["name"],
            "count": gpu_count
        }]

    resources = {"regions": self.regions, "virtualMachine": virtual_machine}
    return resources
def notebook(
    path,
    basedir,
    input,
    output,
    params,
    wildcards,
    threads,
    resources,
    log,
    config,
    rulename,
    conda_env,
    container_img,
    singularity_args,
    env_modules,
    bench_record,
    jobid,
    bench_iteration,
    cleanup_scripts,
    shadow_dir,
    edit=None,
):
    """
    Load a script from the given basedir + path and execute it.
    """
    draft = False
    if edit is not None:
        if urlparse(path).scheme == "":
            if not os.path.isabs(path):
                local_path = os.path.join(basedir, path)
            else:
                local_path = path
            if not os.path.exists(local_path):
                # draft the notebook, it does not exist yet
                language = None
                draft = True
                path = "file://{}".format(os.path.abspath(local_path))
                if path.endswith(".py.ipynb"):
                    language = "jupyter_python"
                elif path.endswith(".r.ipynb"):
                    language = "jupyter_r"
                else:
                    raise WorkflowError(
                        "Notebook to edit has to end on .py.ipynb or .r.ipynb in order "
                        "to decide which programming language shall be used.")
        else:
            raise WorkflowError(
                "Notebook {} is not local, but edit mode is only allowed for "
                "local notebooks.".format(path))

    if not draft:
        path, source, language = get_source(path, basedir)
    else:
        source = None

    exec_class = get_exec_class(language)

    executor = exec_class(
        path,
        source,
        basedir,
        input,
        output,
        params,
        wildcards,
        threads,
        resources,
        log,
        config,
        rulename,
        conda_env,
        container_img,
        singularity_args,
        env_modules,
        bench_record,
        jobid,
        bench_iteration,
        cleanup_scripts,
        shadow_dir,
    )

    if draft:
        executor.draft(listen=edit)
    else:
        executor.evaluate(edit=edit)
import re
import math
import functools
import concurrent.futures

# module-specific
from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider
from snakemake.exceptions import WorkflowError, S3FileException

try:
    # third-party modules
    import boto3
    import botocore
except ImportError as e:
    raise WorkflowError(
        "The Python 3 package 'boto3' "
        "needs to be installed to use S3 remote() file functionality. %s" % e.msg)


class RemoteProvider(AbstractRemoteProvider):
    supports_default = True

    def __init__(self, *args, stay_on_remote=False, keep_local=False,
                 is_default=False, **kwargs):
        super(RemoteProvider, self).__init__(*args, stay_on_remote=stay_on_remote,
def __init__(self, workflow, dag, cores,
             jobname="snakejob.{rulename}.{jobid}.sh",
             printreason=False,
             quiet=False,
             printshellcmds=False,
             latency_wait=3,
             benchmark_repeats=1,
             cluster_config=None):
    super().__init__(workflow, dag,
                     printreason=printreason,
                     quiet=quiet,
                     printshellcmds=printshellcmds,
                     latency_wait=latency_wait,
                     benchmark_repeats=benchmark_repeats)
    if workflow.snakemakepath is None:
        raise ValueError("Cluster executor needs to know the path "
                         "to the snakemake binary.")

    jobscript = workflow.jobscript
    if jobscript is None:
        jobscript = os.path.join(os.path.dirname(__file__),
                                 self.default_jobscript)
    try:
        with open(jobscript) as f:
            self.jobscript = f.read()
    except IOError as e:
        raise WorkflowError(e)

    if "jobid" not in get_wildcard_names(jobname):
        raise WorkflowError(
            'Defined jobname ("{}") has to contain the wildcard {{jobid}}.'.format(jobname))

    self.exec_job = (
        'cd {workflow.workdir_init} && '
        '{workflow.snakemakepath} --snakefile {workflow.snakefile} '
        '--force -j{cores} --keep-target-files '
        '--wait-for-files {job.input} --latency-wait {latency_wait} '
        '--benchmark-repeats {benchmark_repeats} '
        '{overwrite_workdir} {overwrite_config} --nocolor '
        '--notemp --quiet --no-hooks --nolock {target}')

    if printshellcmds:
        self.exec_job += " --printshellcmds "

    if not any(dag.dynamic_output_jobs):
        # disable restriction to target rule in case of dynamic rules!
        self.exec_job += " --allowed-rules {job.rule.name} "

    self.jobname = jobname
    self._tmpdir = None
    self.cores = cores if cores else ""
    self.cluster_config = cluster_config if cluster_config else dict()

    self.active_jobs = list()
    self.lock = threading.Lock()
    self.wait = True
    self.wait_thread = threading.Thread(target=self._wait_for_jobs)
    self.wait_thread.daemon = True
    self.wait_thread.start()
def check_broken_symlink(self):
    """Raise WorkflowError if file is a broken symlink."""
    if not self.exists_local and lstat(self.file):
        raise WorkflowError(
            "File {} seems to be a broken symlink.".format(self.file))
def get_packages():
    try:
        import pygments
    except ImportError:
        raise WorkflowError(
            "Python package pygments must be installed to create reports.")

    return Packages({
        "snakemake": Package(
            version=snakemake.__version__.split("+")[0],
            license_url="https://raw.githubusercontent.com/snakemake/snakemake/main/LICENSE.md",
        ),
        "pygments": Package(
            version=pygments.__version__,
            license_url="https://raw.githubusercontent.com/pygments/pygments/master/LICENSE",
        ),
        "tailwindcss": Package(
            version="3.0",
            license_url="https://raw.githubusercontent.com/tailwindlabs/tailwindcss/master/LICENSE",
            url="https://cdn.tailwindcss.com/[email protected],[email protected]",
        ),
        "react": Package(
            version="17",
            license_url="https://raw.githubusercontent.com/facebook/react/main/LICENSE",
            main="https://unpkg.com/react@17/umd/react.development.js",
            dom="https://unpkg.com/react-dom@17/umd/react-dom.development.js",
        ),
        "vega": Package(
            version="5.21",
            url="https://cdnjs.cloudflare.com/ajax/libs/vega/5.21.0/vega.js",
            license_url="https://raw.githubusercontent.com/vega/vega/main/LICENSE",
        ),
        "vega-lite": Package(
            version="5.2",
            url="https://cdnjs.cloudflare.com/ajax/libs/vega-lite/5.2.0/vega-lite.js",
            license_url="https://raw.githubusercontent.com/vega/vega-lite/next/LICENSE",
        ),
        "vega-embed": Package(
            version="6.20",
            url="https://cdnjs.cloudflare.com/ajax/libs/vega-embed/6.20.8/vega-embed.js",
            license_url="https://raw.githubusercontent.com/vega/vega-embed/next/LICENSE",
        ),
        "heroicons": Package(
            version="1.0.6",
            license_url="https://raw.githubusercontent.com/tailwindlabs/heroicons/master/LICENSE",
        ),
    })
import re
import json
import logging
import xml.etree.ElementTree as ET

# module-specific
from snakemake.remote import AbstractRemoteObject, AbstractRemoteProvider
from snakemake.exceptions import WorkflowError, NCBIFileException
from snakemake.logging import logger

try:
    # third-party modules
    from Bio import Entrez
except ImportError as e:
    raise WorkflowError(
        "The Python package 'biopython' needs to be installed to use NCBI "
        "Entrez remote() file functionality. %s" % e.msg)


class RemoteProvider(AbstractRemoteProvider):
    def __init__(self, *args, keep_local=False, stay_on_remote=False,
                 is_default=False, email=None, **kwargs):
        super(RemoteProvider, self).__init__(*args, keep_local=keep_local,
                                             stay_on_remote=stay_on_remote,
                                             is_default=is_default,