def get_bustools_rid(params):
    """
    Extract the position of the fastq containing reads from the bustools -x argument.

    The read_id is the first position of the last triplet in the bc:umi:read
    string, or is hard-coded for the short-hand technology syntax.

    In: -x 10xv3                    -> read_id=1
    In: -x 0,0,16:0,16,26:1,0,0     -> read_id=1

    Args:
        params: string containing the -x argument (other arguments may be present).

    Returns:
        The read id as an int.

    If neither a bc:umi:read string nor a known short-hand technology is
    found, an error is logged and the process is terminated with exit code 1.
    """
    kb_tech_dict = {
        "10xv2": 1,
        "10xv3": 1,
        "celseq": 1,
        "celseq2": 1,
        "dropseq": 1,
        "scrubseq": 1,
        "indropsv1": 1,
        "indropsv2": 0,
    }

    # Raw strings keep the regex escapes (\S, \d, \b) away from Python's own
    # string-escape processing.
    bus_regex = r"(?<!\S)([0-1],\d*,\d*:){2}([0-1],0,0)(?!\S)"
    bus_regex_short = r"(?i)\b(10XV2|10XV3|CELSEQ|CELSEQ2|DROPSEQ|SCRUBSEQ|INDROPSV1|INDROPSV2)\b"

    # An explicit bc:umi:read string is checked before the short-hand names.
    custom = re.search(bus_regex, params)
    if custom is not None:
        last_triplet = custom.group(0).split(":")[-1]
        return int(last_triplet.split(",")[0])

    # Check for occurrence of short-hand tech
    short_hand = re.search(bus_regex_short, params)
    if short_hand is not None:
        return kb_tech_dict[short_hand.group(0).lower()]

    logger.error(
        "Not a valid BUS(barcode:umi:set) string. Please check -x argument"
    )
    os._exit(1)  # noqa
def samples2metadata_local(samples: List[str], config: dict, logger) -> dict:
    """
    (try to) get the metadata of local samples

    For every sample the fastq directory is probed for a single-end file
    first and for the paired-end files second. Samples that are not found
    locally but carry a GSM/SRX/SRR/ERR/DRR prefix are left out of the
    result; any other missing sample is logged and raises
    TerminatedException.

    Returns a dict mapping each locally found sample to {"layout": ...}.
    """
    metadata = {}
    for sample in samples:
        single_end = expand(f'{{fastq_dir}}/{sample}.{{fqsuffix}}.gz', **config)[0]
        if os.path.exists(single_end):
            metadata[sample] = {"layout": "SINGLE"}
            continue

        paired_end = expand(
            f'{{fastq_dir}}/{sample}_{{fqext}}.{{fqsuffix}}.gz', **config)
        if all(os.path.exists(path) for path in paired_end):
            metadata[sample] = {"layout": "PAIRED"}
            continue

        # not local, but it looks like a public accession: skip it here
        if sample.startswith(('GSM', 'SRX', 'SRR', 'ERR', 'DRR')):
            continue

        logger.error(
            f"\nsample {sample} was not found..\n"
            f"We checked for SE file:\n"
            f"\t{config['fastq_dir']}/{sample}.{config['fqsuffix']}.gz \n"
            f"and for PE files:\n"
            f"\t{config['fastq_dir']}/{sample}_{config['fqext1']}.{config['fqsuffix']}.gz \n"
            f"\t{config['fastq_dir']}/{sample}_{config['fqext2']}.{config['fqsuffix']}.gz \n"
            f"and since the sample did not start with either GSM, SRX, SRR, ERR, and DRR we "
            f"couldn't find it online..\n")
        raise TerminatedException

    return metadata
def schedule(self):
    """ Schedule jobs that are ready, maximizing cpu usage.

    Loops until there is nothing left to run or the run is aborted.

    Returns:
        ``not self._errors`` once no job is open or running;
        False when stopping because of errors (and ``keepgoing`` is off)
        or after a KeyboardInterrupt/SystemExit.
    """
    try:
        while True:
            # work around so that the wait does not prevent keyboard interrupts
            # (wait with a 1s timeout in a loop instead of blocking forever)
            while not self._open_jobs.wait(1):
                pass

            # obtain needrun and running jobs in a thread-safe way
            with self._lock:
                needrun = list(self.open_jobs)
                running = list(self.running)
            # free the event
            self._open_jobs.clear()

            # handle errors: submit nothing new, return once the snapshot of
            # running jobs is empty
            if not self.keepgoing and self._errors:
                logger.info("Will exit after finishing "
                            "currently running jobs.")
                if not running:
                    self._executor.shutdown()
                    logger.error(_ERROR_MSG_FINAL)
                    return False
                continue
            # normal shutdown because all jobs have been finished
            if not needrun and not running:
                self._executor.shutdown()
                if self._errors:
                    logger.error(_ERROR_MSG_FINAL)
                return not self._errors

            # continue if no new job needs to be executed
            if not needrun:
                continue

            logger.debug("Resources before job selection: {}".format(
                self.resources))
            logger.debug("Ready jobs ({}):\n\t".format(len(needrun)) +
                         "\n\t".join(map(str, needrun)))

            # select jobs by solving knapsack problem
            run = self.job_selector(needrun)
            logger.debug("Selected jobs ({}):\n\t".format(len(run)) +
                         "\n\t".join(map(str, run)))
            # update running jobs
            with self._lock:
                self.running.update(run)
            logger.debug(
                "Resources after job selection: {}".format(self.resources))
            # actually run jobs
            for job in run:
                self.run(job)
    except (KeyboardInterrupt, SystemExit):
        logger.info("Terminating processes on user request.")
        self._executor.cancel()
        # snapshot the running jobs under the lock, then clean each one up
        with self._lock:
            running = list(self.running)
        for job in running:
            job.cleanup()
        return False
def get_dna_paths(url, ext='', proxy=None, params={}):
    """List the DNA fasta links found on an HTML directory listing.

    Adapted from
    https://stackoverflow.com/questions/11023530/python-to-list-http-files-and-directories

    Args:
        url: address of the directory listing to query.
        ext: only links ending with this suffix are considered.
        proxy: proxy mapping, or a single proxy string that is then used
            for ftp, http and https alike.
        params: query parameters handed to ``requests.get`` (never modified
            here).

    Returns:
        A list of urls, chosen in this order of preference:
        1) the primary assembly, 2) the toplevel assembly,
        3) all chromosome / nonchromosomal files (possibly empty, in which
        case an error is logged).
        If the request itself fails, an error is logged and the empty
        string ``''`` is returned.
    """
    if isinstance(proxy, str):
        proxy = {'ftp': proxy, 'http': proxy, 'https': proxy}
    response = requests.get(url, proxies=proxy, params=params)
    if not response.ok:
        logger.error("query failed: {}".format(response.status_code))
        return ''

    soup = BeautifulSoup(response.text, 'html.parser')
    # <a> tags without an href attribute give None from .get(); skip them
    # instead of failing on None.endswith().
    hrefs = (node.get('href') for node in soup.find_all('a'))
    parent = [
        os.path.join(url, href) for href in hrefs
        if href is not None and href.endswith(ext)
    ]

    # 1) return primary assembly if present, 2) else toplevel if present
    for suffix in ('primary_assembly.fa.gz', 'dna.toplevel.fa.gz'):
        for p in parent:
            if p.endswith(suffix):
                return [p]

    # 3) else try to fetch individual chromosomes
    chrom = [
        p for p in parent
        if ('dna.chromosome' in p) or ('dna.nonchromosomal' in p)
    ]
    if not chrom:
        logger.error("failed to identify any valid dna in: {}".format(
            str(parent)))
    return chrom
def _get_bucket(self):
    """Get a connection to the storage bucket and store it on self.bucket.

    The bucket name is the first path component of
    ``self.workflow.default_remote_prefix``; the remainder is stored as
    ``self.gs_subdir`` and the log directory below it as ``self.gs_logs``.

    If the bucket does not exist yet it is created. Any failure to get or
    create the bucket (e.g. the name is taken or otherwise invalid) is
    logged and the exception is re-raised.
    """
    import google

    # Hold path to requested subdirectory and main bucket
    prefix = self.workflow.default_remote_prefix
    bucket_name = prefix.split("/")[0]
    # escape the name: it is used as a literal prefix, not as a pattern
    self.gs_subdir = re.sub("^{}/".format(re.escape(bucket_name)), "", prefix)
    self.gs_logs = os.path.join(self.gs_subdir, "google-lifesciences-logs")

    try:
        # Case 1: The bucket already exists
        try:
            self.bucket = self._bucket_service.get_bucket(bucket_name)

        # Case 2: The bucket needs to be created
        except google.cloud.exceptions.NotFound:
            self.bucket = self._bucket_service.create_bucket(bucket_name)

    # Case 3: The bucket name is already taken (or another failure).
    # The outer try also covers errors raised while creating the bucket.
    except Exception as ex:
        # Arbitrary exceptions have no returncode/output attributes, so
        # report the exception itself.
        logger.error("Cannot get or create {}:\n{}".format(bucket_name, ex))
        log_verbose_traceback(ex)
        raise

    logger.debug("bucket=%s" % self.bucket.name)
    logger.debug("subdir=%s" % self.gs_subdir)
    logger.debug("logs=%s" % self.gs_logs)
def color_parser(color: str, color_dicts: list = None) -> tuple:
    """
    convert a string with RGB/matplotlib named colors to matplotlib HSV tuples.

    A string with exactly two commas is treated as an RGB triplet: its
    values are converted to float and handed to rgb_to_hsv unchanged.
    Any other string is looked up as a named color in `color_dicts`
    (DEFAULT_COLOR_DICTS when none are given); hex values are converted to
    RGB first. An unknown color is logged and terminates the process.

    supported matplotlib colors can be found here:
    https://matplotlib.org/3.3.1/gallery/color/named_colors.html
    """
    # input: RGB
    if color.count(",") == 2:
        rgb = list(map(float, color.split(",")))
        return rgb_to_hsv(rgb)

    # input: matplotlib colors
    lookup_tables = color_dicts or DEFAULT_COLOR_DICTS
    for table in lookup_tables:
        if color not in table:
            continue
        found = table[color]
        # tableau, css4 and xkcd return hex colors.
        if str(found).startswith("#"):
            found = hex_to_rgb(found)
        return rgb_to_hsv(found)

    logger.error(f"Color not recognized: {color}")
    os._exit(1)  # noqa
def schedule(self):
    """ Schedule jobs that are ready, maximizing cpu usage.

    Loops until there is nothing left to run or the run is aborted.

    Returns:
        ``not self._errors`` once no job is open or running;
        False when stopping because of errors (and ``keepgoing`` is off)
        or after a KeyboardInterrupt/SystemExit.
    """
    try:
        while True:
            # work around so that the wait does not prevent keyboard interrupts
            # (wait with a 1s timeout in a loop instead of blocking forever)
            while not self._open_jobs.wait(1):
                pass

            # obtain needrun and running jobs in a thread-safe way
            with self._lock:
                needrun = list(self.open_jobs)
                running = list(self.running)
            # free the event
            self._open_jobs.clear()

            # handle errors: submit nothing new, return once the snapshot of
            # running jobs is empty
            if not self.keepgoing and self._errors:
                logger.info("Will exit after finishing "
                            "currently running jobs.")
                if not running:
                    self._executor.shutdown()
                    logger.error(_ERROR_MSG_FINAL)
                    return False
                continue
            # normal shutdown because all jobs have been finished
            if not needrun and not running:
                self._executor.shutdown()
                if self._errors:
                    logger.error(_ERROR_MSG_FINAL)
                return not self._errors

            # continue if no new job needs to be executed
            if not needrun:
                continue

            logger.debug("Resources before job selection: {}".format(
                self.resources))
            logger.debug("Ready jobs ({}):\n\t".format(len(needrun)) +
                         "\n\t".join(map(str, needrun)))

            # select jobs by solving knapsack problem
            run = self.job_selector(needrun)
            logger.debug("Selected jobs ({}):\n\t".format(len(run)) +
                         "\n\t".join(map(str, run)))
            # update running jobs
            with self._lock:
                self.running.update(run)
            logger.debug("Resources after job selection: {}".format(
                self.resources))
            # actually run jobs
            for job in run:
                self.run(job)
    except (KeyboardInterrupt, SystemExit):
        logger.info("Terminating processes on user request.")
        self._executor.cancel()
        # snapshot the running jobs under the lock, then clean each one up
        with self._lock:
            running = list(self.running)
        for job in running:
            job.cleanup()
        return False
def get_release_ensemblgenomes(release=None):
    """returns version of ensemblgenomes matching ensembl release

    Args:
        release: ensembl release forwarded to the REST query.

    Returns:
        The version as a string, or None (after logging an error) when
        the query or the parsing of its answer fails.
    """
    try:
        data = _ens_rest_query(ext='/info/eg_version?', release=release)
        return str(int(data['version']))
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        logger.error("release parse error or protocol error at {}".format(
            "ensemblgenomes release query"))
        return None
def construct_mapping(self, node, deep=False):
    """Construct a mapping from a YAML node, refusing duplicate keys.

    Keys are compared after lower-casing. On the first duplicate an error
    is logged and the process exits with code 1; otherwise construction is
    delegated to the parent class.
    """
    seen = set()  # set membership keeps the duplicate check O(1) per key
    for key_node, _ in node.value:
        key = self.construct_object(key_node, deep=deep).lower()
        if key in seen:
            logger.error(
                f"Duplicate key found in the config.yaml: {key}\n")
            os._exit(1)  # noqa
        seen.add(key)
    return super().construct_mapping(node, deep)
def get_release_current_ensembl():
    """returns current version of ensembl

    Returns:
        The first entry of the 'releases' field of the REST answer as a
        string, or None (after logging an error) when the query or the
        parsing of its answer fails.
    """
    try:
        data = _ens_rest_query(ext='/info/data?')
        return str(int(data['releases'][0]))
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit still propagate.
        logger.error("release parse error or protocol error at {}".format(
            "ensembl release query"))
        return None
def __init__(self):
    """Build the configuration, or copy it from the existing instance.

    When ``self.instance`` is already set, its attributes are copied and
    nothing else happens. Otherwise the command line is parsed, the config
    file and snakefile are located, the defaults are loaded and merged
    with the first YAML document of the config file, and the new object
    is stored as ``Config.instance``.

    Raises:
        IOError: if the config file cannot be opened.
    """
    # check if it is already initialized
    if self.instance is not None:
        self.config = self.instance.config
        self.conf_dict = self.instance.conf_dict
        self.args = self.instance.args
        self.path = self.instance.path
        self.snakefile = self.instance.snakefile
        self.snakeroot = self.instance.snakeroot
        return

    # we dont need the first argument aka call to snakemake
    self.sysargs = sys.argv[1:]
    parser = get_argument_parser()
    self.args = parser.parse_args(self.sysargs)
    self.path = self.args.configfile
    self.snakefile = self.args.snakefile
    self.config = parse_config(self.args)

    if self.path is None:
        # no config given: take the first default name that exists
        for p in ["wbuild.yaml", "config.yaml", "wBuild.yaml"]:
            if os.path.exists(p):
                self.path = p
                break
    else:
        if isinstance(self.path, list):
            self.path = self.path[0]
        self.path = os.path.abspath(self.path)

    # this is taken from the snakemake main file
    if self.snakefile is None:
        for p in SNAKEFILE_CHOICES:
            if os.path.exists(p):
                self.snakefile = p
                break
    self.snakeroot = os.path.dirname(self.snakefile)

    # load defaults
    self.loadDefaultConfiguration()

    try:
        fh = open(self.path, "r")
    except IOError:
        raise IOError("Can not read config. Are you sure you have enough "
                      "rights and config path (wbuild.yaml) is right?")
    # close the handle once the first document has been read; an empty
    # file yields no document at all, which is treated like an empty one
    with fh:
        configDict = next(yaml.safe_load_all(fh), None)

    if configDict is None:
        logger.error(
            "Error parsing wbuild.yaml - format is wrong. Working with defaults..."
        )
    else:
        self.conf_dict = merge_two_dicts(self.conf_dict, configDict)

    # fill Singleton
    Config.instance = self
def log_error(self, msg=None, indent=False, **kwargs):
    """Report this job as failed through the logger.

    Sends a job_error record (rule name, job id, output files, log files,
    conda environment path and the extra keyword arguments as `aux`) and,
    if `msg` is given, logs it as an additional error message.
    """
    details = dict(
        name=self.rule.name,
        jobid=self.dag.jobid(self),
        output=list(format_files(self, self.output, self.dynamic_output)),
        log=list(self.log),
        conda_env=self.conda_env.path if self.conda_env else None,
        aux=kwargs,
        indent=indent,
    )
    logger.job_error(**details)

    if msg is None:
        return
    logger.error(msg)
def shellcmd(img_path,
             cmd,
             args="",
             envvars=None,
             shell_executable=None,
             container_workdir=None):
    """Execute shell command inside a docker (or occam) container.

    Builds and returns the command line that runs `cmd` inside the image
    `img_path`, after changing into the current working directory.

    Args:
        img_path: image to run.
        cmd: shell command to execute inside the container.
        args: extra arguments for the container runner.
        envvars: accepted for interface compatibility; not used here.
        shell_executable: shell to use; only its file name is kept because
            its location inside the container is unknown. Defaults to "sh".
        container_workdir: not supported; an error is logged if given.

    Returns:
        The complete command string. When the environment variable RUN_ENV
        equals 'occam' an ``occam-run`` command is built, otherwise a
        ``docker run`` command.
    """
    if container_workdir is not None:
        logger.error(
            '--use-docker do not support shadow dir, TO BE IMPLEMENTED')

    if shell_executable is None:
        shell_executable = "sh"
    else:
        # Ensure to just use the name of the executable, not a path,
        # because we cannot be sure where it is located in the container.
        shell_executable = os.path.split(shell_executable)[-1]

    # TODO add uid
    wd = os.getcwd()  # this won't work for subworkflows? TODO
    # debug-level log instead of printing to stdout
    logger.debug("container working directory: {}".format(wd))

    # Both runners get the same payload: cd into wd, then cmd, with single
    # quotes escaped because the payload is wrapped in '...' below.
    quoted_cmd = ("cd {};".format(wd) + cmd).replace("'", r"'\''")

    # occam is selected through an env variable to avoid code duplication
    if os.environ.get('RUN_ENV') == 'occam':
        # workaround for configargparse problem with --docker-args "-v ..."
        # with -v as a snakemake argument
        args_v = re.sub('--volume', '-v', args)
        cmd = "occam-run {} {} {} -c '{}'".format(args_v, img_path,
                                                    shell_executable,
                                                    quoted_cmd)
    else:
        # TODO try to remove and see what breaks
        args += " -v {}:{}".format(SNAKEMAKE_SEARCHPATH,
                                   SNAKEMAKE_MOUNTPOINT)
        # TODO we need to mount current dir or do we leave it to the user?
        # user for now, need to mount whole bioinfo_root/equivalent
        args += " --user {}:{} ".format(os.getuid(), os.getgid())
        cmd = "docker run {} {} {} -c '{}'".format(args, img_path,
                                                   shell_executable,
                                                   quoted_cmd)
    logger.debug(cmd)
    return cmd
def wrap(*args, **kwargs):
    """Call func, retrying once after a FileNotFoundError.

    Each failed attempt is followed by a one second pause. If both
    attempts fail, an error is logged and the process exits with code 1.
    """
    # we get two tries, in case parallel executions are interfering with one another
    attempts_left = 2
    while attempts_left:
        attempts_left -= 1
        try:
            return func(*args, **kwargs)
        except FileNotFoundError:
            time.sleep(1)

    logger.error(
        "There were some problems with locking the seq2science cache. Please try again in a bit."
    )
    os._exit(1)  # noqa
def _convert_units_to_mb(memory): """If memory is specified with SI unit, convert to MB""" if isinstance(memory, int) or isinstance(memory, float): return int(memory) siunits = {"K": 1e-3, "M": 1, "G": 1e3, "T": 1e6} regex = re.compile(r"(\d+)({})$".format("|".join(siunits.keys()))) m = regex.match(memory) if m is None: logger.error((f"unsupported memory specification '{memory}';" " allowed suffixes: [K|M|G|T]")) sys.exit(1) factor = siunits[m.group(2)] return int(int(m.group(1)) * factor)
def get_refseq_assembly_summary(url, release, proxy=None, protocol='https://'):
    """Download a tab separated summary and parse it into a DataFrame.

    Args:
        url: address of the summary, without protocol.
        release: not used by this function.
        proxy: proxy mapping, or a single proxy string that is then used
            for ftp, http and https alike.
        protocol: prefix put in front of `url`.

    Returns:
        The table parsed by ``pd.read_csv`` (tab separated, first row
        skipped).

    Raises:
        Whatever ``pd.read_csv`` raises when the content cannot be parsed;
        the failure is logged before it is re-raised.
    """
    if isinstance(proxy, str):
        proxy = {'ftp': proxy, 'http': proxy, 'https': proxy}
    if proxy:
        proxy_support = request.ProxyHandler(proxy)
        request.install_opener(request.build_opener(proxy_support))
    logger.info('fetching {} ...'.format(protocol + url))
    with closing(request.urlopen(protocol + url)) as response:
        out = response.read().decode('utf-8')
    try:
        return pd.read_csv(io.StringIO(out), sep="\t", skiprows=1)
    except Exception:
        # Re-raise the real parse error; there is no table to return.
        logger.error('failed to parse url into df {}'.format(url))
        raise
def get_file_hash(filename, algorithm="sha256"):
    """Return the hex digest of a file's content (sha256 by default).

    We use this so that the user can choose to cache working directories
    in storage. The file is read in chunks of 4096 bytes. A ValueError from
    hashlib for an unavailable algorithm is logged and re-raised.
    """
    from snakemake.logging import logger

    # The algorithm must be available
    try:
        digest = hashlib.new(algorithm)
    except ValueError as ex:
        logger.error("%s is not an available algorithm." % algorithm)
        raise ex

    with open(filename, "rb") as handle:
        while True:
            block = handle.read(4096)
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
def run(self, job, callback=None, submit_callback=None, error_callback=None):
    """Submit a job through the configured submit command.

    Spawns the jobscript, formats the submit command (passing the external
    ids of jobs this one depends on) and runs it in a shell. If the
    submission fails, the error is logged and `error_callback` is called.
    Otherwise the first output line, when non-empty, is recorded as the
    external job id for all output files, `submit_callback` is called and
    the job is added to the active jobs.
    """
    super()._run(job)
    workdir = os.getcwd()
    jobid = self.dag.jobid(job)

    script = self.get_jobscript(job)
    finished_marker = os.path.join(self.tmpdir,
                                   "{}.jobfinished".format(jobid))
    failed_marker = os.path.join(self.tmpdir, "{}.jobfailed".format(jobid))
    self.spawn_jobscript(job,
                         script,
                         jobfinished=finished_marker,
                         jobfailed=failed_marker)

    # external ids of the jobs that produced our input files
    known_ids = self.external_jobid
    deps = " ".join(known_ids[f] for f in job.input if f in known_ids)
    try:
        submitcmd = job.format_wildcards(
            self.submitcmd,
            dependencies=deps,
            cluster=self.cluster_wildcards(job))
    except AttributeError as e:
        raise WorkflowError(str(e), rule=job.rule)

    command = '{submitcmd} "{jobscript}"'.format(submitcmd=submitcmd,
                                                 jobscript=script)
    try:
        raw_output = subprocess.check_output(command, shell=True)
        ext_jobid = raw_output.decode().split("\n")
    except subprocess.CalledProcessError as ex:
        logger.error("Error submitting jobscript (exit code {}):\n{}".format(
            ex.returncode, ex.output.decode()))
        error_callback(job)
        return

    if ext_jobid and ext_jobid[0]:
        ext_jobid = ext_jobid[0]
        self.external_jobid.update((f, ext_jobid) for f in job.output)
        logger.debug("Submitted job {} with external jobid {}.".format(
            jobid, ext_jobid))

    submit_callback(job)
    active_job = None
    with self.lock:
        active_job = GenericClusterJob(job, callback, error_callback,
                                       script, finished_marker,
                                       failed_marker)
        self.active_jobs.append(active_job)
def assert_versions(rules_dir):
    """
    For each package, check that the installed version matches the required version

    Packages for which no current version is reported are skipped. Every
    mismatch is logged; if there was at least one, the process exits with
    code 1.
    """
    versions = _get_yaml_versions(_get_yaml_file(rules_dir))

    mismatches = 0
    for package, required_version in versions.items():
        current_version = _get_current_version(package)
        if current_version is None:
            continue
        if current_version.startswith(required_version):
            continue
        logger.error(
            f"Seq2science requires {package.capitalize()} version {required_version}, "
            f"found version {current_version}.")
        mismatches += 1

    if mismatches:
        logger.error("Please create a new conda environment.\n")
        os._exit(1)  # noqa
def parseYamlParams(header, f):
    """
    Parse the first YAML document of a header.

    :param header: String form of YAML header
    :param f: Filename of a file from where the header was parsed
    :return: Parameters dictionary parsed from the header; None if parsing errors occured
    """
    try:
        param = next(yaml.safe_load_all(header))
    except yaml.error.YAMLError as e:
        # YAMLError is the base class of ScannerError, ParserError and
        # MarkedYAMLError, so one clause covers them all.
        if hasattr(e, 'problem_mark'):
            details = str(e.problem_mark) + '\n ' + str(e.problem)
            if e.context is not None:
                details += ' ' + str(e.context)
            logger.error('Error while parsing YAML area in the file ' + f +
                         ':\n' + details +
                         '\nPlease correct the header and retry.')
        else:
            logger.error("YAMLError parsing yaml file.")
        return None
    except Exception as e:
        # anything else (e.g. no document at all): report and give up
        print(
            bcolors.FAIL + bcolors.BOLD + 'Could not parse', f,
            '. Include valid yaml header. Not showing any further errors. \n',
            'Errors {0}'.format(e) + bcolors.ENDC)
        return None

    logger.debug("Parsed params: " + str(param) + "\n.")
    return param
def samples2metadata_local(samples: List[str], config: dict, logger) -> dict:
    """
    (try to) get the metadata of local samples

    The fastq directory is globbed for gzipped files that start with the
    sample name and contain the fastq suffix. One hit means single-end; two
    hits, one containing fqext1 and one containing fqext2, mean paired-end.
    Samples without a usable local match are left out when they carry a
    known accession prefix; otherwise an error is logged and the process
    exits with code 1.

    Returns a dict mapping each locally found sample to {"layout": ...}.
    """
    accession_prefixes = ("GSM", "DRX", "ERX", "SRX", "DRR", "ERR", "SRR")
    metadata = {}

    for sample in samples:
        pattern = os.path.join(config["fastq_dir"],
                               f'{sample}*{config["fqsuffix"]}*.gz')
        local_fastqs = glob.glob(pattern)
        n_found = len(local_fastqs)

        if n_found == 1:
            metadata[sample] = {"layout": "SINGLE"}
            continue

        if n_found == 2:
            names = [os.path.basename(f) for f in local_fastqs]
            has_r1 = any([config["fqext1"] in name for name in names])
            if has_r1 and any([config["fqext2"] in name for name in names]):
                metadata[sample] = {"layout": "PAIRED"}
                continue

        if sample.startswith(accession_prefixes):
            continue

        extend_msg = ""
        if n_found > 2:
            extend_msg = (
                f"We found too many files matching ({len(local_fastqs)}) "
                "and could not distinguish them:\n" +
                ", ".join([os.path.basename(f) for f in local_fastqs]) +
                ".\n")
        logger.error(
            f"\nsample {sample} was not found..\n"
            f"We checked directory '{config['fastq_dir']}' "
            f"for gzipped files starting with '{sample}' and containing '{config['fqsuffix']}'.\n"
            + extend_msg +
            f"Since the sample did not start with either GSM, SRX, SRR, ERR, and DRR we "
            f"couldn't find it online..\n")
        os._exit(1)  # noqa

    return metadata
def get_current_refseq_from_www(proxy={}, protocol='https://'):
    """Fetch the current RefSeq release number from the NCBI server.

    Args:
        proxy: proxy mapping, or a single proxy string that is then used
            for ftp, http and https alike (never modified here).
        protocol: prefix put in front of the RELEASE_NUMBER address.

    Returns:
        The first run of digits in the downloaded content, as a string.

    Raises:
        The underlying exception if reading or parsing fails; the failure
        is logged before it is re-raised.
    """
    url = 'ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER'
    if isinstance(proxy, str):
        proxy = {'ftp': proxy, 'http': proxy, 'https': proxy}
    if proxy:
        proxy_support = request.ProxyHandler(proxy)
        request.install_opener(request.build_opener(proxy_support))
    with closing(request.urlopen(protocol + url)) as response:
        try:
            logger.info('fetching {} ...'.format(url))
            out = response.read()
            release = str(int(re.findall(rb"\d+", out)[0]))
        except Exception:
            # logger.error() returns None and cannot be raised itself:
            # log first, then let the real error propagate.
            logger.error("release parse error or protocol error")
            raise
    return release
def print_exception(ex, linemaps):
    """
    Print an error message for a given exception.

    Syntax errors, and exceptions whose origin can be located through
    `linemaps`, are reported with their line number; the remaining known
    exception types get a type specific report and anything else is
    printed as a plain traceback.

    Arguments
    ex -- the exception
    linemaps -- a dict of a dict that maps for each snakefile
        the compiled lines to source code lines in the snakefile.
    """
    log_verbose_traceback(ex)

    def report(exc, lineno, **options):
        # format one exception and send it to the error log
        logger.error(format_error(exc, lineno, **options))

    if isinstance(ex, (SyntaxError, IndentationError)):
        report(ex,
               ex.lineno,
               linemaps=linemaps,
               snakefile=ex.filename,
               show_traceback=True)
        return

    origin = get_exception_origin(ex, linemaps)
    if origin is not None:
        lineno, file = origin
        report(ex,
               lineno,
               linemaps=linemaps,
               snakefile=file,
               show_traceback=True)
        return

    if isinstance(ex, TokenError):
        report(ex, None, show_traceback=False)
    elif isinstance(ex, MissingRuleException):
        report(ex,
               None,
               linemaps=linemaps,
               snakefile=ex.filename,
               show_traceback=False)
    elif isinstance(ex, RuleException):
        # report the included exceptions first, then the exception itself
        for e in ex._include + [ex]:
            if e.omit:
                continue
            report(e,
                   e.lineno,
                   linemaps=linemaps,
                   snakefile=e.filename,
                   show_traceback=True)
    elif isinstance(ex, WorkflowError):
        report(ex,
               ex.lineno,
               linemaps=linemaps,
               snakefile=ex.snakefile,
               show_traceback=True)
    elif isinstance(ex, KeyboardInterrupt):
        logger.info("Cancelling snakemake on user request.")
    else:
        traceback.print_exception(type(ex), ex, ex.__traceback__)
def execute(
        self,
        targets=None,
        dryrun=False,
        touch=False,
        local_cores=1,
        forcetargets=False,
        forceall=False,
        forcerun=None,
        until=[],
        omit_from=[],
        prioritytargets=None,
        quiet=False,
        keepgoing=False,
        printshellcmds=False,
        printreason=False,
        printdag=False,
        cluster=None,
        cluster_sync=None,
        jobname=None,
        immediate_submit=False,
        ignore_ambiguity=False,
        printrulegraph=False,
        printfilegraph=False,
        printd3dag=False,
        drmaa=None,
        drmaa_log_dir=None,
        kubernetes=None,
        tibanna=None,
        tibanna_sfn=None,
        precommand="",
        tibanna_config=False,
        container_image=None,
        stats=None,
        force_incomplete=False,
        ignore_incomplete=False,
        list_version_changes=False,
        list_code_changes=False,
        list_input_changes=False,
        list_params_changes=False,
        list_untracked=False,
        list_conda_envs=False,
        summary=False,
        archive=None,
        delete_all_output=False,
        delete_temp_output=False,
        detailed_summary=False,
        latency_wait=3,
        wait_for_files=None,
        nolock=False,
        unlock=False,
        notemp=False,
        nodeps=False,
        cleanup_metadata=None,
        conda_cleanup_envs=False,
        cleanup_shadow=False,
        cleanup_scripts=True,
        subsnakemake=None,
        updated_files=None,
        keep_target_files=False,
        keep_shadow=False,
        keep_remote_local=False,
        allowed_rules=None,
        max_jobs_per_second=None,
        max_status_checks_per_second=None,
        greediness=1.0,
        no_hooks=False,
        force_use_threads=False,
        conda_create_envs_only=False,
        assume_shared_fs=True,
        cluster_status=None,
        report=None,
        report_stylesheet=None,
        export_cwl=False,
        batch=None,
        keepincomplete=False,
):
    """Build the DAG for the requested targets and execute the workflow.

    Depending on the flags this either performs one of the informational
    or maintenance actions (cleanup, unlock, printing the DAG, summaries,
    change listings, conda handling, ...) and returns, or hands the DAG to
    a JobScheduler and runs the jobs.

    `updated_files`, when given, is extended in place with the output
    files of all jobs that need to run.

    Returns:
        True on success (including the informational actions), False on
        failure (waiting for files, unlocking, locking, a failed
        subworkflow, missing input with `nodeps`, or failed scheduling).
    """
    self.check_localrules()
    self.immediate_submit = immediate_submit
    self.cleanup_scripts = cleanup_scripts

    def rules(items):
        return map(self._rules.__getitem__, filter(self.is_rule, items))

    if keep_target_files:

        def files(items):
            return filterfalse(self.is_rule, items)

    else:

        def files(items):
            relpath = lambda f: f if os.path.isabs(f) else os.path.relpath(
                f)
            return map(relpath, filterfalse(self.is_rule, items))

    if not targets:
        targets = [self.first_rule
                   ] if self.first_rule is not None else list()

    if prioritytargets is None:
        prioritytargets = list()
    if forcerun is None:
        forcerun = list()
    if until is None:
        until = list()
    if omit_from is None:
        omit_from = list()

    priorityrules = set(rules(prioritytargets))
    priorityfiles = set(files(prioritytargets))
    forcerules = set(rules(forcerun))
    forcefiles = set(files(forcerun))
    untilrules = set(rules(until))
    untilfiles = set(files(until))
    omitrules = set(rules(omit_from))
    omitfiles = set(files(omit_from))
    targetrules = set(
        chain(
            rules(targets),
            filterfalse(Rule.has_wildcards, priorityrules),
            filterfalse(Rule.has_wildcards, forcerules),
            filterfalse(Rule.has_wildcards, untilrules),
        ))
    targetfiles = set(
        chain(files(targets), priorityfiles, forcefiles, untilfiles))
    if forcetargets:
        forcefiles.update(targetfiles)
        forcerules.update(targetrules)

    # from here on `rules` is the list of rule objects, not the helper
    rules = self.rules
    if allowed_rules:
        rules = [rule for rule in rules if rule.name in set(allowed_rules)]

    if wait_for_files is not None:
        try:
            snakemake.io.wait_for_files(wait_for_files,
                                        latency_wait=latency_wait)
        except IOError as e:
            logger.error(str(e))
            return False

    dag = DAG(
        self,
        rules,
        dryrun=dryrun,
        targetfiles=targetfiles,
        targetrules=targetrules,
        # when cleaning up conda, we should enforce all possible jobs
        # since their envs shall not be deleted
        forceall=forceall or conda_cleanup_envs,
        forcefiles=forcefiles,
        forcerules=forcerules,
        priorityfiles=priorityfiles,
        priorityrules=priorityrules,
        untilfiles=untilfiles,
        untilrules=untilrules,
        omitfiles=omitfiles,
        omitrules=omitrules,
        ignore_ambiguity=ignore_ambiguity,
        force_incomplete=force_incomplete,
        ignore_incomplete=ignore_incomplete or printdag or printrulegraph
        or printfilegraph,
        notemp=notemp,
        keep_remote_local=keep_remote_local,
        batch=batch,
    )

    self.persistence = Persistence(
        nolock=nolock,
        dag=dag,
        conda_prefix=self.conda_prefix,
        singularity_prefix=self.singularity_prefix,
        shadow_prefix=self.shadow_prefix,
        warn_only=dryrun or printrulegraph or printfilegraph or printdag
        or summary or archive or list_version_changes or list_code_changes
        or list_input_changes or list_params_changes or list_untracked
        or delete_all_output or delete_temp_output,
    )

    if cleanup_metadata:
        for f in cleanup_metadata:
            self.persistence.cleanup_metadata(f)
        return True

    logger.info("Building DAG of jobs...")
    dag.init()
    dag.update_checkpoint_dependencies()
    # check incomplete has to run BEFORE any call to postprocess
    dag.check_incomplete()
    dag.check_dynamic()

    if unlock:
        try:
            self.persistence.cleanup_locks()
            logger.info("Unlocking working directory.")
            return True
        except IOError:
            # fill the placeholder with the directory, as the lock error
            # below does
            logger.error("Error: Unlocking the directory {} failed. Maybe "
                         "you don't have the permissions?".format(
                             os.getcwd()))
            return False
    try:
        self.persistence.lock()
    except IOError:
        logger.error(
            "Error: Directory cannot be locked. Please make "
            "sure that no other Snakemake process is trying to create "
            "the same files in the following directory:\n{}\n"
            "If you are sure that no other "
            "instances of snakemake are running on this directory, "
            "the remaining lock was likely caused by a kill signal or "
            "a power loss. It can be removed with "
            "the --unlock argument.".format(os.getcwd()))
        return False

    if cleanup_shadow:
        self.persistence.cleanup_shadow()
        return True

    if (self.subworkflows and not printdag and not printrulegraph
            and not printfilegraph):
        # backup globals
        globals_backup = dict(self.globals)
        # execute subworkflows
        for subworkflow in self.subworkflows:
            subworkflow_targets = subworkflow.targets(dag)
            logger.debug(
                "Files requested from subworkflow:\n {}".format(
                    "\n ".join(subworkflow_targets)))
            updated = list()
            if subworkflow_targets:
                logger.info("Executing subworkflow {}.".format(
                    subworkflow.name))
                if not subsnakemake(
                        subworkflow.snakefile,
                        workdir=subworkflow.workdir,
                        targets=subworkflow_targets,
                        configfiles=[subworkflow.configfile]
                        if subworkflow.configfile else None,
                        updated_files=updated,
                ):
                    return False
                dag.updated_subworkflow_files.update(
                    subworkflow.target(f) for f in updated)
            else:
                logger.info("Subworkflow {}: Nothing to be done.".format(
                    subworkflow.name))
        if self.subworkflows:
            logger.info("Executing main workflow.")
        # rescue globals
        self.globals.update(globals_backup)

    dag.postprocess()
    # deactivate IOCache such that from now on we always get updated
    # size, existence and mtime information
    # ATTENTION: this may never be removed without really good reason.
    # Otherwise weird things may happen.
    self.iocache.deactivate()
    # clear and deactivate persistence cache, from now on we want to see updates
    self.persistence.deactivate_cache()

    if nodeps:
        missing_input = [
            f for job in dag.targetjobs for f in job.input
            if dag.needrun(job) and not os.path.exists(f)
        ]
        if missing_input:
            logger.error(
                "Dependency resolution disabled (--nodeps) "
                "but missing input "
                "files detected. If this happens on a cluster, please make sure "
                "that you handle the dependencies yourself or turn off "
                "--immediate-submit. Missing input files:\n{}".format(
                    "\n".join(missing_input)))
            return False

    # updated_files defaults to None; only report back when the caller
    # actually handed in a list
    if updated_files is not None:
        updated_files.extend(f for job in dag.needrun_jobs
                             for f in job.output)

    if export_cwl:
        from snakemake.cwl import dag_to_cwl
        import json

        with open(export_cwl, "w") as cwl:
            json.dump(dag_to_cwl(dag), cwl, indent=4)
        return True
    elif report:
        from snakemake.report import auto_report

        auto_report(dag, report, stylesheet=report_stylesheet)
        return True
    elif printd3dag:
        dag.d3dag()
        return True
    elif printdag:
        print(dag)
        return True
    elif printrulegraph:
        print(dag.rule_dot())
        return True
    elif printfilegraph:
        print(dag.filegraph_dot())
        return True
    elif summary:
        print("\n".join(dag.summary(detailed=False)))
        return True
    elif detailed_summary:
        print("\n".join(dag.summary(detailed=True)))
        return True
    elif archive:
        dag.archive(archive)
        return True
    elif delete_all_output:
        dag.clean(only_temp=False, dryrun=dryrun)
        return True
    elif delete_temp_output:
        dag.clean(only_temp=True, dryrun=dryrun)
        return True
    elif list_version_changes:
        items = list(
            chain(*map(self.persistence.version_changed, dag.jobs)))
        if items:
            print(*items, sep="\n")
        return True
    elif list_code_changes:
        items = list(chain(*map(self.persistence.code_changed, dag.jobs)))
        for j in dag.jobs:
            items.extend(list(j.outputs_older_than_script_or_notebook()))
        if items:
            print(*items, sep="\n")
        return True
    elif list_input_changes:
        items = list(chain(*map(self.persistence.input_changed, dag.jobs)))
        if items:
            print(*items, sep="\n")
        return True
    elif list_params_changes:
        items = list(
            chain(*map(self.persistence.params_changed, dag.jobs)))
        if items:
            print(*items, sep="\n")
        return True
    elif list_untracked:
        dag.list_untracked()
        return True

    if self.use_singularity:
        if assume_shared_fs:
            dag.pull_container_imgs(dryrun=dryrun or list_conda_envs,
                                    quiet=list_conda_envs)
    if self.use_conda:
        if assume_shared_fs:
            dag.create_conda_envs(
                dryrun=dryrun or list_conda_envs or conda_cleanup_envs,
                quiet=list_conda_envs,
            )
        if conda_create_envs_only:
            return True

    if list_conda_envs:
        print("environment", "container", "location", sep="\t")
        for env in set(job.conda_env for job in dag.jobs):
            if env:
                print(
                    simplify_path(env.file),
                    env.container_img_url or "",
                    simplify_path(env.path),
                    sep="\t",
                )
        return True

    if conda_cleanup_envs:
        self.persistence.conda_cleanup_envs()
        return True

    # NOTE(review): cluster_config is not a parameter of this method;
    # presumably a module-level name - confirm.
    scheduler = JobScheduler(
        self,
        dag,
        self.cores,
        local_cores=local_cores,
        dryrun=dryrun,
        touch=touch,
        cluster=cluster,
        cluster_status=cluster_status,
        cluster_config=cluster_config,
        cluster_sync=cluster_sync,
        jobname=jobname,
        max_jobs_per_second=max_jobs_per_second,
        max_status_checks_per_second=max_status_checks_per_second,
        quiet=quiet,
        keepgoing=keepgoing,
        drmaa=drmaa,
        drmaa_log_dir=drmaa_log_dir,
        kubernetes=kubernetes,
        tibanna=tibanna,
        tibanna_sfn=tibanna_sfn,
        precommand=precommand,
        tibanna_config=tibanna_config,
        container_image=container_image,
        printreason=printreason,
        printshellcmds=printshellcmds,
        latency_wait=latency_wait,
        greediness=greediness,
        force_use_threads=force_use_threads,
        assume_shared_fs=assume_shared_fs,
        keepincomplete=keepincomplete,
    )

    if not dryrun:
        if len(dag):
            shell_exec = shell.get_executable()
            if shell_exec is not None:
                logger.info("Using shell: {}".format(shell_exec))
            if cluster or cluster_sync or drmaa:
                logger.resources_info("Provided cluster nodes: {}".format(
                    self.nodes))
            else:
                warning = ("" if self.cores > 1 else
                           " (use --cores to define parallelism)")
                logger.resources_info("Provided cores: {}{}".format(
                    self.cores, warning))
                logger.resources_info("Rules claiming more threads "
                                      "will be scaled down.")

            provided_resources = format_resources(self.global_resources)
            if provided_resources:
                logger.resources_info("Provided resources: " +
                                      provided_resources)

            if self.run_local and any(rule.group for rule in self.rules):
                logger.info("Group jobs: inactive (local execution)")

            if not self.use_conda and any(rule.conda_env
                                          for rule in self.rules):
                logger.info("Conda environments: ignored")

            if not self.use_singularity and any(rule.container_img
                                                for rule in self.rules):
                logger.info("Singularity containers: ignored")

            logger.run_info("\n".join(dag.stats()))
        else:
            logger.info("Nothing to be done.")
    else:
        # the dryrun case
        if len(dag):
            logger.run_info("\n".join(dag.stats()))
        else:
            logger.info("Nothing to be done.")
            return True
        if quiet:
            # in case of dryrun and quiet, just print above info and exit
            return True

    if not dryrun and not no_hooks:
        self._onstart(logger.get_logfile())

    success = scheduler.schedule()

    if success:
        if dryrun:
            if len(dag):
                logger.run_info("\n".join(dag.stats()))
            logger.info("This was a dry-run (flag -n). The order of jobs "
                        "does not reflect the order of execution.")
            logger.remove_logfile()
        else:
            if stats:
                scheduler.stats.to_json(stats)
            logger.logfile_hint()
        if not dryrun and not no_hooks:
            self._onsuccess(logger.get_logfile())
        return True
    else:
        if not dryrun and not no_hooks:
            self._onerror(logger.get_logfile())
        logger.logfile_hint()
        return False
def schedule(self):
    """Run the scheduling loop until all jobs are done or the run is aborted.

    Every pass waits on ``self._open_jobs``, snapshots the ready and running
    jobs under ``self._lock`` and then either aborts, shuts the executor
    down, waits for the next wake-up, or starts the next batch of jobs.

    Returns:
        bool: False after an abort (user kill, errors without ``keepgoing``,
        KeyboardInterrupt or SystemExit); otherwise ``not self._errors`` as
        observed when no job was left to run.
    """
    try:
        while True:
            # blocks until the semaphore is released by another party
            self._open_jobs.acquire()

            # take a consistent snapshot of the shared scheduler state
            with self._lock:
                ready = list(self.open_jobs)
                active = list(self.running)
                failed = self._errors
                kill_request = self._user_kill

            # abort handling: user kill, or errors without --keepgoing
            if kill_request or (not self.keepgoing and failed):
                if kill_request == "graceful":
                    logger.info(
                        "Will exit after finishing currently running jobs.")
                if active:
                    # jobs are still running; check again on the next wake-up
                    continue
                logger.info("Shutting down, this might take some time.")
                self._executor.shutdown()
                if not kill_request:
                    logger.error(_ERROR_MSG_FINAL)
                return False

            if not ready:
                # nothing left to start: either finish up or keep waiting
                if not active or self.workflow.immediate_submit:
                    self._executor.shutdown()
                    if failed:
                        logger.error(_ERROR_MSG_FINAL)
                    return not failed
                continue

            # job selection is skipped for dry-runs: every ready job is taken
            if self.dryrun:
                selected = ready
            else:
                logger.debug(
                    "Resources before job selection: {}".format(self.resources))
                logger.debug(
                    "Ready jobs ({}):\n\t".format(len(ready))
                    + "\n\t".join(map(str, ready)))
                selected = self.job_selector(ready)
                logger.debug(
                    "Selected jobs ({}):\n\t".format(len(selected))
                    + "\n\t".join(map(str, selected)))
                logger.debug(
                    "Resources after job selection: {}".format(self.resources))

            # record the batch as running before handing it to the executor
            with self._lock:
                self.running.update(selected)

            for job in selected:
                with self.rate_limiter:
                    self.run(job)
    except (KeyboardInterrupt, SystemExit):
        logger.info(
            "Terminating processes on user request, this might take some time."
        )
        self._executor.cancel()
        return False
def execute(self,
            targets=None,
            dryrun=False,
            touch=False,
            cores=1,
            nodes=1,
            local_cores=1,
            forcetargets=False,
            forceall=False,
            forcerun=None,
            prioritytargets=None,
            quiet=False,
            keepgoing=False,
            printshellcmds=False,
            printreason=False,
            printdag=False,
            cluster=None,
            cluster_config=None,
            cluster_sync=None,
            jobname=None,
            immediate_submit=False,
            ignore_ambiguity=False,
            printrulegraph=False,
            printd3dag=False,
            drmaa=None,
            stats=None,
            force_incomplete=False,
            ignore_incomplete=False,
            list_version_changes=False,
            list_code_changes=False,
            list_input_changes=False,
            list_params_changes=False,
            summary=False,
            detailed_summary=False,
            latency_wait=3,
            benchmark_repeats=3,
            wait_for_files=None,
            nolock=False,
            unlock=False,
            resources=None,
            notemp=False,
            nodeps=False,
            cleanup_metadata=None,
            subsnakemake=None,
            updated_files=None,
            keep_target_files=False,
            allowed_rules=None,
            greediness=1.0,
            no_hooks=False):
    """Build the DAG for the requested targets and run, list or print it.

    Depending on the flags, the method returns early after cleaning
    metadata, unlocking the working directory, or printing the DAG, a
    summary or a list of changed files. Otherwise it creates a
    ``JobScheduler`` and schedules the jobs.

    Args:
        targets: rule names and/or files to create. If empty,
            ``self.first_rule`` is used when it is not None.
        forcerun, prioritytargets: rule names and/or files; None is treated
            as an empty list.
        forcetargets: additionally force all target files and rules.
        allowed_rules: if non-empty, only rules whose name is contained are
            handed to the DAG.
        wait_for_files: if not None, passed to
            ``snakemake.io.wait_for_files`` before the DAG is built.
        resources: dict that becomes ``self.global_resources``; the entries
            ``"_cores"`` and ``"_nodes"`` are set on it in place. None is
            treated as an empty dict.
        subsnakemake: callable used to execute subworkflows; called with the
            subworkflow snakefile and ``workdir``, ``targets`` and
            ``updated_files`` keyword arguments.
        updated_files: optional list that is extended with the output files
            of all jobs that need to run. Ignored when None.
        no_hooks: skip the ``_onsuccess`` / ``_onerror`` hooks.

        All remaining arguments select one of the printing/listing modes or
        are forwarded to ``DAG``, ``Persistence`` and ``JobScheduler``.

    Returns:
        bool: True if the requested action succeeded, False otherwise
        (waiting for files failed, locking/unlocking failed, a subworkflow
        failed, inputs are missing with ``nodeps``, or scheduling failed).
    """
    # Normalise once so that every later use sees a dict. Previously a None
    # default reached format_resources() and the "ignored resources" check.
    if resources is None:
        resources = dict()
    self.global_resources = resources
    self.global_resources["_cores"] = cores
    self.global_resources["_nodes"] = nodes

    def rules_of(items):
        """Map the rule names among *items* to their rule objects."""
        return map(self._rules.__getitem__, filter(self.is_rule, items))

    if keep_target_files:

        def files_of(items):
            """Return the non-rule items unchanged."""
            return filterfalse(self.is_rule, items)
    else:

        def files_of(items):
            """Return the non-rule items as paths relative to the cwd."""
            return map(os.path.relpath, filterfalse(self.is_rule, items))

    if not targets:
        targets = [self.first_rule] if self.first_rule is not None else list()
    if prioritytargets is None:
        prioritytargets = list()
    if forcerun is None:
        forcerun = list()

    priorityrules = set(rules_of(prioritytargets))
    priorityfiles = set(files_of(prioritytargets))
    forcerules = set(rules_of(forcerun))
    forcefiles = set(files_of(forcerun))
    targetrules = set(chain(rules_of(targets),
                            filterfalse(Rule.has_wildcards, priorityrules),
                            filterfalse(Rule.has_wildcards, forcerules)))
    targetfiles = set(chain(files_of(targets), priorityfiles, forcefiles))
    if forcetargets:
        forcefiles.update(targetfiles)
        forcerules.update(targetrules)

    rules = self.rules
    if allowed_rules:
        # build the lookup set once instead of once per rule
        allowed_names = set(allowed_rules)
        rules = [rule for rule in rules if rule.name in allowed_names]

    if wait_for_files is not None:
        try:
            snakemake.io.wait_for_files(wait_for_files,
                                        latency_wait=latency_wait)
        except IOError as e:
            logger.error(str(e))
            return False

    dag = DAG(
        self, rules,
        dryrun=dryrun,
        targetfiles=targetfiles,
        targetrules=targetrules,
        forceall=forceall,
        forcefiles=forcefiles,
        forcerules=forcerules,
        priorityfiles=priorityfiles,
        priorityrules=priorityrules,
        ignore_ambiguity=ignore_ambiguity,
        force_incomplete=force_incomplete,
        ignore_incomplete=ignore_incomplete or printdag or printrulegraph,
        notemp=notemp)

    self.persistence = Persistence(
        nolock=nolock,
        dag=dag,
        warn_only=dryrun or printrulegraph or printdag or summary or
        list_version_changes or list_code_changes or list_input_changes or
        list_params_changes)

    if cleanup_metadata:
        for f in cleanup_metadata:
            self.persistence.cleanup_metadata(f)
        return True

    dag.init()
    dag.check_dynamic()

    if unlock:
        try:
            self.persistence.cleanup_locks()
            logger.info("Unlocking working directory.")
            return True
        except IOError:
            # the placeholder used to be printed literally; fill in the cwd
            logger.error("Error: Unlocking the directory {} failed. Maybe "
                         "you don't have the permissions?".format(
                             os.getcwd()))
            return False
    try:
        self.persistence.lock()
    except IOError:
        logger.error(
            "Error: Directory cannot be locked. Please make "
            "sure that no other Snakemake process is trying to create "
            "the same files in the following directory:\n{}\n"
            "If you are sure that no other "
            "instances of snakemake are running on this directory, "
            "the remaining lock was likely caused by a kill signal or "
            "a power loss. It can be removed with "
            "the --unlock argument.".format(os.getcwd()))
        return False

    if self.subworkflows and not printdag and not printrulegraph:
        # backup globals
        globals_backup = dict(self.globals)
        # execute subworkflows
        for subworkflow in self.subworkflows:
            subworkflow_targets = subworkflow.targets(dag)
            updated = list()
            if subworkflow_targets:
                logger.info(
                    "Executing subworkflow {}.".format(subworkflow.name))
                if not subsnakemake(subworkflow.snakefile,
                                    workdir=subworkflow.workdir,
                                    targets=subworkflow_targets,
                                    updated_files=updated):
                    return False
                dag.updated_subworkflow_files.update(subworkflow.target(f)
                                                     for f in updated)
            else:
                logger.info("Subworkflow {}: Nothing to be done.".format(
                    subworkflow.name))
        if self.subworkflows:
            logger.info("Executing main workflow.")
        # rescue globals
        self.globals.update(globals_backup)

    dag.check_incomplete()
    dag.postprocess()

    if nodeps:
        missing_input = [f for job in dag.targetjobs for f in job.input
                         if dag.needrun(job) and not os.path.exists(f)]
        if missing_input:
            logger.error(
                "Dependency resolution disabled (--nodeps) "
                "but missing input "
                "files detected. If this happens on a cluster, please make sure "
                "that you handle the dependencies yourself or turn of "
                "--immediate-submit. Missing input files:\n{}".format(
                    "\n".join(missing_input)))
            return False

    # updated_files defaults to None; only report back when a list was given
    if updated_files is not None:
        updated_files.extend(f for job in dag.needrun_jobs
                             for f in job.output)

    if printd3dag:
        dag.d3dag()
        return True
    elif printdag:
        print(dag)
        return True
    elif printrulegraph:
        print(dag.rule_dot())
        return True
    elif summary:
        print("\n".join(dag.summary(detailed=False)))
        return True
    elif detailed_summary:
        print("\n".join(dag.summary(detailed=True)))
        return True
    elif list_version_changes:
        items = list(
            chain(*map(self.persistence.version_changed, dag.jobs)))
        if items:
            print(*items, sep="\n")
        return True
    elif list_code_changes:
        items = list(chain(*map(self.persistence.code_changed, dag.jobs)))
        if items:
            print(*items, sep="\n")
        return True
    elif list_input_changes:
        items = list(chain(*map(self.persistence.input_changed, dag.jobs)))
        if items:
            print(*items, sep="\n")
        return True
    elif list_params_changes:
        items = list(
            chain(*map(self.persistence.params_changed, dag.jobs)))
        if items:
            print(*items, sep="\n")
        return True

    scheduler = JobScheduler(self, dag, cores,
                             local_cores=local_cores,
                             dryrun=dryrun,
                             touch=touch,
                             cluster=cluster,
                             cluster_config=cluster_config,
                             cluster_sync=cluster_sync,
                             jobname=jobname,
                             immediate_submit=immediate_submit,
                             quiet=quiet,
                             keepgoing=keepgoing,
                             drmaa=drmaa,
                             printreason=printreason,
                             printshellcmds=printshellcmds,
                             latency_wait=latency_wait,
                             benchmark_repeats=benchmark_repeats,
                             greediness=greediness)

    if not dryrun and not quiet:
        if len(dag):
            if cluster or cluster_sync or drmaa:
                logger.resources_info(
                    "Provided cluster nodes: {}".format(nodes))
            else:
                logger.resources_info("Provided cores: {}".format(cores))
                logger.resources_info(
                    "Rules claiming more threads will be scaled down.")
            provided_resources = format_resources(resources)
            if provided_resources:
                logger.resources_info(
                    "Provided resources: " + provided_resources)
            ignored_resources = format_resource_names(
                set(resource for job in dag.needrun_jobs
                    for resource in job.resources_dict
                    if resource not in resources))
            if ignored_resources:
                logger.resources_info(
                    "Ignored resources: " + ignored_resources)
            logger.run_info("\n".join(dag.stats()))
        else:
            logger.info("Nothing to be done.")
    if dryrun and not len(dag):
        logger.info("Nothing to be done.")

    success = scheduler.schedule()

    if success:
        if dryrun:
            if not quiet and len(dag):
                logger.run_info("\n".join(dag.stats()))
        elif stats:
            scheduler.stats.to_json(stats)
        if not dryrun and not no_hooks:
            self._onsuccess(logger.get_logfile())
        return True
    else:
        if not dryrun and not no_hooks:
            self._onerror(logger.get_logfile())
        return False
def workflow_parser(args_dict):
    """Dispatch the parsed command line options and run the workflow.

    The function handles, in this order:

    1. ``unlock``: unlock the working directory and exit.
    2. ``generate_template``: write template files and exit.
    3. Cluster mode (``cluster_config`` set): ``local_cores``, ``nodes`` and
       ``cluster`` are read from the cluster configuration file, which is
       then also used as the workflow configuration file.
    4. Local mode (``config`` set).
    5. Running the Snakemake API with the remaining options.

    Args:
        args_dict (dict): parsed command line options. ``cluster_config``,
            ``config`` and ``subcommand`` must be present; ``workdir`` is
            read when unlocking or generating templates. The dict is
            modified in place in cluster mode.

    Raises:
        SystemExit: with status 0 after unlocking or generating templates,
            and with status 1 when no configuration file is given or
            Snakemake rejects an option.
    """
    # Unlock locked dir and exit
    if args_dict.get("unlock", False):
        logger.warning("Unlocking working directory")
        unlock_dir(workdir=args_dict["workdir"])
        sys.exit()

    # Generate templates and exit
    if args_dict.get("generate_template", False):
        logger.warning("Generate template files in working directory")
        generate_template(workflow_dir=WORKFLOW_DIR,
                          templates=args_dict["generate_template"],
                          workflow=args_dict["subcommand"],
                          workdir=args_dict["workdir"],
                          overwrite=args_dict["overwrite_template"],
                          verbose=args_dict["verbose"],
                          quiet=args_dict["quiet"])
        sys.exit()

    # Cluster stuff to simplify options
    if args_dict["cluster_config"]:
        logger.warning("INITIALISING WORKFLOW IN CLUSTER MODE")
        cluster_config_fn = args_dict["cluster_config"]
        args_dict["local_cores"] = get_yaml_val(yaml_fn=cluster_config_fn,
                                                val_name="cluster_cores",
                                                default=10000)
        args_dict["nodes"] = get_yaml_val(yaml_fn=cluster_config_fn,
                                          val_name="cluster_nodes",
                                          default=500)
        args_dict["cluster"] = get_yaml_val(yaml_fn=cluster_config_fn,
                                            val_name="cluster_cmd")
        # the cluster configuration file doubles as workflow configuration
        args_dict["config"] = args_dict["cluster_config"]
        logger.debug("Cores:{} / Nodes:{} / Cluster_cmd:{}".format(
            args_dict['local_cores'], args_dict['nodes'],
            args_dict['cluster']))
    elif args_dict["config"]:
        logger.warning("INITIALISING WORKFLOW IN LOCAL MODE")
    else:
        logger.error(
            "A configuration file `--config` or a cluster configuration file `--cluster_config` is required"
        )
        # this is a failure: report it through a non-zero exit status
        sys.exit(1)

    # Get and check config files
    logger.warning("LOADING CONFIGURATIONS INFO")
    snakefile = get_snakefile_fn(workflow_dir=WORKFLOW_DIR,
                                 workflow=args_dict["subcommand"])
    configfile = get_config_fn(config=args_dict["config"])
    kwargs = filter_out_options(args_dict)
    logger.debug(kwargs)

    # Run Snakemake API
    try:
        snakemake(snakefile=snakefile,
                  configfiles=[configfile],
                  use_conda=True,
                  wrapper_prefix=WRAPPER_PREFIX,
                  **kwargs)
    except TypeError as E:
        logger.error("Unsupported Option Error. {}".format(E))
        # unsupported options are an error, not a clean exit
        sys.exit(1)
def snakemake(snakefile,
              listrules=False,
              list_target_rules=False,
              cores=1,
              nodes=1,
              local_cores=1,
              resources=None,
              config=None,
              configfile=None,
              config_args=None,
              workdir=None,
              targets=None,
              dryrun=False,
              touch=False,
              forcetargets=False,
              forceall=False,
              forcerun=None,
              prioritytargets=None,
              stats=None,
              printreason=False,
              printshellcmds=False,
              printdag=False,
              printrulegraph=False,
              printd3dag=False,
              nocolor=False,
              quiet=False,
              keepgoing=False,
              cluster=None,
              cluster_config=None,
              cluster_sync=None,
              drmaa=None,
              jobname="snakejob.{rulename}.{jobid}.sh",
              immediate_submit=False,
              standalone=False,
              ignore_ambiguity=False,
              snakemakepath=None,
              lock=True,
              unlock=False,
              cleanup_metadata=None,
              force_incomplete=False,
              ignore_incomplete=False,
              list_version_changes=False,
              list_code_changes=False,
              list_input_changes=False,
              list_params_changes=False,
              list_resources=False,
              summary=False,
              detailed_summary=False,
              latency_wait=3,
              benchmark_repeats=1,
              wait_for_files=None,
              print_compilation=False,
              debug=False,
              notemp=False,
              nodeps=False,
              keep_target_files=False,
              allowed_rules=None,
              jobscript=None,
              timestamp=False,
              greediness=None,
              no_hooks=False,
              overwrite_shellcmd=None,
              updated_files=None,
              log_handler=None,
              keep_logger=False,
              verbose=False):
    """Run snakemake on a given snakefile.

    This function provides access to the whole snakemake functionality. It is not thread-safe.

    Args:
        snakefile (str):            the path to the snakefile
        listrules (bool):           list rules (default False)
        list_target_rules (bool):   list target rules (default False)
        cores (int):                the number of provided cores (ignored when using cluster support) (default 1)
        nodes (int):                the number of provided cluster nodes (ignored without cluster support) (default 1)
        local_cores (int):          the number of provided local cores if in cluster mode (ignored without cluster support) (default 1)
        resources (dict):           provided resources, a dictionary assigning integers to resource names, e.g. {gpu=1, io=5} (default None, treated as {})
        config (dict):              override values for workflow config (default None, treated as {})
        configfile (str):           path to a config file; its content is loaded first and then overridden by ``config`` (default None)
        config_args:                handed to the Workflow unchanged (default None)
        workdir (str):              path to working directory; created if missing (default None)
        targets (list):             list of targets, e.g. rule or file names (default None)
        dryrun (bool):              only dry-run the workflow (default False)
        touch (bool):               only touch all output files if present (default False)
        forcetargets (bool):        force given targets to be re-created (default False)
        forceall (bool):            force all output files to be re-created (default False)
        forcerun (list):            list of files and rules that shall be re-created/re-executed (default None, treated as [])
        prioritytargets (list):     list of targets that shall be run with maximum priority (default None, treated as [])
        stats (str):                path to file that shall contain stats about the workflow execution (default None)
        printreason (bool):         print the reason for the execution of each job (default False)
        printshellcmds (bool):      print the shell command of each job (default False)
        printdag (bool):            print the dag in the graphviz dot language (default False)
        printrulegraph (bool):      print the graph of rules in the graphviz dot language (default False)
        printd3dag (bool):          print a D3.js compatible JSON representation of the DAG (default False)
        nocolor (bool):             do not print colored output (default False)
        quiet (bool):               do not print any default job information (default False)
        keepgoing (bool):           keep going upon errors (default False)
        cluster (str):              submission command of a cluster or batch system to use, e.g. qsub (default None)
        cluster_config (str):       configuration file for cluster options (default None)
        cluster_sync (str):         blocking cluster submission command (like SGE 'qsub -sync y') (default None)
        drmaa (str):                if not None use DRMAA for cluster support, str specifies native args passed to the cluster when submitting a job
        jobname (str):              naming scheme for cluster job scripts (default "snakejob.{rulename}.{jobid}.sh")
        immediate_submit (bool):    immediately submit all cluster jobs, regardless of dependencies (default False)
        standalone (bool):          kill all processes very rudely in case of failure (do not use this if you use this API) (default False)
        ignore_ambiguity (bool):    ignore ambiguous rules and always take the first possible one (default False)
        snakemakepath (str):        path to the snakemake executable (default None)
        lock (bool):                lock the working directory when executing the workflow (default True)
        unlock (bool):              just unlock the working directory (default False)
        cleanup_metadata (bool):    just cleanup metadata of output files (default False)
        force_incomplete (bool):    force the re-creation of incomplete files (default False)
        ignore_incomplete (bool):   ignore incomplete files (default False)
        list_version_changes (bool): list output files with changed rule version (default False)
        list_code_changes (bool):   list output files with changed rule code (default False)
        list_input_changes (bool):  list output files with changed input files (default False)
        list_params_changes (bool): list output files with changed params (default False)
        list_resources (bool):      list resources used in the workflow (default False)
        summary (bool):             list summary of all output files and their status (default False)
        detailed_summary (bool):    list summary of all input and output files and their status (default False)
        latency_wait (int):         how many seconds to wait for an output file to appear after the execution of a job, e.g. to handle filesystem latency (default 3)
        benchmark_repeats (int):    number of repeated runs of a job if declared for benchmarking (default 1)
        wait_for_files (list):      wait for given files to be present before executing the workflow
        print_compilation (bool):   print the compilation of the snakefile (default False)
        debug (bool):               allow to use the debugger within rules
        notemp (bool):              ignore temp file flags, e.g. do not delete output files marked as temp after use (default False)
        nodeps (bool):              ignore dependencies (default False)
        keep_target_files (bool):   Do not adjust the paths of given target files relative to the working directory.
        allowed_rules (set):        Restrict allowed rules to the given set. If None or empty, all rules are used.
        jobscript (str):            path to a custom shell script template for cluster jobs (default None)
        timestamp (bool):           print time stamps in front of any output (default False)
        greediness (float):         set the greediness of scheduling. This value between 0 and 1 determines how careful jobs are selected for execution. The default value (0.5 if prioritytargets are used, 1.0 else) provides the best speed and still acceptable scheduling quality.
        no_hooks (bool):            forwarded to the workflow execution (default False)
        overwrite_shellcmd (str):   a shell command that shall be executed instead of those given in the workflow. This is for debugging purposes only.
        updated_files (list):       a list that will be filled with the files that are updated or created during the workflow execution
        verbose (bool):             show additional debug output (default False)
        keep_logger (bool):         do not set up the logger at the start nor clean it up at the end (default False)
        log_handler (function):     redirect snakemake output to this custom log handler, a function that takes a log message dictionary (see below) as its only argument (default None). The log message dictionary for the log handler has the following entries:

            :level:
                the log level ("info", "error", "debug", "progress", "job_info")

            :level="info", "error" or "debug":
                :msg:
                    the log message
            :level="progress":
                :done:
                    number of already executed jobs

                :total:
                    number of total jobs

            :level="job_info":
                :input:
                    list of input files of a job

                :output:
                    list of output files of a job

                :log:
                    path to log file of a job

                :local:
                    whether a job is executed locally (i.e. ignoring cluster)

                :msg:
                    the job message

                :reason:
                    the job reason

                :priority:
                    the job priority

                :threads:
                    the threads of the job

    Returns:
        bool:   True if workflow execution was successful.

    """
    # Fresh containers per call: the former mutable defaults were shared
    # between calls, and `resources` is modified during workflow execution.
    if resources is None:
        resources = dict()
    if config is None:
        config = dict()
    if forcerun is None:
        forcerun = list()
    if prioritytargets is None:
        prioritytargets = list()

    if updated_files is None:
        updated_files = list()

    if cluster or cluster_sync or drmaa:
        cores = sys.maxsize
    else:
        nodes = sys.maxsize

    # Keep the path in `cluster_config`: subworkflows receive it and load the
    # file themselves. Only the parsed content goes to workflow.execute().
    if cluster_config:
        cluster_config_content = load_configfile(cluster_config)
    else:
        cluster_config_content = dict()

    if not keep_logger:
        setup_logger(handler=log_handler,
                     quiet=quiet,
                     printreason=printreason,
                     printshellcmds=printshellcmds,
                     nocolor=nocolor,
                     stdout=dryrun,
                     debug=verbose,
                     timestamp=timestamp)

    if greediness is None:
        greediness = 0.5 if prioritytargets else 1.0
    else:
        if not (0 <= greediness <= 1.0):
            logger.error("Error: greediness must be a float between 0 and 1.")
            return False

    if not os.path.exists(snakefile):
        logger.error("Error: Snakefile \"{}\" not present.".format(snakefile))
        return False
    snakefile = os.path.abspath(snakefile)

    # booleans add up to the number of cluster backends requested
    cluster_mode = ((cluster is not None) + (cluster_sync is not None) +
                    (drmaa is not None))
    if cluster_mode > 1:
        logger.error("Error: cluster and drmaa args are mutually exclusive")
        return False
    if debug and (cores > 1 or cluster_mode):
        logger.error(
            "Error: debug mode cannot be used with more than one core or cluster execution.")
        return False

    overwrite_config = dict()
    if configfile:
        overwrite_config.update(load_configfile(configfile))
    if config:
        overwrite_config.update(config)

    if workdir:
        olddir = os.getcwd()
        if not os.path.exists(workdir):
            logger.info(
                "Creating specified working directory {}.".format(workdir))
            os.makedirs(workdir)
        workdir = os.path.abspath(workdir)
        os.chdir(workdir)
    workflow = Workflow(snakefile=snakefile,
                        snakemakepath=snakemakepath,
                        jobscript=jobscript,
                        overwrite_shellcmd=overwrite_shellcmd,
                        overwrite_config=overwrite_config,
                        overwrite_workdir=workdir,
                        overwrite_configfile=configfile,
                        config_args=config_args,
                        debug=debug)

    if standalone:
        try:
            # set the process group
            os.setpgrp()
        except Exception:
            # ignore: if it does not work we can still work without it
            pass

    success = True
    try:
        workflow.include(snakefile,
                         overwrite_first_rule=True,
                         print_compilation=print_compilation)
        workflow.check()

        if not print_compilation:
            if listrules:
                workflow.list_rules()
            elif list_target_rules:
                workflow.list_rules(only_targets=True)
            elif list_resources:
                workflow.list_resources()
            else:
                # handle subworkflows
                subsnakemake = partial(snakemake,
                                       cores=cores,
                                       nodes=nodes,
                                       local_cores=local_cores,
                                       resources=resources,
                                       dryrun=dryrun,
                                       touch=touch,
                                       printreason=printreason,
                                       printshellcmds=printshellcmds,
                                       nocolor=nocolor,
                                       quiet=quiet,
                                       keepgoing=keepgoing,
                                       cluster=cluster,
                                       cluster_config=cluster_config,
                                       cluster_sync=cluster_sync,
                                       drmaa=drmaa,
                                       jobname=jobname,
                                       immediate_submit=immediate_submit,
                                       standalone=standalone,
                                       ignore_ambiguity=ignore_ambiguity,
                                       snakemakepath=snakemakepath,
                                       lock=lock,
                                       unlock=unlock,
                                       cleanup_metadata=cleanup_metadata,
                                       force_incomplete=force_incomplete,
                                       ignore_incomplete=ignore_incomplete,
                                       latency_wait=latency_wait,
                                       benchmark_repeats=benchmark_repeats,
                                       verbose=verbose,
                                       notemp=notemp,
                                       nodeps=nodeps,
                                       jobscript=jobscript,
                                       timestamp=timestamp,
                                       greediness=greediness,
                                       no_hooks=no_hooks,
                                       overwrite_shellcmd=overwrite_shellcmd,
                                       config=config,
                                       config_args=config_args,
                                       keep_logger=True)
                success = workflow.execute(
                    targets=targets,
                    dryrun=dryrun,
                    touch=touch,
                    cores=cores,
                    nodes=nodes,
                    local_cores=local_cores,
                    forcetargets=forcetargets,
                    forceall=forceall,
                    forcerun=forcerun,
                    prioritytargets=prioritytargets,
                    quiet=quiet,
                    keepgoing=keepgoing,
                    printshellcmds=printshellcmds,
                    printreason=printreason,
                    printrulegraph=printrulegraph,
                    printdag=printdag,
                    cluster=cluster,
                    cluster_config=cluster_config_content,
                    cluster_sync=cluster_sync,
                    jobname=jobname,
                    drmaa=drmaa,
                    printd3dag=printd3dag,
                    immediate_submit=immediate_submit,
                    ignore_ambiguity=ignore_ambiguity,
                    stats=stats,
                    force_incomplete=force_incomplete,
                    ignore_incomplete=ignore_incomplete,
                    list_version_changes=list_version_changes,
                    list_code_changes=list_code_changes,
                    list_input_changes=list_input_changes,
                    list_params_changes=list_params_changes,
                    summary=summary,
                    latency_wait=latency_wait,
                    benchmark_repeats=benchmark_repeats,
                    wait_for_files=wait_for_files,
                    detailed_summary=detailed_summary,
                    nolock=not lock,
                    unlock=unlock,
                    resources=resources,
                    notemp=notemp,
                    nodeps=nodeps,
                    keep_target_files=keep_target_files,
                    cleanup_metadata=cleanup_metadata,
                    subsnakemake=subsnakemake,
                    updated_files=updated_files,
                    allowed_rules=allowed_rules,
                    greediness=greediness,
                    no_hooks=no_hooks)

    except BrokenPipeError:
        # ignore this exception and stop. It occurs if snakemake output is piped into less and less quits before reading the whole output.
        # in such a case, snakemake shall stop scheduling and quit with error 1
        success = False
    except BaseException as ex:
        # deliberately broad: KeyboardInterrupt and SystemExit are reported too
        print_exception(ex, workflow.linemaps)
        success = False
    if workdir:
        os.chdir(olddir)
    if workflow.persistence:
        workflow.persistence.unlock()
    if not keep_logger:
        logger.cleanup()
    return success
def schedule(self):
    """Run the scheduling loop until all jobs are done or the run is aborted.

    Every pass waits on ``self._open_jobs``; then, under ``self._lock``, it
    calls ``_finish_jobs`` and ``_error_jobs`` and snapshots the scheduler
    state. Afterwards the loop aborts, shuts down, waits for the next
    wake-up, or selects and starts the next batch of jobs.

    Returns:
        bool: False after an abort (user kill, executor error, errors
        without ``keepgoing``, KeyboardInterrupt or SystemExit) or when jobs
        remain unfinished without ``keepgoing``; otherwise
        ``not self._errors`` as observed when no job was left to run.
    """
    try:
        while True:
            # blocks until the semaphore is released by another party
            self._open_jobs.acquire()

            # process finished/failed jobs and snapshot the shared state
            with self._lock:
                self._finish_jobs()
                self._error_jobs()
                ready = set(self.open_jobs)
                active = list(self.running)
                failed = self._errors
                executor_failure = self._executor_error
                kill_request = self._user_kill

            # abort handling
            if (kill_request
                    or (not self.keepgoing and failed)
                    or executor_failure):
                if kill_request == "graceful":
                    logger.info(
                        "Will exit after finishing currently running jobs (scheduler)."
                    )
                if executor_failure:
                    print_exception(executor_failure, self.workflow.linemaps)
                if not executor_failure and active:
                    # jobs are still running; check again on the next wake-up
                    continue
                logger.info("Shutting down, this might take some time.")
                self._executor.shutdown()
                if not kill_request:
                    logger.error(_ERROR_MSG_FINAL)
                return False

            if not ready:
                if not (not active or self.workflow.immediate_submit):
                    # nothing to start, but jobs are still running
                    continue

                # all runnable jobs have finished, normal shutdown
                self._executor.shutdown()
                if failed:
                    logger.error(_ERROR_MSG_FINAL)
                # unfinished jobs at this point are unexpected; point the
                # user to the corresponding issue message
                if self.remaining_jobs and not self.keepgoing:
                    logger.error(_ERROR_MSG_ISSUE_823)
                    leftover = [
                        " - " + str(job) + ": " + ", ".join(job.output)
                        for job in self.remaining_jobs
                    ]
                    logger.error("Remaining jobs:\n" + "\n".join(leftover))
                    return False
                return not failed

            # job selection is skipped for dry-runs: every ready job is taken
            if self.dryrun:
                selected = ready
            else:
                # Params and resources may still hold TBDs or values from
                # before files were regenerated; reset them so they are
                # recalculated now that all input is present and up to date.
                for job in ready:
                    job.reset_params_and_resources()

                logger.debug(
                    "Resources before job selection: {}".format(self.resources)
                )
                logger.debug(
                    "Ready jobs ({}):\n\t".format(len(ready))
                    + "\n\t".join(map(str, ready))
                )

                # announce the selection only if the previous one found jobs
                if not self._last_job_selection_empty:
                    logger.info("Select jobs to execute...")
                selected = self.job_selector(ready)
                self._last_job_selection_empty = not selected

                logger.debug(
                    "Selected jobs ({}):\n\t".format(len(selected))
                    + "\n\t".join(map(str, selected))
                )
                logger.debug(
                    "Resources after job selection: {}".format(self.resources)
                )

            with self._lock:
                # mark the batch as running ...
                self.running.update(selected)
                # ... and remove it from the ready jobs
                self.dag.register_running(selected)

            # local jobs prefer the local executor, if there is one
            local_batch = [job for job in selected if job.is_local]
            other_batch = [job for job in selected if not job.is_local]
            self.run(
                local_batch, executor=self._local_executor or self._executor
            )
            self.run(other_batch)
    except (KeyboardInterrupt, SystemExit):
        logger.info(
            "Terminating processes on user request, this might take some time."
        )
        self._executor.cancel()
        return False
def _exit_on_sra_query_failure(logger) -> None:
    """Log that querying the SRA failed and terminate the process with status 1."""
    logger.error(
        "We had trouble querying the SRA. This probably means that the SRA was unresponsive, and their servers "
        "are overloaded or slow. Please try again in a bit...\n"
        "Another possible option is that you try to access samples that do not exist or are protected, and "
        "seq2science does not support downloading those..\n\n")
    os._exit(1)  # noqa


def samples2metadata_sra(samples: List[str], logger) -> dict:
    """
    Get the required info to continue a seq2science run from a list of public samples.

    For every sample:
        - GSM numbers are first converted to SRX numbers,
        - all runs that belong to the sample are looked up on the SRA,
        - the runs must share one layout (PAIRED or SINGLE), otherwise an assertion fails,
        - the ENA download urls of the runs are collected; if any url of any run is missing,
          the sample as a whole is treated as not available on ENA (None).

    If the SRA cannot be queried, an error is logged and the process is terminated (exit status 1).
    An AssertionError is raised for samples without runs, with mixed or unclear layouts, or that
    were sequenced on a platform that is not supported.

    output:
        dict(
            "GSM1234": {"layout": "PAIRED",
                         "runs": ["SRR1234", "SRR4321"],
                         "ena_fastq_ftp": {...},

            "SRR5678": {"layout": "SINGLE",
                        "runs": ["SRR5678"],
                        ena_fastq_ftp: None,
            ...
        )
    """
    # nothing to look up, so do not bother the SRA at all
    if not samples:
        return dict()

    # start with empty dictionary which we fill out later
    SAMPLEDICT = {sample: dict() for sample in samples}

    # only continue with public samples
    db_sra = pysradb.SRAweb()

    # all samples that are on GEO (GSM numbers), must first be converted to SRA ids (SRX numbers)
    geo_samples = [sample for sample in samples if sample.startswith("GSM")]

    # in sample2clean we store the (potential GEO) sample name in a SRA compliant name
    if geo_samples:
        try:
            df_geo = db_sra.gsm_to_srx(geo_samples)
        except Exception:
            # Exception (not a bare except) so that Ctrl-C is not reported as an SRA problem
            _exit_on_sra_query_failure(logger)
        sample2clean = dict(
            zip(df_geo.experiment_alias, df_geo.experiment_accession))
    else:
        sample2clean = dict()

    # now add the already SRA compliant names with a reference to itself
    sample2clean.update(
        {sample: sample for sample in samples if sample not in geo_samples})

    # check our samples on sra
    try:
        df_sra = db_sra.sra_metadata(list(sample2clean.values()),
                                     detailed=True)
    except Exception:
        _exit_on_sra_query_failure(logger)

    # keep track of not-supported samples
    not_supported_formats = ["ABI_SOLID"]
    not_supported_samples = []

    for sample, clean in sample2clean.items():
        # table indices
        idxs = _sample_to_idxs(df_sra, clean)
        sample_rows = df_sra.loc[idxs]

        # get all runs that belong to the sample
        runs = sample_rows.run_accession.tolist()
        assert len(runs) >= 1
        SAMPLEDICT[sample]["runs"] = runs

        # check if sample is from a supported format; report each sample only once,
        # no matter how many of its runs are affected
        if any(real_format in not_supported_formats
               for real_format in sample_rows.instrument_model_desc.tolist()):
            not_supported_samples.append(sample)

        # get the layout
        layout = sample_rows.library_layout.tolist()
        assert len(set(
            layout)) == 1, f"sample {sample} consists of mixed layouts, bad!"
        assert layout[0] in ["PAIRED", "SINGLE"
                             ], f"sample {sample} is an unclear layout, bad!"
        SAMPLEDICT[sample]["layout"] = layout[0]

        # get the ena url
        SAMPLEDICT[sample]["ena_fastq_ftp"] = dict()
        for run in runs:
            run_rows = df_sra[df_sra.run_accession == run]
            if layout[0] == "SINGLE":
                SAMPLEDICT[sample]["ena_fastq_ftp"][run] = (
                    run_rows.ena_fastq_ftp.tolist())
            elif layout[0] == "PAIRED":
                SAMPLEDICT[sample]["ena_fastq_ftp"][run] = (
                    run_rows.ena_fastq_ftp_1.tolist() +
                    run_rows.ena_fastq_ftp_2.tolist())

        # if any run from a sample is not found on ENA, better be safe, and assume that sample as a whole is not on ENA
        if any(
                any(pd.isna(urls))
                for urls in SAMPLEDICT[sample]["ena_fastq_ftp"].values()):
            SAMPLEDICT[sample]["ena_fastq_ftp"] = None

    # now report single message for all sample(s) that are from a sequencing platform that is not supported
    assert len(not_supported_samples) == 0, (
        f'Sample(s) {", ".join(not_supported_samples)} are not supported by seq2science. Samples that are one of '
        f'these formats; [{", ".join(not_supported_formats)}] are not supported.'
    )

    return SAMPLEDICT
def snakemake(snakefile,
              listrules=False,
              cores=1,
              resources=None,
              workdir=None,
              targets=None,
              dryrun=False,
              touch=False,
              forcetargets=False,
              forceall=False,
              forcerun=None,
              prioritytargets=None,
              stats=None,
              printreason=False,
              printshellcmds=False,
              printdag=False,
              printrulegraph=False,
              nocolor=False,
              quiet=False,
              keepgoing=False,
              cluster=None,
              immediate_submit=False,
              standalone=False,
              ignore_ambiguity=False,
              snakemakepath=None,
              lock=True,
              unlock=False,
              cleanup_metadata=None,
              force_incomplete=False,
              ignore_incomplete=False,
              list_version_changes=False,
              list_code_changes=False,
              list_input_changes=False,
              list_params_changes=False,
              summary=False,
              output_wait=3,
              print_compilation=False,
              debug=False,
              notemp=False,
              nodeps=False,
              jobscript=None,
              timestamp=False):
    """
    Run snakemake on a given snakefile.
    Note: at the moment, this function is not thread-safe!

    Arguments
    snakefile         -- path of the snakefile; must exist.
    listrules         -- only list the rules instead of executing the workflow.
    workdir           -- working directory, handed to the workflow when the
                         snakefile is included.
    print_compilation -- handed to the include step; nothing is listed or
                         executed when set.
    standalone        -- try to put the process into its own process group.
    lock              -- lock the working directory (passed on as
                         ``nolock=not lock``).
    nocolor, debug, timestamp -- logger settings (``dryrun`` selects stdout).

    All remaining arguments are forwarded unchanged to the execution of the
    main workflow and, where the subworkflow call accepts them, to the
    execution of the subworkflows.

    Returns True on success and False if the snakefile is missing, a
    subworkflow or the main workflow fails, or an exception occurs.
    """
    init_logger(nocolor=nocolor,
                stdout=dryrun,
                debug=debug,
                timestamp=timestamp)

    if not os.path.exists(snakefile):
        logger.error("Error: Snakefile \"{}\" not present.".format(snakefile))
        return False

    if workdir:
        # remembered so the caller's directory can be restored at the end
        olddir = os.getcwd()

    workflow = Workflow(snakefile=snakefile,
                        snakemakepath=snakemakepath,
                        jobscript=jobscript)

    if standalone:
        try:
            # set the process group
            os.setpgrp()
        except BaseException:
            # ignore: if it does not work we can still work without it
            pass

    success = True
    try:
        workflow.include(snakefile,
                         workdir=workdir,
                         overwrite_first_rule=True,
                         print_compilation=print_compilation)
        workflow.check()

        if print_compilation:
            # nothing is listed or executed in this mode
            pass
        elif listrules:
            workflow.list_rules()
        else:
            if not (printdag or printrulegraph):
                # options shared by every subworkflow invocation
                sub_options = dict(
                    cores=cores,
                    resources=resources,
                    dryrun=dryrun,
                    touch=touch,
                    printreason=printreason,
                    printshellcmds=printshellcmds,
                    nocolor=nocolor,
                    quiet=quiet,
                    keepgoing=keepgoing,
                    cluster=cluster,
                    immediate_submit=immediate_submit,
                    standalone=standalone,
                    ignore_ambiguity=ignore_ambiguity,
                    snakemakepath=snakemakepath,
                    lock=lock,
                    unlock=unlock,
                    cleanup_metadata=cleanup_metadata,
                    force_incomplete=force_incomplete,
                    ignore_incomplete=ignore_incomplete,
                    output_wait=output_wait,
                    debug=debug,
                    notemp=notemp,
                    nodeps=nodeps,
                    jobscript=jobscript,
                    timestamp=timestamp)
                subsnakemake = partial(snakemake, **sub_options)

                # every subworkflow is attempted, even after a failure
                for sub in workflow.subworkflows:
                    logger.warning(
                        "Executing subworkflow {}.".format(sub.name))
                    sub_ok = subsnakemake(sub.snakefile,
                                          workdir=sub.workdir,
                                          targets=sub.targets)
                    if not sub_ok:
                        success = False
                if workflow.subworkflows:
                    logger.warning("Executing main workflow.")

            # the main workflow only runs if no subworkflow failed
            if success:
                success = workflow.execute(
                    targets=targets,
                    dryrun=dryrun,
                    touch=touch,
                    cores=cores,
                    forcetargets=forcetargets,
                    forceall=forceall,
                    forcerun=forcerun,
                    prioritytargets=prioritytargets,
                    quiet=quiet,
                    keepgoing=keepgoing,
                    printshellcmds=printshellcmds,
                    printreason=printreason,
                    printrulegraph=printrulegraph,
                    printdag=printdag,
                    cluster=cluster,
                    immediate_submit=immediate_submit,
                    ignore_ambiguity=ignore_ambiguity,
                    workdir=workdir,
                    stats=stats,
                    force_incomplete=force_incomplete,
                    ignore_incomplete=ignore_incomplete,
                    list_version_changes=list_version_changes,
                    list_code_changes=list_code_changes,
                    list_input_changes=list_input_changes,
                    list_params_changes=list_params_changes,
                    summary=summary,
                    output_wait=output_wait,
                    nolock=not lock,
                    unlock=unlock,
                    resources=resources,
                    notemp=notemp,
                    nodeps=nodeps,
                    cleanup_metadata=cleanup_metadata)
    except BaseException as ex:
        # deliberately broad: every failure is reported and turned into False
        print_exception(ex, workflow.linemaps)
        success = False

    if workdir:
        os.chdir(olddir)
    if workflow.persistence:
        workflow.persistence.unlock()
    return success
def print_exception(ex, linemaps):
    """
    Report an exception through the logger in a form fitting its type.

    The complete traceback is always written to the debug log first.
    Afterwards exactly one reporting strategy is applied, checked in this
    order: syntax errors, exceptions whose origin can be located via
    get_exception_origin, and finally the individual exception classes
    handled below. Anything unrecognised is printed as a plain traceback.

    Arguments
    ex -- the exception
    linemaps -- a dict of a dict that maps for each snakefile
        the compiled lines to source code lines in the snakefile.
    """
    full_trace = "".join(
        traceback.format_exception(type(ex), ex, ex.__traceback__))
    logger.debug("Full " + full_trace)

    # Syntax problems carry their own line number and file name.
    if isinstance(ex, (SyntaxError, IndentationError)):
        report = format_error(
            ex, ex.lineno, linemaps=linemaps,
            snakefile=ex.filename, show_traceback=True)
        logger.error(report)
        return

    # Prefer the located origin of the exception whenever one is available.
    origin = get_exception_origin(ex, linemaps)
    if origin is not None:
        origin_line, origin_file = origin
        report = format_error(
            ex, origin_line, linemaps=linemaps,
            snakefile=origin_file, show_traceback=True)
        logger.error(report)
        return

    if isinstance(ex, TokenError):
        logger.error(format_error(ex, None, show_traceback=False))
        return

    if isinstance(ex, MissingRuleException):
        report = format_error(
            ex, None, linemaps=linemaps,
            snakefile=ex.filename, show_traceback=False)
        logger.error(report)
        return

    if isinstance(ex, RuleException):
        # Report the included exceptions first, then the exception itself,
        # leaving out every entry flagged as omitted.
        for rule_error in ex._include + [ex]:
            if rule_error.omit:
                continue
            report = format_error(
                rule_error, rule_error.lineno, linemaps=linemaps,
                snakefile=rule_error.filename, show_traceback=True)
            logger.error(report)
        return

    if isinstance(ex, WorkflowError):
        report = format_error(
            ex, ex.lineno, linemaps=linemaps,
            snakefile=ex.snakefile, show_traceback=True)
        logger.error(report)
        return

    if isinstance(ex, KeyboardInterrupt):
        logger.info("Cancelling snakemake on user request.")
        return

    # Unknown exception type: fall back to the standard traceback output.
    traceback.print_exception(type(ex), ex, ex.__traceback__)
def print_job_error(self, job):
    """Log an error naming the failed job and the output file(s) it was creating.

    Arguments
    job -- the failed job; its ``output`` is joined with ", " in the message
    """
    outputs = job.output
    # Pluralise "file" only when the job declares more than one output.
    plural = "s" if len(outputs) > 1 else ""
    message = "Error in job {} while creating output file{} {}.".format(
        job, plural, ", ".join(outputs))
    logger.error(message)
def _wait_for_jobs(self):
    """wait for jobs to complete. This means requesting their status,
    and then marking them as finished when a "done" parameter
    shows up. Even for finished jobs, the status should still return

    The loop runs until ``self.wait`` is false. On every pass the current
    ``self.active_jobs`` are taken over under ``self.lock``, each one is
    polled, and those that are not done yet are put back before ``sleep()``
    is called.

    A job whose status request fails (HTTP error or WorkflowError) gets its
    ``error_callback`` invoked exactly once and is dropped from the active
    list; its status is never evaluated, because none was obtained.
    """
    # Import the submodule explicitly: a bare ``import googleapiclient`` only
    # exposes ``googleapiclient.errors`` if some other module already
    # imported it.
    import googleapiclient.errors

    while True:
        # always use self.lock to avoid race conditions
        with self.lock:
            if not self.wait:
                return
            active_jobs = self.active_jobs
            self.active_jobs = list()
        still_running = list()

        # Loop through active jobs and act on status
        for j in active_jobs:

            # use self.status_rate_limiter to avoid too many API calls.
            with self.status_rate_limiter:

                # https://cloud.google.com/life-sciences/docs/reference/rest/v2beta/projects.locations.operations/get
                # Get status from projects.locations.operations/get
                operations = self._api.projects().locations().operations()
                request = operations.get(name=j.jobname)
                logger.debug("Checking status for operation {}".format(
                    j.jobid))

                try:
                    status = self._retry_request(request)
                except googleapiclient.errors.HttpError as ex:
                    # NOTE(review): HttpError is documented to carry the HTTP
                    # code on ``ex.resp.status``; a direct ``status``
                    # attribute is honoured first in case it exists.
                    http_status = getattr(ex, "status", None)
                    if http_status is None:
                        http_status = getattr(
                            getattr(ex, "resp", None), "status", None)

                    # 404 means the operation name was not found (even
                    # finished operations should be found); it is failed
                    # without an extra message. Every other HTTP error
                    # (e.g. an unpredictable 500) is reported first.
                    if http_status != 404:
                        content = getattr(ex, "content", None)
                        if isinstance(content, bytes):
                            logger.error(content.decode("utf-8", "replace"))
                        else:
                            logger.error(str(ex))

                    j.error_callback(j.job)
                    # No status was retrieved, so the evaluation below must
                    # be skipped; otherwise ``status`` would be unbound or
                    # left over from the previous job.
                    continue
                except WorkflowError as ex:
                    print_exception(ex, self.workflow.linemaps)
                    j.error_callback(j.job)
                    continue

                # The operation is done
                if status.get("done", False):

                    # Derive success/failure from status codes (prints too)
                    if self._job_was_successful(status):
                        j.callback(j.job)
                    else:
                        self.print_job_error(j.job, jobid=j.jobid)
                        j.error_callback(j.job)

                # The operation is still running
                else:
                    still_running.append(j)

        with self.lock:
            self.active_jobs.extend(still_running)
        sleep()
def execute(
        self, targets=None, dryrun=False, touch=False, cores=1,
        forcetargets=False, forceall=False, forcerun=None,
        prioritytargets=None, quiet=False, keepgoing=False,
        printshellcmds=False, printreason=False, printdag=False,
        cluster=None, immediate_submit=False, ignore_ambiguity=False,
        workdir=None, printrulegraph=False,
        stats=None, force_incomplete=False, ignore_incomplete=False,
        list_version_changes=False, list_code_changes=False,
        list_input_changes=False, list_params_changes=False,
        summary=False, output_wait=3, nolock=False, unlock=False,
        resources=None, notemp=False, nodeps=False,
        cleanup_metadata=None):
    """
    Build the DAG for the requested targets and run, print or inspect it.

    The method changes into the working directory, splits the given
    targets into rules and files, creates the DAG and the persistence
    object and then performs exactly one action:

    cleanup_metadata -- clean the metadata of the given files and return
    unlock -- remove the locks of the working directory and return
    printdag, printrulegraph, summary, list_*_changes -- print the
        requested information and return
    otherwise -- schedule the jobs of the DAG

    Arguments (selection; all others are passed on unchanged to DAG,
    Persistence or JobScheduler)
    targets -- rules or files to create; defaults to the first rule
    workdir -- directory to change into; defaults to the workdir of the
        workflow or, if that is unset, the current directory
    resources -- global resources; ignored if cluster is set. The given
        dict is copied and never modified.
    nodeps -- abort if input files of target jobs that need to run are
        missing
    stats -- path handed to the scheduler stats for a CSV dump after a
        successful non-dry run

    Returns True on success and False if unlocking or locking failed,
    required input is missing under nodeps, or a job execution failed.
    """
    # Work on a copy: adding "_cores" must not alter the dict of the caller,
    # which may hand the same object to several workflows.
    if cluster or resources is None:
        self.global_resources = dict()
    else:
        self.global_resources = dict(resources)
    self.global_resources["_cores"] = cores

    def rules(items):
        return map(self._rules.__getitem__, filter(self.is_rule, items))

    def files(items):
        return map(os.path.relpath, filterfalse(self.is_rule, items))

    def print_changes(changed):
        # Print everything the given persistence check reports for the jobs.
        items = list(chain.from_iterable(map(changed, dag.jobs)))
        if items:
            print(*items, sep="\n")
        return True

    if workdir is None:
        workdir = os.getcwd() if self._workdir is None else self._workdir
    os.chdir(workdir)

    if not targets:
        targets = [self.first_rule] if self.first_rule is not None else list()
    if prioritytargets is None:
        prioritytargets = list()
    if forcerun is None:
        forcerun = list()

    priorityrules = set(rules(prioritytargets))
    priorityfiles = set(files(prioritytargets))
    forcerules = set(rules(forcerun))
    forcefiles = set(files(forcerun))
    # Prioritized and forced rules without wildcards become targets as well.
    targetrules = set(chain(
        rules(targets), filterfalse(Rule.has_wildcards, priorityrules),
        filterfalse(Rule.has_wildcards, forcerules)))
    targetfiles = set(chain(files(targets), priorityfiles, forcefiles))
    if forcetargets:
        forcefiles.update(targetfiles)
        forcerules.update(targetrules)

    dag = DAG(
        self, dryrun=dryrun, targetfiles=targetfiles,
        targetrules=targetrules,
        forceall=forceall, forcefiles=forcefiles,
        forcerules=forcerules, priorityfiles=priorityfiles,
        priorityrules=priorityrules, ignore_ambiguity=ignore_ambiguity,
        force_incomplete=force_incomplete,
        ignore_incomplete=ignore_incomplete, notemp=notemp)

    self.persistence = Persistence(nolock=nolock, dag=dag)

    if cleanup_metadata:
        for f in cleanup_metadata:
            self.persistence.cleanup_metadata(f)
        return True

    dag.init()
    dag.check_dynamic()

    if unlock:
        try:
            self.persistence.cleanup_locks()
            logger.warning("Unlocking working directory.")
            return True
        except IOError:
            # The working directory is the current one (see chdir above).
            logger.error("Error: Unlocking the directory {} failed. Maybe "
                "you don't have the permissions?".format(os.getcwd()))
            return False
    try:
        self.persistence.lock()
    except IOError:
        logger.critical("Error: Directory cannot be locked. Please make "
            "sure that no other Snakemake process is trying to create "
            "the same files in the following directory:\n{}\n"
            "If you are sure that no other "
            "instances of snakemake are running on this directory, "
            "the remaining lock was likely caused by a kill signal or "
            "a power loss. It can be removed with "
            "the --unlock argument.".format(os.getcwd()))
        return False

    dag.check_incomplete()
    dag.postprocess()

    if nodeps:
        missing_input = [
            f for job in dag.targetjobs for f in job.input
            if dag.needrun(job) and not os.path.exists(f)]
        # Only abort if something is really missing; otherwise --nodeps
        # could never execute anything.
        if missing_input:
            logger.critical("Dependency resolution disabled (--nodeps) "
                "but missing input "
                "files detected. If this happens on a cluster, please make sure "
                "that you handle the dependencies yourself or turn of "
                "--immediate-submit. Missing input files:\n{}".format(
                    "\n".join(missing_input)))
            return False

    if printdag:
        print(dag)
        return True
    elif printrulegraph:
        print(dag.rule_dot())
        return True
    elif summary:
        print("\n".join(dag.summary()))
        return True
    elif list_version_changes:
        return print_changes(self.persistence.version_changed)
    elif list_code_changes:
        return print_changes(self.persistence.code_changed)
    elif list_input_changes:
        return print_changes(self.persistence.input_changed)
    elif list_params_changes:
        return print_changes(self.persistence.params_changed)

    scheduler = JobScheduler(
        self, dag, cores, dryrun=dryrun, touch=touch, cluster=cluster,
        immediate_submit=immediate_submit,
        quiet=quiet, keepgoing=keepgoing,
        printreason=printreason, printshellcmds=printshellcmds,
        output_wait=output_wait)

    if not dryrun and not quiet and len(dag):
        if cluster:
            logger.warning("Provided cluster nodes: {}".format(cores))
        else:
            logger.warning("Provided cores: {}".format(cores))
        logger.warning("\n".join(dag.stats()))

    success = scheduler.schedule()

    if not success:
        logger.critical(
            "Exiting because a job execution failed. "
            "Look above for error message")
        return False

    if dryrun:
        if not quiet:
            logger.warning("\n".join(dag.stats()))
    elif stats:
        scheduler.stats.to_csv(stats)
    return True