def finish(self, job, update_dynamic=True): self._finished.add(job) try: self._ready_jobs.remove(job) except KeyError: pass # mark depending jobs as ready for job_ in self.depending[job]: if self.needrun(job_) and self._ready(job_): self._ready_jobs.add(job_) if update_dynamic and job.dynamic_output: logger.info("Dynamically updating jobs") newjob = self.update_dynamic(job) if newjob: # simulate that this job ran and was finished before self.omitforce.add(newjob) self._needrun.add(newjob) self._finished.add(newjob) self.postprocess() self.handle_protected(newjob) self.handle_touch(newjob) # add finished jobs to len as they are not counted after new postprocess self._len += len(self._finished)
def printjob(self, job): # skip dynamic jobs that will be "executed" only in dryrun mode if self.dag.dynamic(job): return def format_files(job, io, ruleio, dynamicio): for f in io: f_ = ruleio[f] if f in dynamicio: yield "{} (dynamic)".format(f.format_dynamic()) else: yield f priority = self.dag.priority(job) logger.job_info(jobid=self.dag.jobid(job), msg=job.message, name=job.rule.name, local=self.workflow.is_local(job.rule), input=list(format_files(job, job.input, job.ruleio, job.dynamic_input)), output=list(format_files(job, job.output, job.ruleio, job.dynamic_output)), log=list(job.log), benchmark=job.benchmark, wildcards=job.wildcards_dict, reason=str(self.dag.reason(job)), resources=job.resources_dict, priority="highest" if priority == Job.HIGHEST_PRIORITY else priority, threads=job.threads) if job.dynamic_output: logger.info("Subsequent jobs will be added dynamically " "depending on the output of this rule")
def d3dag(self, max_jobs=10000): def node(job): jobid = self.jobid(job) return { "id": jobid, "value": { "jobid": jobid, "label": job.rule.name, "rule": job.rule.name } } def edge(a, b): return {"u": self.jobid(a), "v": self.jobid(b)} jobs = list(self.jobs) if len(jobs) > max_jobs: logger.info( "Job-DAG is too large for visualization (>{} jobs).".format( max_jobs)) else: logger.d3dag(nodes=[node(job) for job in jobs], edges=[edge(dep, job) for job in jobs for dep in self.dependencies[job] if self.needrun(dep)])
def schedule(self): """ Schedule jobs that are ready, maximizing cpu usage. """ try: while True: # work around so that the wait does not prevent keyboard interrupts while not self._open_jobs.wait(1): pass # obtain needrun and running jobs in a thread-safe way with self._lock: needrun = list(self.open_jobs) running = list(self.running) # free the event self._open_jobs.clear() # handle errors if not self.keepgoing and self._errors: logger.info("Will exit after finishing " "currently running jobs.") if not running: self._executor.shutdown() logger.error(_ERROR_MSG_FINAL) return False continue # normal shutdown because all jobs have been finished if not needrun and not running: self._executor.shutdown() if self._errors: logger.error(_ERROR_MSG_FINAL) return not self._errors # continue if no new job needs to be executed if not needrun: continue logger.debug("Resources before job selection: {}".format( self.resources)) logger.debug("Ready jobs ({}):\n\t".format(len(needrun)) + "\n\t".join(map(str, needrun))) # select jobs by solving knapsack problem run = self.job_selector(needrun) logger.debug("Selected jobs ({}):\n\t".format(len(run)) + "\n\t".join(map(str, run))) # update running jobs with self._lock: self.running.update(run) logger.debug( "Resources after job selection: {}".format(self.resources)) # actually run jobs for job in run: self.run(job) except (KeyboardInterrupt, SystemExit): logger.info("Terminating processes on user request.") self._executor.cancel() with self._lock: running = list(self.running) for job in running: job.cleanup() return False
def download_from_remote(self): if self.is_remote and self.remote_object.exists(): logger.info("Downloading from remote: {}".format(self.file)) self.remote_object.download() else: raise RemoteFileException( "The file to be downloaded does not seem to exist remotely.")
def cleanup(self): """ Cleanup output files. """ to_remove = [f for f in self.expanded_output if f.exists] if to_remove: logger.info("Removing output files of failed job {}" " since they might be corrupted:\n{}".format( self, ", ".join(to_remove))) for f in to_remove: f.remove()
def finish_job(self, job): super().finish_job(job) self.stats.report_job_end(job) try: self.workflow.persistence.finished(job) except IOError as e: logger.info("Failed to remove marker file for job started " "({}). Please ensure write permissions for the " "directory {}".format(e, self.workflow.persistence.path))
def _error(self, job): """ Clear jobs and stop the workflow. """ with self._lock: self._errors = True self.running.remove(job) self.failed.add(job) self._free_resources(job) if self.keepgoing: logger.info("Job failed, going on with independent jobs.") self._open_jobs.set()
def _run(self, job, callback=None, error_callback=None): super()._run(job) self.stats.report_job_start(job) try: self.workflow.persistence.started(job) except IOError as e: logger.info( "Failed to set marker file for job started ({}). " "Snakemake will work, but cannot ensure that output files " "are complete in case of a kill signal or power loss. " "Please ensure write permissions for the " "directory {}".format(e, self.workflow.persistence.path))
def change_working_directory(directory=None): """ Change working directory in execution context if provided. """ if directory: try: saved_directory = os.getcwd() logger.info("Changing to shadow directory: {}".format(directory)) os.chdir(directory) yield finally: os.chdir(saved_directory) else: yield
def wait_for_files(files, latency_wait=3): """Wait for given files to be present in filesystem.""" files = list(files) get_missing = lambda: [f for f in files if not os.path.exists(f)] missing = get_missing() if missing: logger.info("Waiting at most {} seconds for missing files.".format( latency_wait)) for _ in range(latency_wait): if not get_missing(): return time.sleep(1) raise IOError("Missing files after {} seconds:\n{}".format( latency_wait, "\n".join(get_missing())))
def cleanup(self): """ Cleanup output files. """ to_remove = [f for f in self.expanded_output if f.exists] to_remove.extend([f for f in self.remote_input if f.exists]) to_remove.extend([f for f in self.remote_output if f.exists_local]) if to_remove: logger.info("Removing output files of failed job {}" " since they might be corrupted:\n{}".format( self, ", ".join(to_remove))) for f in to_remove: f.remove() self.rmdir_empty_remote_dirs()
def print_exception(ex, linemaps): """ Print an error message for a given exception. Arguments ex -- the exception linemaps -- a dict of a dict that maps for each snakefile the compiled lines to source code lines in the snakefile. """ tb = "Full " + "".join(traceback.format_exception(type(ex), ex, ex.__traceback__)) logger.debug(tb) if isinstance(ex, SyntaxError) or isinstance(ex, IndentationError): logger.error(format_error(ex, ex.lineno, linemaps=linemaps, snakefile=ex.filename, show_traceback=True)) return origin = get_exception_origin(ex, linemaps) if origin is not None: lineno, file = origin logger.error(format_error(ex, lineno, linemaps=linemaps, snakefile=file, show_traceback=True)) return elif isinstance(ex, TokenError): logger.error(format_error(ex, None, show_traceback=False)) elif isinstance(ex, MissingRuleException): logger.error(format_error(ex, None, linemaps=linemaps, snakefile=ex.filename, show_traceback=False)) elif isinstance(ex, RuleException): for e in ex._include + [ex]: if not e.omit: logger.error(format_error(e, e.lineno, linemaps=linemaps, snakefile=e.filename, show_traceback=True)) elif isinstance(ex, WorkflowError): logger.error(format_error(ex, ex.lineno, linemaps=linemaps, snakefile=ex.snakefile, show_traceback=True)) elif isinstance(ex, KeyboardInterrupt): logger.info("Cancelling snakemake on user request.") else: traceback.print_exception(type(ex), ex, ex.__traceback__)
def printjob(self, job): # skip dynamic jobs that will be "executed" only in dryrun mode if self.dag.dynamic(job): return def format_files(job, io, ruleio, dynamicio): for f in io: f_ = ruleio[f] if f in dynamicio: yield "{} (dynamic)".format(f_) else: yield f def format_ruleitem(name, value): return "" if not value else "\t{}: {}".format(name, value) desc = list() if not self.quiet: if job.message: desc.append(job.message) else: desc.append("{}rule {}:".format(self.rule_prefix(job), job.rule.name)) for name, value in ( ("input", ", ".join(format_files( job, job.input, job.ruleio, job.dynamic_input))), ("output", ", ".join(format_files( job, job.output, job.ruleio, job.dynamic_output))), ("log", job.log), ("reason", self.dag.reason(job) if self.printreason else None)): if value: desc.append(format_ruleitem(name, value)) priority = self.dag.priority(job) if priority > 1: desc.append(format_ruleitem( "priority", "highest" if priority == Job.HIGHEST_PRIORITY else priority)) if self.printthreads and job.threads > 1: desc.append(format_ruleitem("threads", job.threads)) if self.printshellcmds and job.shellcmd: desc.append(job.shellcmd) if desc: logger.info("\n".join(desc)) if job.dynamic_output: logger.warning("Subsequent jobs will be added dynamically " "depending on the output of this rule")
def remove(file, remove_non_empty_dir=False): if os.path.exists(file): if os.path.isdir(file): if remove_non_empty_dir: shutil.rmtree(file) else: try: os.removedirs(file) except OSError as e: # skip non-empty directories (errno 39 is ENOTEMPTY) if e.errno == 39: logger.info("Skipped removing non-empty directory {}".format(e.filename)) else: logger.warning(str(e)) else: os.remove(file)
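# Illustrative sketch (not from the source above): the errno 39 check in remove()
# is the Linux value of ENOTEMPTY; the errno module expresses the same guard
# portably. The helper name remove_empty_dir is hypothetical.
import errno
import os


def remove_empty_dir(path):
    """Remove a directory chain only if it is empty; skip non-empty directories."""
    try:
        os.removedirs(path)
    except OSError as e:
        if e.errno == errno.ENOTEMPTY:
            print("Skipped removing non-empty directory {}".format(e.filename))
        else:
            raise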
def include(self, snakefile, overwrite_first_rule=False, print_compilation=False, overwrite_shellcmd=None): """ Include a snakefile. """ # check if snakefile is a path to the filesystem if not urllib.parse.urlparse(snakefile).scheme: if not os.path.isabs(snakefile) and self.included_stack: current_path = os.path.dirname(self.included_stack[-1]) snakefile = os.path.join(current_path, snakefile) # Could still be an url if relative import was used if not urllib.parse.urlparse(snakefile).scheme: snakefile = os.path.abspath(snakefile) # else it could be an url. # at least we don't want to modify the path for clarity. if snakefile in self.included: logger.info("Multiple include of {} ignored".format(snakefile)) return self.included.append(snakefile) self.included_stack.append(snakefile) global workflow workflow = self first_rule = self.first_rule code, linemap, rulecount = parse(snakefile, overwrite_shellcmd=self.overwrite_shellcmd, rulecount=self._rulecount) self._rulecount = rulecount if print_compilation: print(code) # insert the current directory into sys.path # this allows to import modules from the workflow directory sys.path.insert(0, os.path.dirname(snakefile)) self.linemaps[snakefile] = linemap exec(compile(code, snakefile, "exec"), self.globals) if not overwrite_first_rule: self.first_rule = first_rule self.included_stack.pop()
def __init__( self, path, job, caption, env, category, workflow, wildcards_overwrite=None, mode_embedded=True, aux_files=None, name_overwrite=None, ): self.name_overwrite = name_overwrite self.mode_embedded = mode_embedded self.path = path self.target = os.path.basename(path) self.size = os.path.getsize(self.path) logger.info("Adding {} ({:.2g} MB).".format(self.name, self.size / 1e6)) self.raw_caption = caption self.mime, _ = mime_from_file(self.path) self.workflow = workflow h = hashlib.sha256() h.update(path.encode()) self.id = h.hexdigest() self.job = job self._wildcards = ( job.wildcards if wildcards_overwrite is None else wildcards_overwrite ) self.wildcards = logging.format_wildcards(self._wildcards) self.params = ( logging.format_dict(job.params).replace("\n", r"\n").replace('"', r"\"") ) self.category = category self.aux_files = aux_files or [] self.data_uri = self._data_uri() self.png_uri = self._png_uri()
def createSampleFileMapping(self, sample_annotation): """ create a sample file mapping with unique entries of existing files columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ] """ assay_mapping = {'RNA_ID': 'RNA_BAM_FILE', 'DNA_ID': 'DNA_VCF_FILE'} assay_subsets = [] for id_, file_type in assay_mapping.items(): df = sample_annotation[[id_, file_type]].drop_duplicates().copy() df.rename(columns={ id_: 'ID', file_type: 'FILE_PATH' }, inplace=True) df['ASSAY'] = id_ df['FILE_TYPE'] = file_type assay_subsets.append(df) file_mapping = pd.concat(assay_subsets) # cleaning SAMPLE_FILE_MAPPING file_mapping.dropna(inplace=True) existent = [ pathlib.Path(x).exists() for x in file_mapping["FILE_PATH"] ] if sum(existent) < file_mapping.shape[0]: logger.info( "WARNING: there are files in the sample annotation that do not exist" ) file_mapping = file_mapping[existent].drop_duplicates() if file_mapping.shape[0] == 0: raise ValueError( "No files exist in sample annotation. Please check your sample annotation." ) file_mapping.to_csv(self.getProcDataDir() + "/file_mapping.csv", index=False) return file_mapping
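# Toy illustration (hypothetical data, not part of the pipeline above) of the
# reshaping performed by createSampleFileMapping: each assay/file-type column pair
# becomes long-format rows with columns ID | FILE_PATH | ASSAY | FILE_TYPE.
import pandas as pd

sample_annotation = pd.DataFrame({
    "RNA_ID": ["s1", "s2"], "RNA_BAM_FILE": ["s1.bam", "s2.bam"],
    "DNA_ID": ["s1", "s2"], "DNA_VCF_FILE": ["cohort.vcf", "cohort.vcf"],
})
assay_mapping = {"RNA_ID": "RNA_BAM_FILE", "DNA_ID": "DNA_VCF_FILE"}
subsets = []
for id_, file_type in assay_mapping.items():
    df = sample_annotation[[id_, file_type]].drop_duplicates().copy()
    df = df.rename(columns={id_: "ID", file_type: "FILE_PATH"})
    df["ASSAY"] = id_
    df["FILE_TYPE"] = file_type
    subsets.append(df)
file_mapping = pd.concat(subsets)  # 4 rows: two RNA BAM entries, two DNA VCF entries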
def advanced_argument_conversion(arg_dict): """Experimental adjustment of sbatch arguments to the given or default partition.""" # Currently not adjusting for multiple node jobs nodes = int(arg_dict.get("nodes", 1)) if nodes > 1: return arg_dict partition = arg_dict.get("partition", None) or _get_default_partition() constraint = arg_dict.get("constraint", None) ncpus = int(arg_dict.get("cpus-per-task", 1)) runtime = arg_dict.get("time", None) config = _get_cluster_configuration(partition, constraint, arg_dict.get("mem", 0)) mem = arg_dict.get("mem", ncpus * min(config["MEMORY_PER_CPU"])) if mem > max(config["MEMORY"]): logger.info( f"requested memory ({mem}) > max memory ({max(config['MEMORY'])}); " "adjusting memory settings" ) mem = max(config["MEMORY"]) # Calculate available memory as defined by the number of requested # cpus times memory per cpu AVAILABLE_MEM = ncpus * min(config["MEMORY_PER_CPU"]) # Add additional cpus if memory is larger than AVAILABLE_MEM if mem > AVAILABLE_MEM: logger.info( f"requested memory ({mem}) > " f"ncpus x MEMORY_PER_CPU ({AVAILABLE_MEM}); " "trying to adjust number of cpus up" ) ncpus = int(math.ceil(mem / min(config["MEMORY_PER_CPU"]))) if ncpus > max(config["CPUS"]): logger.info( f"ncpus ({ncpus}) > available cpus ({max(config['CPUS'])}); " "adjusting number of cpus down" ) ncpus = min(int(max(config["CPUS"])), ncpus) adjusted_args = {"mem": int(mem), "cpus-per-task": ncpus} # Update time. If requested time is larger than maximum allowed time, reset if runtime: runtime = time_to_minutes(runtime) time_limit = max(config["TIMELIMIT_MINUTES"]) if runtime > time_limit: logger.info( f"time ({runtime}) > time limit ({time_limit}); " "adjusting time down" ) adjusted_args["time"] = time_limit # update and return arg_dict.update(adjusted_args) return arg_dict
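# Worked example with made-up partition numbers (not actual cluster values): if the
# partition reports MEMORY_PER_CPU = [2048] MB and CPUS = [16], then a request of
# mem=8000 MB with a single cpu exceeds 1 * 2048 MB, so the cpu count is raised to
# ceil(8000 / 2048) = 4, which is still within the 16-cpu limit.
import math

mem, memory_per_cpu, max_cpus = 8000, 2048, 16
ncpus = 1
if mem > ncpus * memory_per_cpu:
    ncpus = min(math.ceil(mem / memory_per_cpu), max_cpus)
assert ncpus == 4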
def unshadow_output(self, job): """ Move files from shadow directory to real output paths. """ if not job.shadow_dir or not job.expanded_output: return for real_output in chain(job.expanded_output, job.log): shadow_output = job.shadowed_path(real_output).file # Remake absolute symlinks as relative if os.path.islink(shadow_output): dest = os.readlink(shadow_output) if os.path.isabs(dest): rel_dest = os.path.relpath(dest, job.shadow_dir) os.remove(shadow_output) os.symlink(rel_dest, shadow_output) if os.path.realpath(shadow_output) == os.path.realpath( real_output): continue logger.info("Moving shadow output {} to destination {}".format( shadow_output, real_output)) shutil.move(shadow_output, real_output) shutil.rmtree(job.shadow_dir)
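# Minimal standalone sketch (hypothetical helper, not part of the class above) of
# the symlink rewrite used in unshadow_output: an absolute link target inside the
# shadow directory is re-expressed relative to that directory before the file is
# moved out.
import os


def relink_relative(link_path, shadow_dir):
    dest = os.readlink(link_path)
    if os.path.isabs(dest):
        rel_dest = os.path.relpath(dest, shadow_dir)
        os.remove(link_path)
        os.symlink(rel_dest, link_path)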
def pull(self, dryrun=False): if self.is_local: return if dryrun: logger.info("Singularity image {} will be pulled.".format( self.url)) return logger.debug("Singularity image location: {}".format(self.path)) if not os.path.exists(self.path): logger.info("Pulling singularity image {}.".format(self.url)) try: p = subprocess.check_output([ "singularity", "pull", "--name", "{}.simg".format( self.hash), self.url ], cwd=self._img_dir, stderr=subprocess.STDOUT) except subprocess.CalledProcessError as e: raise WorkflowError("Failed to pull singularity image " "from {}:\n{}".format( self.url, e.stdout.decode()))
def handle_temp(self, job): """ Remove temp files if they are no longer needed. """ if self.notemp: return needed = lambda job_, f: any( f in files for j, files in self.depending[job_].items() if not self.finished(j) and self.needrun(j) and j != job) def unneeded_files(): for job_, files in self.dependencies[job].items(): for f in job_.temp_output & files: if not needed(job_, f): yield f for f in filterfalse(partial(needed, job), job.temp_output): if not f in self.targetfiles: yield f for f in unneeded_files(): logger.info("Removing temporary output file {}.".format(f)) f.remove(remove_non_empty_dir=True)
def fastqc(): # input: expand("data/xenograft-20201201/fastq/{{fq_id}}_{rg}.fq.gz", rg=[1, 2]) # output: expand("results/20201130-week-49/xenograft/fastqc/{{fq_id}}_{rg}_fastqc.{ext}", rg=[1, 2], ext=["html", "zip"]) with TemporaryDirectory() as tempdir: job_label = snakemake.params.get("label") if job_label: logger.info(f"Job started: {job_label}") logger.info(f"Using temporary directory: {tempdir}") cmd = f"fastqc -t {snakemake.threads} -o {tempdir} -f fastq {snakemake.input}" logger.info(cmd) shell(cmd) destdir = os.path.dirname(snakemake.output[0]) logger.info(f"Copying to destination: {destdir}") for f in os.listdir(tempdir): if f.endswith("fastqc.html") or f.endswith("fastqc.zip"): logger.info(f"Copying {f}") full_path = os.path.join(tempdir, f) shell("mv {full_path} {destdir}")
def _handle_error(self, job): """Clear jobs and stop the workflow. If Snakemake is configured to restart jobs then the job might have "restart_times" left and we just decrement and let the scheduler try to run the job again. """ self.get_executor(job).handle_job_error(job) self.running.remove(job) self._free_resources(job) # attempt starts counting from 1, but the first attempt is not # a restart, hence we subtract 1. if job.restart_times > job.attempt - 1: logger.info("Trying to restart job {}.".format(self.dag.jobid(job))) job.attempt += 1 else: self._errors = True self.failed.add(job) if self.keepgoing: logger.info("Job failed, going on with independent jobs.") self._open_jobs.release()
def handle_temp(self, job): """ Remove temp files if they are no longer needed. """ if self.notemp: return needed = lambda job_, f: any( f in files for j, files in self.depending[job_].items() if not self.finished(j) and self.needrun(j) and j != job) def unneeded_files(): for job_, files in self.dependencies[job].items(): for f in job_.temp_output & files: if not needed(job_, f): yield f for f in filterfalse(partial(needed, job), job.temp_output): if not f in self.targetfiles: yield f for f in unneeded_files(): logger.info("Removing temporary output file {}.".format(f)) f.remove()
def migrate_v1_to_v2(self): logger.info("Migrating .snakemake folder to new format...") i = 0 for path, _, filenames in os.walk(self._metadata_path): path = Path(path) for filename in filenames: with open(path / filename, "r") as f: try: record = json.load(f) except json.JSONDecodeError: continue # not a properly formatted JSON file if record.get("incomplete", False): target_path = Path( self._incomplete_path) / path.relative_to( self._metadata_path) os.makedirs(target_path, exist_ok=True) shutil.copyfile( path / filename, target_path / filename, ) i += 1 # this can take a while for large folders... if (i % 10000) == 0 and i > 0: logger.info("{} files migrated".format(i)) logger.info("Migration complete")
def updateParamFiles(self,path,filename,sa_df,param_cols,ID,include): """ path: string. path to where to write the param files (separate from filename to build path if not existing) filename: string. name of file to write in path param_cols: list. list of sample annotation columns to use to determine parameters for that job ID: list. list containing the identifier for the sa_col. either the [sample name] or the [samples in drop group] include: boolean. True- include all of the columns in param_cols to build param file. False- use all other columns in SA """ # build the path to the param file path.mkdir(parents = True,exist_ok = True) param_cols = [col for col in param_cols if col in sa_df.columns] # remove params that are not columns in SA table # take the complement of columns if indicated by !include if not include: param_cols = [col for col in sa_df.columns if col not in param_cols] # designate the TEMP and final param file names true_filename = "{path}/{filename}".format(path = path, filename = filename) # if a file by the desired name exists. if os.path.isfile(true_filename): # replace any strings of nan with "NA" current_SA = sa_df.loc[sa_df["RNA_ID"].isin(ID),param_cols].reset_index(drop = True) current_SA = current_SA.replace("nan","NA").fillna(value = "NA") old_SA = pd.read_csv(true_filename).reset_index(drop = True).fillna(value = "NA") if current_SA.equals(old_SA): pass else: # if they're different remove the existing file and rename TEMP to the desired file. Updating to the current SA table logger.info("{} Param Files do not match. Updating to current Sample Annotation\n".format(filename)) current_SA.to_csv(true_filename, index = False,header = True,na_rep = "NA") # if the param file doesn't exist, just write to the desired file else: logger.info("{} Param File did not already exist. Writing it\n".format(filename)) sa_df.loc[sa_df["RNA_ID"].isin(ID),param_cols].to_csv(true_filename, index = False,header = True,na_rep = "NA")
def wait_for_files(files, latency_wait=3, force_stay_on_remote=False, ignore_pipe=False): """Wait for given files to be present in filesystem.""" files = list(files) def get_missing(): return [ f for f in files if not (f.exists_remote if (isinstance(f, _IOFile) and f.is_remote and (force_stay_on_remote or f.should_stay_on_remote)) else os.path.exists(f) if not (is_flagged(f, "pipe") and ignore_pipe) else True)] missing = get_missing() if missing: logger.info("Waiting at most {} seconds for missing files.".format( latency_wait)) for _ in range(latency_wait): if not get_missing(): return time.sleep(1) raise IOError("Missing files after {} seconds:\n{}".format( latency_wait, "\n".join(get_missing())))
def _error(self, job): """Clear jobs and stop the workflow. If Snakemake is configured to restart jobs then the job might have "restart_times" left and we just decrement and let the scheduler try to run the job again. """ with self._lock: self.running.remove(job) self._free_resources(job) self._open_jobs.set() if job.restart_times > 0: msg = (("Trying to restart job for rule {} with " "wildcards {}").format(job.rule.name, job.wildcards_dict)) logger.info(msg) job.restart_times -= 1 else: self._errors = True self.failed.add(job) if self.keepgoing: logger.info("Job failed, going on with independent jobs.")
def run(self, job, callback=None, submit_callback=None, error_callback=None): super()._run(job) jobscript = self.get_jobscript(job) self.spawn_jobscript(job, jobscript) try: drmaa_args = job.format_wildcards( self.drmaa_args, cluster=self.cluster_wildcards(job)) except AttributeError as e: raise WorkflowError(str(e), rule=job.rule) import drmaa try: jt = self.session.createJobTemplate() jt.remoteCommand = jobscript jt.nativeSpecification = drmaa_args jt.jobName = os.path.basename(jobscript) jobid = self.session.runJob(jt) except (drmaa.errors.InternalException, drmaa.errors.InvalidAttributeValueException) as e: print_exception(WorkflowError("DRMAA Error: {}".format(e)), self.workflow.linemaps) error_callback(job) return logger.info("Submitted DRMAA job (jobid {})".format(jobid)) self.submitted.append(jobid) self.session.deleteJobTemplate(jt) submit_callback(job) with self.lock: self.active_jobs.append( DRMAAClusterJob(job, jobid, callback, error_callback, jobscript))
def cleanup(self): """ Cleanup output files. """ to_remove = [f for f in self.expanded_output if f.exists] to_remove.extend( [ f for f in self.remote_output if ( f.exists_remote if (f.is_remote and f.should_stay_on_remote) else f.exists_local ) ] ) if to_remove: logger.info( "Removing output files of failed job {}" " since they might be corrupted:\n{}".format(self, ", ".join(to_remove)) ) for f in to_remove: f.remove()
def subsetGroups(self, ids_by_group, subset_groups, warn=30, error=10): if subset_groups is None: subset = ids_by_group else: subset_groups = [ subset_groups ] if subset_groups.__class__ == str else subset_groups subset = { gr: ids for gr, ids in ids_by_group.items() if gr in subset_groups } for group in subset_groups: if len(subset[group]) < error: raise ValueError( f'Too few IDs in DROP_GROUP {group}, please ensure that it has at least {error} IDs' ) elif len(subset[group]) < warn: logger.info( f'WARNING: Less than {warn} IDs in DROP_GROUP {group}') return subset
def execute_script(self, fname, edit=None): import nbformat fname_out = self.log.get("notebook", None) if fname_out is None or edit: output_parameter = "" else: fname_out = os.path.join(os.getcwd(), fname_out) output_parameter = "--output {fname_out:q}" if edit is not None: logger.info("Opening notebook for editing.") cmd = ( "jupyter notebook --browser ':' --no-browser --log-level ERROR --ip {edit.ip} --port {edit.port} " "--NotebookApp.quit_button=True {{fname:q}}".format(edit=edit)) else: cmd = ( "jupyter-nbconvert --log-level ERROR --execute {output_parameter} " "--to notebook --ExecutePreprocessor.timeout=-1 {{fname:q}}". format(output_parameter=output_parameter)) if ON_WINDOWS: fname = fname.replace("\\", "/") fname_out = fname_out.replace("\\", "/") if fname_out else fname_out self._execute_cmd(cmd, fname_out=fname_out, fname=fname) if edit: logger.info("Saving modified notebook.") nb = nbformat.read(fname, as_version=4) self.remove_preamble_cell(nb) # clean up all outputs for cell in nb["cells"]: cell["outputs"] = [] nbformat.write(nb, self.local_path)
def code(self): try: from pygments.lexers import get_lexer_by_name from pygments.formatters import HtmlFormatter from pygments import highlight import pygments.util except ImportError: raise WorkflowError( "Python package pygments must be installed to create reports." ) source, language = None, None if self._rule.shellcmd is not None: source = self._rule.shellcmd language = "bash" elif self._rule.script is not None: logger.info("Loading script code for rule {}".format(self.name)) _, source, language = script.get_source( self._rule.script, self._rule.basedir ) source = source.decode() elif self._rule.wrapper is not None: logger.info("Loading wrapper code for rule {}".format(self.name)) _, source, language = script.get_source( wrapper.get_script( self._rule.wrapper, prefix=self._rule.workflow.wrapper_prefix ) ) source = source.decode() try: lexer = get_lexer_by_name(language) return highlight( source, lexer, HtmlFormatter(linenos=True, cssclass="source", wrapcode=True), ) except pygments.util.ClassNotFound: return "<pre><code>{}</code></pre>".format(source)
def get_config(config_dir, config_name): """ # search order: config_some_profile_name|variant.yaml config_some_profile|variant.yaml config_some|variant.yaml config|variant.yaml config_some_profile_name.yaml config_some_profile.yaml config_some.yaml config.yaml """ config_dict = {} if os.path.isdir(os.path.join(config_dir, 'configs')): config_dir = os.path.join(config_dir, 'configs') try_config_name = copy(config_name) keep_variant = True while not config_dict: try: config_dict = load_config_file(os.path.join(config_dir, try_config_name)) except FileNotFoundError: parent_config_name = get_parent_config_name(try_config_name) if parent_config_name: logger.info(f"{try_config_name} not found, " f"trying {parent_config_name}") try_config_name = parent_config_name else: if keep_variant: name, ext = os.path.splitext(config_name) try_config_name = name.split('|')[0] + ext else: raise FileNotFoundError("No corresponding config file found!") return config_dict
def installRPackages(config: DropConfig = None): logger.info("check for missing R packages") script = Path(drop.__file__).parent / "installRPackages.R" requirements = Path(drop.__file__).parent / 'requirementsR.txt' # install main packages response = subprocess.run(["Rscript", script, requirements], stderr=subprocess.STDOUT) response.check_returncode() # install pipeline depending packages if config is not None: pkg_assembly_name = config.getBSGenomeName() response = subprocess.run(["Rscript", script, pkg_assembly_name], stderr=subprocess.STDOUT) response.check_returncode() pkg_mafdb_name = config.getMafDbName() if pkg_mafdb_name is not None and config.get("mae").get( 'addAF') is True: response = subprocess.run(["Rscript", script, pkg_mafdb_name], stderr=subprocess.STDOUT) response.check_returncode()
def bam_stats(): with TemporaryDirectory() as tempdir: job_label = snakemake.params.get("label") if job_label: logger.info(f"Job started: {job_label}") logger.info(f"Using temporary directory: {tempdir}") nthreads = snakemake.threads total_mem = snakemake.resources.mem_mb logger.info(f"# of cores available: {nthreads}") logger.info(f"Total memory available: {total_mem}MB") shell("samtools stats {snakemake.input.bam} > {tempdir}/stats.txt") shell( "samtools idxstats {snakemake.input.bam} > {tempdir}/idxstats.txt") shell( "samtools flagstat {snakemake.input.bam} > {tempdir}/flagstats.txt" ) logger.info("Copying to destination") shell("mv {tempdir}/stats.txt {snakemake.output.stats}") shell("mv {tempdir}/idxstats.txt {snakemake.output.idxstats}") shell("mv {tempdir}/flagstats.txt {snakemake.output.flagstats}")
def store(self, job: Job): """ Store generated job output in the cache. """ with TemporaryDirectory(dir=self.path) as tmpdirname: tmpdir = Path(tmpdirname) for outputfile, cachefile in self.get_outputfiles_and_cachefiles( job): self.check_writeable(cachefile) logger.info( "Moving output file {} to cache.".format(outputfile)) tmp = tmpdir / cachefile.name # First move is performed into a tempdir (it might involve a copy if not on the same FS). # This is important, such that network filesystem latency # does not lead to concurrent writes to the same file. # We can use the plain copy method of shutil, because we do not care about the metadata. shutil.move(outputfile, tmp, copy_function=shutil.copy) # make readable/writeable for all os.chmod( tmp, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH, ) # Move to the actual path (now we are on the same FS, hence move is atomic). # Here we use the default copy function, also copying metadata (which is important here). # It will always work, because we are guaranteed to be in the same FS. shutil.move(tmp, cachefile) # now restore the outputfile via a symlink self.symlink(cachefile, outputfile, utime=False)
def store(self, job: Job): """ Store generated job output in the cache. """ if not os.access(self.path, os.W_OK): raise WorkflowError( "Cannot access cache location {}. Please ensure that " "it is present and writeable.".format(self.path)) with TemporaryDirectory(dir=self.path) as tmpdirname: tmpdir = Path(tmpdirname) for outputfile, cachefile in self.get_outputfiles_and_cachefiles( job): if not os.path.exists(outputfile): raise WorkflowError( "Cannot move output file {} to cache. It does not exist " "(maybe it was not created by the job?).".format(outputfile)) self.check_writeable(cachefile) logger.info( "Moving output file {} to cache.".format(outputfile)) tmp = tmpdir / cachefile.name # First move is performed into a tempdir (it might involve a copy if not on the same FS). # This is important, such that network filesystem latency # does not lead to concurrent writes to the same file. # We can use the plain copy method of shutil, because we do not care about the metadata. shutil.move(outputfile, tmp, copy_function=shutil.copy) self.set_permissions(tmp) # Move to the actual path (now we are on the same FS, hence move is atomic). # Here we use the default copy function, also copying metadata (which is important here). # It will always work, because we are guaranteed to be in the same FS. shutil.move(tmp, cachefile) # now restore the outputfile via a symlink self.symlink(cachefile, outputfile, utime=False)
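# Sketch of the two-step move idiom used by store() above: stage the file into a
# temporary directory created inside the cache (this hop may be a cross-filesystem
# copy), then move it to its final name within the same filesystem, where the move
# degrades to an atomic rename. Names and paths here are hypothetical.
import shutil
from pathlib import Path
from tempfile import TemporaryDirectory


def publish_atomically(src, cache_dir, name):
    cache = Path(cache_dir)
    with TemporaryDirectory(dir=cache) as staging:
        tmp = Path(staging) / name
        shutil.move(src, tmp, copy_function=shutil.copy)  # may cross filesystems
        shutil.move(tmp, cache / name)  # same filesystem: effectively a rename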
def createSampleFileMapping(self): """ create a sample file mapping with unique entries of existing files columns: [ID | ASSAY | FILE_TYPE | FILE_PATH ] """ assay_mapping = {'RNA_ID': ['RNA_BAM_FILE', 'GENE_COUNTS_FILE'], 'DNA_ID': ['DNA_VCF_FILE']} assay_subsets = [] for id_, file_types in assay_mapping.items(): for file_type in file_types: df = self.annotationTable[[id_, file_type]].dropna().drop_duplicates().copy() df.rename(columns={id_: 'ID', file_type: 'FILE_PATH'}, inplace=True) df['ASSAY'] = id_ df['FILE_TYPE'] = file_type assay_subsets.append(df) file_mapping = pd.concat(assay_subsets) # cleaning SAMPLE_FILE_MAPPING file_mapping.dropna(inplace=True) file_mapping.drop_duplicates(inplace=True) # check for missing files existing = utils.checkFileExists(file_mapping["FILE_PATH"]) if len(existing) == 0: message = "File mapping is empty. " message += "Please check that all files in your sample annotation exist." raise FileNotFoundError(message) elif len(existing) < file_mapping.shape[0]: missing = set(file_mapping["FILE_PATH"]) - set(existing) logger.info(f"WARNING: {len(missing)} files missing in samples annotation. Ignoring...") logger.debug(f"Missing files: {missing}") file_mapping = file_mapping[file_mapping["FILE_PATH"].isin(existing)] # write file mapping file_mapping.to_csv(self.root / "file_mapping.csv", index=False) return file_mapping
def test_wrappers(args_dict): """""" # Cleanup data and leave if args_dict["clean_output"]: logger.info("Removing output data") for wrapper_name in args_dict["wrappers"]: wrapper_workdir = os.path.join(args_dict["workdir"], wrapper_name) shutil.rmtree(wrapper_workdir, ignore_errors=True) sys.exit() # Test wrappers for wrapper_name in args_dict["wrappers"]: logger.warning("Testing Wrapper {}".format(wrapper_name)) try: snakefile = get_snakefile_fn(workflow_dir=WRAPPER_DIR, workflow=wrapper_name) wrapper_workdir = os.path.join(args_dict["workdir"], wrapper_name) logger.debug("Working in directory: {}".format(wrapper_workdir)) #Run Snakemake through the API snakemake(snakefile=snakefile, workdir=wrapper_workdir, config={"data_dir": DATA_DIR}, wrapper_prefix=WRAPPER_PREFIX, use_conda=True, cores=args_dict["cores"], verbose=args_dict["verbose"], quiet=args_dict["quiet"]) finally: logger.debug("List of file generated: {}".format( os.listdir(wrapper_workdir))) shutil.rmtree(os.path.join(wrapper_workdir, ".snakemake"), ignore_errors=True) if not args_dict["keep_output"]: logger.debug("Removing temporary directory") shutil.rmtree(wrapper_workdir, ignore_errors=True)
def query_all_division(args): """collect info from all divisions into a dataframe indexed on organism name """ BAC_REPLACE = [(' sp ', ' sp. '), (' pv ', ' pv. '), (' str ', ' str. '), (' subsp ', ' subsp. '), ('(', ''), (')', '')] if args.kingdoms and not isinstance(args.kingdoms, (list, tuple)): args.kingdoms = args.kingdoms.split(',') else: args.kingdoms = _ens_rest_query(ext="/info/divisions?") SP = {} for k in args.kingdoms: species = _ens_rest_query(ext='/info/species?', params={'division': k})['species'] logger.info('adding {} species from {}'.format(len(species), k)) for s in species: if k == 'EnsemblBacteria': for old, new in BAC_REPLACE: s['name'] = s['name'].replace(old, new) SP[s['name']] = s df = pd.DataFrame.from_dict(SP, orient='index') return df
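# Tiny illustration (hypothetical organism name) of the chained replacements above,
# applied cumulatively to the same string as in the corrected loop:
name = "Bacillus sp (strain X) str Y"
for old, new in [(' sp ', ' sp. '), (' pv ', ' pv. '), (' str ', ' str. '),
                 (' subsp ', ' subsp. '), ('(', ''), (')', '')]:
    name = name.replace(old, new)
assert name == "Bacillus sp. strain X str. Y"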
def setupTempFiles(config): # create temporary directory if not TMP_DIR.exists(): logger.info(f"create temporary files directory {TMP_DIR}") TMP_DIR.mkdir(parents=True) # save config file CONF_FILE = getConfFile(config) with open(CONF_FILE, 'w') as f: yaml.safe_dump(config.copy(), f) done_files = {} for method in METHODS.keys(): # final rule output file done_file = getMethodPath(method, type_='final_file', str_=False) done_files[method] = str(done_file) # create module tmp Dir if missing tmp_dir = getMethodPath(method, type_='tmp_dir', str_=False) if not tmp_dir.exists(): tmp_dir.mkdir(parents=True) return TMP_DIR, CONF_FILE, done_files
def installRPackages(): logger.info("check for missing R packages") script = pathlib.Path(drop.__file__).parent / "installRPackages.R" requirements = pathlib.Path(drop.__file__).parent / 'requirementsR.txt' #packages = [x.strip().split("#")[0] for x in open(requirements, 'r')] #packages = [x for x in packages if x != ''] #for package in packages: # logger.info(f"check {package}") call = subprocess.Popen(["Rscript", script, requirements], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) # check output for errors stdout, stderr = call.communicate() if stderr: print(stderr) exit(1) stdout = stdout.decode() ep = re.compile("Execution halted|^ERROR", re.M) if ep.search(stdout): print(stdout) exit(1)
def remove(file, remove_non_empty_dir=False): if file.is_remote and file.should_stay_on_remote: if file.exists_remote: file.remote_object.remove() elif os.path.isdir(file) and not os.path.islink(file): if remove_non_empty_dir: shutil.rmtree(file) else: try: os.removedirs(file) except OSError as e: # skip non empty directories if e.errno == 39: logger.info("Skipped removing non-empty directory {}".format(e.filename)) else: logger.warning(str(e)) #Remember that dangling symlinks fail the os.path.exists() test, but #we definitely still want to zap them. try/except is the safest way. #Also, we don't want to remove the null device if it is an output. elif os.devnull != str(file): try: os.remove(file) except FileNotFoundError: pass
def writeDependencyFile(): """ Entry point for writing .wBuild.depend. """ #if not wbuildVersionIsCurrent(): # print(bcolors.WARNING + "Version of the project's static .wBuild lib is not the same as the dynamically loaded " # "wBuild" # "version. It is strongly recommended to update .wBuild lib using \'wbuild update\'; " # "otherwise, the consistency of the build can not be guaranteed." + bcolors.ENDC) logger.info("Structuring dependencies...") conf = Config() htmlOutputPath = conf.get("htmlOutputPath") logger.debug("Loaded config.\n html output path (key htmlOutputPath): " + htmlOutputPath + "\n") scriptsPath = conf.get("scriptsPath") wbData = parseWBInfosFromRFiles(script_dir=scriptsPath, htmlPath=htmlOutputPath) mdData = parseMDFiles(script_dir=scriptsPath, htmlPath=htmlOutputPath) dependFile = tempfile.NamedTemporaryFile('w', delete=False) with dependFile as f: #start off with the header f.write('######\n') f.write('#This is a autogenerated snakemake file by wBuild\n') f.write('#wBuild by Leonhard Wachutka\n') f.write('######\n') # write rules for r in wbData: writeRule(r, f) # write md rules for r in mdData: writeMdRule(r, f) # write build index rule writeIndexRule(wbData, mdData, f) logger.info("Dependencies file generated.\n") return (dependFile.name)
def lock_warn_only(self): if self.locked: logger.info( "Error: Directory cannot be locked. This usually " "means that another Snakemake instance is running on this directory. " "Another possibility is that a previous run exited unexpectedly.")
def cancel(self): logger.info("Will exit after finishing currently running jobs.") self.shutdown()
def handle_protected(self, job): """ Write-protect output files that are marked with protected(). """ for f in job.expanded_output: if f in job.protected_output: logger.info("Write-protecting output file {}.".format(f)) f.protect()
def handle_touch(self, job): """ Touches those output files that are marked for touching. """ for f in job.expanded_output: if f in job.touch_output: logger.info("Touching output file {}.".format(f)) f.touch_or_create()
def progress(self): """ Display the progress. """ logger.info("{} of {} steps ({:.0%}) done".format(self.finished_jobs, len(self.dag), self.finished_jobs / len(self.dag)))
def execute(self, targets=None, dryrun=False, touch=False, cores=1, nodes=1, local_cores=1, forcetargets=False, forceall=False, forcerun=None, prioritytargets=None, quiet=False, keepgoing=False, printshellcmds=False, printreason=False, printdag=False, cluster=None, cluster_config=None, cluster_sync=None, jobname=None, immediate_submit=False, ignore_ambiguity=False, printrulegraph=False, printd3dag=False, drmaa=None, stats=None, force_incomplete=False, ignore_incomplete=False, list_version_changes=False, list_code_changes=False, list_input_changes=False, list_params_changes=False, summary=False, detailed_summary=False, latency_wait=3, benchmark_repeats=3, wait_for_files=None, nolock=False, unlock=False, resources=None, notemp=False, nodeps=False, cleanup_metadata=None, subsnakemake=None, updated_files=None, keep_target_files=False, allowed_rules=None, greediness=1.0, no_hooks=False): self.global_resources = dict() if resources is None else resources self.global_resources["_cores"] = cores self.global_resources["_nodes"] = nodes def rules(items): return map(self._rules.__getitem__, filter(self.is_rule, items)) if keep_target_files: def files(items): return filterfalse(self.is_rule, items) else: def files(items): return map(os.path.relpath, filterfalse(self.is_rule, items)) if not targets: targets = [self.first_rule ] if self.first_rule is not None else list() if prioritytargets is None: prioritytargets = list() if forcerun is None: forcerun = list() priorityrules = set(rules(prioritytargets)) priorityfiles = set(files(prioritytargets)) forcerules = set(rules(forcerun)) forcefiles = set(files(forcerun)) targetrules = set(chain(rules(targets), filterfalse(Rule.has_wildcards, priorityrules), filterfalse(Rule.has_wildcards, forcerules))) targetfiles = set(chain(files(targets), priorityfiles, forcefiles)) if forcetargets: forcefiles.update(targetfiles) forcerules.update(targetrules) rules = self.rules if allowed_rules: rules = [rule for rule in rules if rule.name in set(allowed_rules)] if wait_for_files is not None: try: snakemake.io.wait_for_files(wait_for_files, latency_wait=latency_wait) except IOError as e: logger.error(str(e)) return False dag = DAG( self, rules, dryrun=dryrun, targetfiles=targetfiles, targetrules=targetrules, forceall=forceall, forcefiles=forcefiles, forcerules=forcerules, priorityfiles=priorityfiles, priorityrules=priorityrules, ignore_ambiguity=ignore_ambiguity, force_incomplete=force_incomplete, ignore_incomplete=ignore_incomplete or printdag or printrulegraph, notemp=notemp) self.persistence = Persistence( nolock=nolock, dag=dag, warn_only=dryrun or printrulegraph or printdag or summary or list_version_changes or list_code_changes or list_input_changes or list_params_changes) if cleanup_metadata: for f in cleanup_metadata: self.persistence.cleanup_metadata(f) return True dag.init() dag.check_dynamic() if unlock: try: self.persistence.cleanup_locks() logger.info("Unlocking working directory.") return True except IOError: logger.error("Error: Unlocking the directory {} failed. Maybe " "you don't have the permissions?") return False try: self.persistence.lock() except IOError: logger.error( "Error: Directory cannot be locked. Please make " "sure that no other Snakemake process is trying to create " "the same files in the following directory:\n{}\n" "If you are sure that no other " "instances of snakemake are running on this directory, " "the remaining lock was likely caused by a kill signal or " "a power loss. 
It can be removed with " "the --unlock argument.".format(os.getcwd())) return False if self.subworkflows and not printdag and not printrulegraph: # backup globals globals_backup = dict(self.globals) # execute subworkflows for subworkflow in self.subworkflows: subworkflow_targets = subworkflow.targets(dag) updated = list() if subworkflow_targets: logger.info( "Executing subworkflow {}.".format(subworkflow.name)) if not subsnakemake(subworkflow.snakefile, workdir=subworkflow.workdir, targets=subworkflow_targets, updated_files=updated): return False dag.updated_subworkflow_files.update(subworkflow.target(f) for f in updated) else: logger.info("Subworkflow {}: Nothing to be done.".format( subworkflow.name)) if self.subworkflows: logger.info("Executing main workflow.") # rescue globals self.globals.update(globals_backup) dag.check_incomplete() dag.postprocess() if nodeps: missing_input = [f for job in dag.targetjobs for f in job.input if dag.needrun(job) and not os.path.exists(f)] if missing_input: logger.error( "Dependency resolution disabled (--nodeps) " "but missing input " "files detected. If this happens on a cluster, please make sure " "that you handle the dependencies yourself or turn of " "--immediate-submit. Missing input files:\n{}".format( "\n".join(missing_input))) return False updated_files.extend(f for job in dag.needrun_jobs for f in job.output) if printd3dag: dag.d3dag() return True elif printdag: print(dag) return True elif printrulegraph: print(dag.rule_dot()) return True elif summary: print("\n".join(dag.summary(detailed=False))) return True elif detailed_summary: print("\n".join(dag.summary(detailed=True))) return True elif list_version_changes: items = list( chain(*map(self.persistence.version_changed, dag.jobs))) if items: print(*items, sep="\n") return True elif list_code_changes: items = list(chain(*map(self.persistence.code_changed, dag.jobs))) if items: print(*items, sep="\n") return True elif list_input_changes: items = list(chain(*map(self.persistence.input_changed, dag.jobs))) if items: print(*items, sep="\n") return True elif list_params_changes: items = list( chain(*map(self.persistence.params_changed, dag.jobs))) if items: print(*items, sep="\n") return True scheduler = JobScheduler(self, dag, cores, local_cores=local_cores, dryrun=dryrun, touch=touch, cluster=cluster, cluster_config=cluster_config, cluster_sync=cluster_sync, jobname=jobname, immediate_submit=immediate_submit, quiet=quiet, keepgoing=keepgoing, drmaa=drmaa, printreason=printreason, printshellcmds=printshellcmds, latency_wait=latency_wait, benchmark_repeats=benchmark_repeats, greediness=greediness) if not dryrun and not quiet: if len(dag): if cluster or cluster_sync or drmaa: logger.resources_info( "Provided cluster nodes: {}".format(nodes)) else: logger.resources_info("Provided cores: {}".format(cores)) logger.resources_info("Rules claiming more threads will be scaled down.") provided_resources = format_resources(resources) if provided_resources: logger.resources_info( "Provided resources: " + provided_resources) ignored_resources = format_resource_names( set(resource for job in dag.needrun_jobs for resource in job.resources_dict if resource not in resources)) if ignored_resources: logger.resources_info( "Ignored resources: " + ignored_resources) logger.run_info("\n".join(dag.stats())) else: logger.info("Nothing to be done.") if dryrun and not len(dag): logger.info("Nothing to be done.") success = scheduler.schedule() if success: if dryrun: if not quiet and len(dag): 
logger.run_info("\n".join(dag.stats())) elif stats: scheduler.stats.to_json(stats) if not dryrun and not no_hooks: self._onsuccess(logger.get_logfile()) return True else: if not dryrun and not no_hooks: self._onerror(logger.get_logfile()) return False
def list_resources(self): for resource in set( resource for rule in self.rules for resource in rule.resources): if resource not in "_cores _nodes".split(): logger.info(resource)
def upload_to_remote(self): if self.is_remote: logger.info("Uploading to remote: {}".format(self.file)) self.remote_object.upload()
def snakemake(snakefile, listrules=False, list_target_rules=False, cores=1, nodes=1, local_cores=1, resources=dict(), config=dict(), configfile=None, config_args=None, workdir=None, targets=None, dryrun=False, touch=False, forcetargets=False, forceall=False, forcerun=[], prioritytargets=[], stats=None, printreason=False, printshellcmds=False, printdag=False, printrulegraph=False, printd3dag=False, nocolor=False, quiet=False, keepgoing=False, cluster=None, cluster_config=None, cluster_sync=None, drmaa=None, jobname="snakejob.{rulename}.{jobid}.sh", immediate_submit=False, standalone=False, ignore_ambiguity=False, snakemakepath=None, lock=True, unlock=False, cleanup_metadata=None, force_incomplete=False, ignore_incomplete=False, list_version_changes=False, list_code_changes=False, list_input_changes=False, list_params_changes=False, list_resources=False, summary=False, detailed_summary=False, latency_wait=3, benchmark_repeats=1, wait_for_files=None, print_compilation=False, debug=False, notemp=False, nodeps=False, keep_target_files=False, allowed_rules=None, jobscript=None, timestamp=False, greediness=None, no_hooks=False, overwrite_shellcmd=None, updated_files=None, log_handler=None, keep_logger=False, verbose=False): """Run snakemake on a given snakefile. This function provides access to the whole snakemake functionality. It is not thread-safe. Args: snakefile (str): the path to the snakefile listrules (bool): list rules (default False) list_target_rules (bool): list target rules (default False) cores (int): the number of provided cores (ignored when using cluster support) (default 1) nodes (int): the number of provided cluster nodes (ignored without cluster support) (default 1) local_cores (int): the number of provided local cores if in cluster mode (ignored without cluster support) (default 1) resources (dict): provided resources, a dictionary assigning integers to resource names, e.g. {gpu=1, io=5} (default {}) config (dict): override values for workflow config workdir (str): path to working directory (default None) targets (list): list of targets, e.g. rule or file names (default None) dryrun (bool): only dry-run the workflow (default False) touch (bool): only touch all output files if present (default False) forcetargets (bool): force given targets to be re-created (default False) forceall (bool): force all output files to be re-created (default False) forcerun (list): list of files and rules that shall be re-created/re-executed (default []) prioritytargets (list): list of targets that shall be run with maximum priority (default []) stats (str): path to file that shall contain stats about the workflow execution (default None) printreason (bool): print the reason for the execution of each job (default false) printshellcmds (bool): print the shell command of each job (default False) printdag (bool): print the dag in the graphviz dot language (default False) printrulegraph (bool): print the graph of rules in the graphviz dot language (default False) printd3dag (bool): print a D3.js compatible JSON representation of the DAG (default False) nocolor (bool): do not print colored output (default False) quiet (bool): do not print any default job information (default False) keepgoing (bool): keep goind upon errors (default False) cluster (str): submission command of a cluster or batch system to use, e.g. 
qsub (default None) cluster_config (str): configuration file for cluster options (default None) cluster_sync (str): blocking cluster submission command (like SGE 'qsub -sync y') (default None) drmaa (str): if not None use DRMAA for cluster support, str specifies native args passed to the cluster when submitting a job jobname (str): naming scheme for cluster job scripts (default "snakejob.{rulename}.{jobid}.sh") immediate_submit (bool): immediately submit all cluster jobs, regardless of dependencies (default False) standalone (bool): kill all processes very rudely in case of failure (do not use this if you use this API) (default False) ignore_ambiguity (bool): ignore ambiguous rules and always take the first possible one (default False) snakemakepath (str): path to the snakemake executable (default None) lock (bool): lock the working directory when executing the workflow (default True) unlock (bool): just unlock the working directory (default False) cleanup_metadata (bool): just cleanup metadata of output files (default False) force_incomplete (bool): force the re-creation of incomplete files (default False) ignore_incomplete (bool): ignore incomplete files (default False) list_version_changes (bool): list output files with changed rule version (default False) list_code_changes (bool): list output files with changed rule code (default False) list_input_changes (bool): list output files with changed input files (default False) list_params_changes (bool): list output files with changed params (default False) summary (bool): list summary of all output files and their status (default False) latency_wait (int): how many seconds to wait for an output file to appear after the execution of a job, e.g. to handle filesystem latency (default 3) benchmark_repeats (int): number of repeated runs of a job if declared for benchmarking (default 1) wait_for_files (list): wait for given files to be present before executing the workflow list_resources (bool): list resources used in the workflow (default False) summary (bool): list summary of all output files and their status (default False). If no option is specified a basic summary will be ouput. If 'detailed' is added as an option e.g --summary detailed, extra info about the input and shell commands will be included detailed_summary (bool): list summary of all input and output files and their status (default False) print_compilation (bool): print the compilation of the snakefile (default False) debug (bool): allow to use the debugger within rules notemp (bool): ignore temp file flags, e.g. do not delete output files marked as temp after use (default False) nodeps (bool): ignore dependencies (default False) keep_target_files (bool): Do not adjust the paths of given target files relative to the working directory. allowed_rules (set): Restrict allowed rules to the given set. If None or empty, all rules are used. jobscript (str): path to a custom shell script template for cluster jobs (default None) timestamp (bool): print time stamps in front of any output (default False) greediness (float): set the greediness of scheduling. This value between 0 and 1 determines how careful jobs are selected for execution. The default value (0.5 if prioritytargets are used, 1.0 else) provides the best speed and still acceptable scheduling quality. overwrite_shellcmd (str): a shell command that shall be executed instead of those given in the workflow. This is for debugging purposes only. 
updated_files(list): a list that will be filled with the files that are updated or created during the workflow execution verbose(bool): show additional debug output (default False) log_handler (function): redirect snakemake output to this custom log handler, a function that takes a log message dictionary (see below) as its only argument (default None). The log message dictionary for the log handler has to following entries: :level: the log level ("info", "error", "debug", "progress", "job_info") :level="info", "error" or "debug": :msg: the log message :level="progress": :done: number of already executed jobs :total: number of total jobs :level="job_info": :input: list of input files of a job :output: list of output files of a job :log: path to log file of a job :local: whether a job is executed locally (i.e. ignoring cluster) :msg: the job message :reason: the job reason :priority: the job priority :threads: the threads of the job Returns: bool: True if workflow execution was successful. """ if updated_files is None: updated_files = list() if cluster or cluster_sync or drmaa: cores = sys.maxsize else: nodes = sys.maxsize if cluster_config: cluster_config = load_configfile(cluster_config) else: cluster_config = dict() if not keep_logger: setup_logger(handler=log_handler, quiet=quiet, printreason=printreason, printshellcmds=printshellcmds, nocolor=nocolor, stdout=dryrun, debug=verbose, timestamp=timestamp) if greediness is None: greediness = 0.5 if prioritytargets else 1.0 else: if not (0 <= greediness <= 1.0): logger.error("Error: greediness must be a float between 0 and 1.") return False if not os.path.exists(snakefile): logger.error("Error: Snakefile \"{}\" not present.".format(snakefile)) return False snakefile = os.path.abspath(snakefile) cluster_mode = (cluster is not None) + (cluster_sync is not None) + (drmaa is not None) if cluster_mode > 1: logger.error("Error: cluster and drmaa args are mutually exclusive") return False if debug and (cores > 1 or cluster_mode): logger.error( "Error: debug mode cannot be used with more than one core or cluster execution.") return False overwrite_config = dict() if configfile: overwrite_config.update(load_configfile(configfile)) if config: overwrite_config.update(config) if workdir: olddir = os.getcwd() if not os.path.exists(workdir): logger.info( "Creating specified working directory {}.".format(workdir)) os.makedirs(workdir) workdir = os.path.abspath(workdir) os.chdir(workdir) workflow = Workflow(snakefile=snakefile, snakemakepath=snakemakepath, jobscript=jobscript, overwrite_shellcmd=overwrite_shellcmd, overwrite_config=overwrite_config, overwrite_workdir=workdir, overwrite_configfile=configfile, config_args=config_args, debug=debug) if standalone: try: # set the process group os.setpgrp() except: # ignore: if it does not work we can still work without it pass success = True try: workflow.include(snakefile, overwrite_first_rule=True, print_compilation=print_compilation) workflow.check() if not print_compilation: if listrules: workflow.list_rules() elif list_target_rules: workflow.list_rules(only_targets=True) elif list_resources: workflow.list_resources() else: # if not printdag and not printrulegraph: # handle subworkflows subsnakemake = partial(snakemake, cores=cores, nodes=nodes, local_cores=local_cores, resources=resources, dryrun=dryrun, touch=touch, printreason=printreason, printshellcmds=printshellcmds, nocolor=nocolor, quiet=quiet, keepgoing=keepgoing, cluster=cluster, cluster_config=cluster_config, cluster_sync=cluster_sync, drmaa=drmaa, 
jobname=jobname, immediate_submit=immediate_submit, standalone=standalone, ignore_ambiguity=ignore_ambiguity, snakemakepath=snakemakepath, lock=lock, unlock=unlock, cleanup_metadata=cleanup_metadata, force_incomplete=force_incomplete, ignore_incomplete=ignore_incomplete, latency_wait=latency_wait, benchmark_repeats=benchmark_repeats, verbose=verbose, notemp=notemp, nodeps=nodeps, jobscript=jobscript, timestamp=timestamp, greediness=greediness, no_hooks=no_hooks, overwrite_shellcmd=overwrite_shellcmd, config=config, config_args=config_args, keep_logger=True) success = workflow.execute( targets=targets, dryrun=dryrun, touch=touch, cores=cores, nodes=nodes, local_cores=local_cores, forcetargets=forcetargets, forceall=forceall, forcerun=forcerun, prioritytargets=prioritytargets, quiet=quiet, keepgoing=keepgoing, printshellcmds=printshellcmds, printreason=printreason, printrulegraph=printrulegraph, printdag=printdag, cluster=cluster, cluster_config=cluster_config, cluster_sync=cluster_sync, jobname=jobname, drmaa=drmaa, printd3dag=printd3dag, immediate_submit=immediate_submit, ignore_ambiguity=ignore_ambiguity, stats=stats, force_incomplete=force_incomplete, ignore_incomplete=ignore_incomplete, list_version_changes=list_version_changes, list_code_changes=list_code_changes, list_input_changes=list_input_changes, list_params_changes=list_params_changes, summary=summary, latency_wait=latency_wait, benchmark_repeats=benchmark_repeats, wait_for_files=wait_for_files, detailed_summary=detailed_summary, nolock=not lock, unlock=unlock, resources=resources, notemp=notemp, nodeps=nodeps, keep_target_files=keep_target_files, cleanup_metadata=cleanup_metadata, subsnakemake=subsnakemake, updated_files=updated_files, allowed_rules=allowed_rules, greediness=greediness, no_hooks=no_hooks) except BrokenPipeError: # ignore this exception and stop. It occurs if snakemake output is piped into less and less quits before reading the whole output. # in such a case, snakemake shall stop scheduling and quit with error 1 success = False except (Exception, BaseException) as ex: print_exception(ex, workflow.linemaps) success = False if workdir: os.chdir(olddir) if workflow.persistence: workflow.persistence.unlock() if not keep_logger: logger.cleanup() return success