def load_tool_configfile(self):
    """Locate tools_path.yaml on the default install path, in the user's home, or via an argument.

    Resolution order: a path passed as an argument wins and is consumed
    (unlinked) after loading; otherwise the user-level file is used, and
    finally the copy shipped with the git checkout.
    """
    if USER_TOOLS_PATH.exists() and not ARGS_TOOLS_PATH.exists():
        self.tools_config = load_configfile(USER_TOOLS_PATH)
    elif ARGS_TOOLS_PATH.exists():
        self.tools_config = load_configfile(ARGS_TOOLS_PATH)
        ARGS_TOOLS_PATH.unlink()
    else:
        self.tools_config = load_configfile(GIT_TOOLS_PATH)
def load_cluster_config(path):
    """Load config to dict, either from an absolute path or relative to the profile dir."""
    if path:
        path = os.path.join(os.path.dirname(__file__), os.path.expandvars(path))
        default_cluster_config = io.load_configfile(path)
    else:
        default_cluster_config = {}
    if "__default__" not in default_cluster_config:
        default_cluster_config["__default__"] = {}
    return default_cluster_config
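# Usage sketch for load_cluster_config (hypothetical rule name and file; the
# "__default__" key is guaranteed by the function above): per-rule cluster
# options are usually built by layering a rule's own entry over the defaults.
def cluster_options_for(rule_name, cluster_config_path="cluster_config.yaml"):
    cluster = load_cluster_config(cluster_config_path)
    return {**cluster["__default__"], **cluster.get(rule_name, {})}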
def sconfig(self, line, cell):
    """Load JSON or YAML from the cell into the workflow's config object."""
    workflow = self.get_workflow()
    # Write the cell to a temp file so we can reuse snakemake.load_configfile;
    # it wouldn't be hard to roll our own parser to avoid this...
    cell_config_file = tempfile.NamedTemporaryFile('w', delete=False)
    cell_config_file.write(cell)
    cell_config_file.close()
    snakemake.workflow.config.update(load_configfile(cell_config_file.name))
    logger.debug(repr(snakemake.workflow.config))
    os.unlink(cell_config_file.name)
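# Hedged usage sketch: assuming this method is registered as an IPython cell
# magic named %%sconfig, a notebook cell like the following would merge its
# YAML body into snakemake.workflow.config (keys and values illustrative only):
#
#   %%sconfig
#   samples:
#     a: data/a.fastq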
def run_workflow(self, workflow="recover_mags", cores=16, profile=None,
                 dryrun=False, conda_frontend="mamba", snakemake_args=""):
    """Run the aviary pipeline.

    By default all steps are executed. Needs a config file, which is
    generated from the given inputs. Most snakemake arguments can be
    appended to the command; for more info see 'snakemake --help'.
    """
    if not os.path.exists(self.config):
        logging.critical(f"config-file not found: {self.config}\n")
        sys.exit(1)

    self.validate_config()
    conf = load_configfile(self.config)

    cmd = (
        "snakemake --snakefile {snakefile} --directory {working_dir} "
        "{jobs} --rerun-incomplete "
        "--configfile '{config_file}' --nolock "
        " {profile} {conda_frontend} --use-conda {conda_prefix} {dryrun} "
        " {target_rule} "
        " {args} "
    ).format(
        snakefile=get_snakefile(),
        working_dir=self.output,
        jobs="--jobs {}".format(cores) if cores is not None else "",
        config_file=self.config,
        profile="" if (profile is None) else "--profile {}".format(profile),
        dryrun="--dryrun" if dryrun else "",
        args=" ".join(snakemake_args),
        target_rule=workflow if workflow != "None" else "",
        conda_prefix="--conda-prefix " + self.conda_prefix,
        conda_frontend="--conda-frontend " + conda_frontend,
    )
    logging.info("Executing: %s" % cmd)
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        # exit cleanly, without the full traceback
        logging.critical(e)
        exit(1)
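# Hedged usage sketch: assumes an object exposing .config, .output and
# .conda_prefix, which is all the method above reads from self.
#
#   processor.run_workflow(workflow="recover_mags", cores=32, dryrun=True,
#                          snakemake_args=["--keep-going"])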
def __init__(self, workflow, config):
    # The workflow object is available only in __init__.
    # print("\n".join(list(workflow.__dict__.keys())))
    # print(workflow.__dict__)
    self.snakefile = SNAKEFILE
    self.tools_config = None
    self.path_config = workflow.overwrite_configfiles[0]
    self.config = config
    self.use_env_modules = workflow.use_env_modules
    self.use_conda = workflow.use_conda
    self.use_singularity = workflow.use_singularity

    # Test whether a cluster_config.yaml was passed to snakemake.
    if not workflow.overwrite_clusterconfig and get_install_mode() == "cluster":
        self.cluster_config = load_configfile(
            DEFAULT_PROFILE.joinpath("cluster_config.yaml"))
    elif not workflow.overwrite_clusterconfig and get_install_mode() == "local":
        self.cluster_config = None
    else:
        self.cluster_config = workflow.overwrite_clusterconfig
    self.load_tool_configfile()
def validate_assembly_config(config):
    c = load_configfile(config)
    valid = True

    if "samples" not in c:
        logging.critical("'samples' is not defined in %s" % config)
        valid = False
    try:
        if len(c["samples"].keys()) == 0:
            logging.critical("no samples are defined under 'samples' in %s" % config)
            valid = False
        for sample, meta in c["samples"].items():
            if sample == "coassemblies":
                for coassembly, file_list in meta["coassemblies"].items():
                    for co_sample in file_list:
                        if co_sample not in c["samples"]:
                            logging.critical(
                                "Sample %s under coassembly %s is not a defined sample in the configuration"
                                % (co_sample, coassembly))
                            valid = False
            # common/known bad characters
            if " " in sample or "_" in sample:
                logging.critical(
                    "The sample ID for %s contains invalid characters; use words or words separated by dashes only"
                    % sample)
                valid = False
            if "path" not in meta:
                logging.critical("'path' is not set for sample %s" % sample)
                valid = False
                continue
            for f in meta["path"]:
                if not os.path.exists(f):
                    logging.critical("%s does not exist for sample %s" % (f, sample))
                    valid = False
    except KeyError:
        pass

    if "preprocessing" not in c:
        logging.critical("'preprocessing' is not defined in %s" % config)
        valid = False
    try:
        if "adapters" not in c["preprocessing"]:
            logging.critical(
                "'adapters' is not defined under 'preprocessing' in %s" % config)
            valid = False
        for f in c["preprocessing"]["adapters"].split(","):
            if not os.path.exists(f):
                logging.critical("adapters file [%s] does not exist" % f)
                valid = False
        if "contamination" not in c["preprocessing"]:
            logging.critical(
                "'contamination' is not defined under 'preprocessing' in %s" % config)
            valid = False
        if "references" not in c["preprocessing"]["contamination"]:
            logging.critical(
                "'references' is not defined under 'contamination' in %s" % config)
            valid = False
        if "rRNA" not in c["preprocessing"]["contamination"]["references"]:
            logging.critical(
                "'rRNA' is not a defined contamination reference in %s" % config)
            valid = False
        for ref, f in c["preprocessing"]["contamination"]["references"].items():
            if not os.path.exists(f):
                logging.critical(
                    "contamination reference file [%s] does not exist for %s" % (f, ref))
                valid = False
        if "normalization" not in c["preprocessing"]:
            logging.critical("'normalization' is not defined in %s" % config)
            valid = False
    except KeyError:
        pass

    if "assembly" not in c:
        logging.critical("'assembly' is not defined in %s" % config)
        valid = False
    try:
        if c["assembly"]["assembler"] not in ("megahit", "spades"):
            logging.critical(
                "'assembler' entry [%s] is not a supported assembler"
                % c["assembly"]["assembler"])
            valid = False
    except KeyError:
        pass

    if "annotation" not in c:
        logging.critical("'annotation' is not defined in %s" % config)
        valid = False
    try:
        if "references" not in c["annotation"]:
            logging.critical("'references' is not defined in %s" % config)
            valid = False
        if "refseq" in c["annotation"]["references"]:
            refseq = c["annotation"]["references"]["refseq"]
            if not os.path.exists(refseq["namemap"]):
                logging.critical(
                    "namemap reference file [%s] does not exist" % refseq["namemap"])
                valid = False
            if not os.path.exists(refseq["tree"]):
                logging.critical(
                    "tree reference file [%s] does not exist" % refseq["tree"])
                valid = False
            if not os.path.exists(refseq["dmnd"]):
                logging.critical(
                    "dmnd reference file [%s] does not exist" % refseq["dmnd"])
                valid = False
        # cazy, cog, enzyme and eggnog share the namemap/dmnd layout,
        # so they are checked in a single loop.
        for db in ("cazy", "cog", "enzyme", "eggnog"):
            if db in c["annotation"]["references"]:
                ref = c["annotation"]["references"][db]
                if not os.path.exists(ref["namemap"]):
                    logging.critical(
                        "namemap reference file [%s] does not exist" % ref["namemap"])
                    valid = False
                if not os.path.exists(ref["dmnd"]):
                    logging.critical(
                        "dmnd reference file [%s] does not exist" % ref["dmnd"])
                    valid = False
    except KeyError:
        valid = False
    return valid
                    help='pre-processed by HVG filtering',
                    action='store_true')
parser.add_argument('-s', '--scale', action='store_true',
                    help='pre-processed by scaling')

args = parser.parse_args()
config = args.config
task = args.task
hvgs = args.hvgs
scale = args.scale
method = args.method

# Load config file
params = load_configfile(config)

# Check inputs
if method not in params['METHODS']:
    raise ValueError(
        f'{method} is not a valid method.\n'
        f'Please choose one of: {list(params["METHODS"].keys())}')

if task not in params['DATA_SCENARIOS']:
    raise ValueError(
        f'{task} is not a valid integration task.\n'
        f'Please choose one of: {list(params["DATA_SCENARIOS"].keys())}')

# Get path values
folder = params['ROOT']
t_folder = task
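# A minimal sketch of the YAML shape the checks above expect; the keys are the
# ones read by the code (ROOT, METHODS, DATA_SCENARIOS), all values are
# placeholders:
#
#   ROOT: /data/integration
#   METHODS:
#     scanorama: {}
#   DATA_SCENARIOS:
#     pancreas: {}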
def validate_config(config, workflow):
    conf = load_configfile(config)
#!/usr/bin/env python3
import os

import pandas as pd
from snakemake.io import load_configfile

config = load_configfile("config.yaml")

# Make a file with col1 as the contig group name and col2 as a comma-separated
# list of contigs. This was written for use with GATK, for splitting the
# variant-calling steps by chromosome/contig. We group the unplaced contigs
# together since those tend to be small.
fai_file = config["ref"]["fai"]
contigs_file = "grouped_contigs.tsv"
os.system("perl -e 'print qq/name\tcontigs\n/' >" + contigs_file)
os.system("grep -Po '(^chr\S+|^\d+)' " + fai_file +
          " | perl -lne 'print qq/$_\t$_/' >>" + contigs_file)
os.system(
    "grep -Pv '(^chr\S+|^\d+)' " + fai_file +
    " | cut -f1 | perl -npe 's/\n/,/g' | perl -lne 's/,$//; print qq/unplaced_contigs\t$_/' >>"
    + contigs_file)

contig_groups = pd.read_table(contigs_file)

# Check that the chromosomes/contigs were parsed correctly.
num_contigs_fai = sum(1 for line in open(fai_file))
num_contigs_parsed = contig_groups.shape[0] + contig_groups['contigs'][
    contig_groups.shape[0] - 1].count(',')
assert num_contigs_fai == num_contigs_parsed, "Chromosomes in .fai not parsed correctly."
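# A rough pure-Python equivalent of the grep/perl pipeline above, kept here
# only to illustrate what it computes (assumes the standard .fai layout with
# the contig name in column 1; the regex mirrors the grep pattern loosely):
import re

def group_contigs(fai_path):
    with open(fai_path) as fh:
        names = [line.split("\t")[0] for line in fh]
    placed = [n for n in names if re.match(r"chr\S+|\d+", n)]
    placed_set = set(placed)
    unplaced = [n for n in names if n not in placed_set]
    rows = [(n, n) for n in placed]
    if unplaced:
        rows.append(("unplaced_contigs", ",".join(unplaced)))
    return rows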
def snakemake(snakefile,
              listrules=False, list_target_rules=False,
              cores=1, nodes=1, local_cores=1,
              resources=dict(), config=dict(), configfile=None, config_args=None,
              workdir=None, targets=None, dryrun=False, touch=False,
              forcetargets=False, forceall=False, forcerun=[], prioritytargets=[],
              stats=None, printreason=False, printshellcmds=False,
              printdag=False, printrulegraph=False, printd3dag=False,
              nocolor=False, quiet=False, keepgoing=False,
              cluster=None, cluster_config=None, cluster_sync=None, drmaa=None,
              jobname="snakejob.{rulename}.{jobid}.sh", immediate_submit=False,
              standalone=False, ignore_ambiguity=False, snakemakepath=None,
              lock=True, unlock=False, cleanup_metadata=None,
              force_incomplete=False, ignore_incomplete=False,
              list_version_changes=False, list_code_changes=False,
              list_input_changes=False, list_params_changes=False,
              list_resources=False, summary=False, detailed_summary=False,
              latency_wait=3, benchmark_repeats=1, wait_for_files=None,
              print_compilation=False, debug=False, notemp=False, nodeps=False,
              keep_target_files=False, allowed_rules=None, jobscript=None,
              timestamp=False, greediness=None, no_hooks=False,
              overwrite_shellcmd=None, updated_files=None, log_handler=None,
              keep_logger=False, verbose=False):
    """Run snakemake on a given snakefile.

    This function provides access to the whole snakemake functionality.
    It is not thread-safe.

    Args:
        snakefile (str): the path to the snakefile
        listrules (bool): list rules (default False)
        list_target_rules (bool): list target rules (default False)
        cores (int): the number of provided cores (ignored when using cluster support) (default 1)
        nodes (int): the number of provided cluster nodes (ignored without cluster support) (default 1)
        local_cores (int): the number of provided local cores if in cluster mode (ignored without cluster support) (default 1)
        resources (dict): provided resources, a dictionary assigning integers to resource names, e.g. {gpu=1, io=5} (default {})
        config (dict): override values for workflow config
        workdir (str): path to working directory (default None)
        targets (list): list of targets, e.g. rule or file names (default None)
        dryrun (bool): only dry-run the workflow (default False)
        touch (bool): only touch all output files if present (default False)
        forcetargets (bool): force given targets to be re-created (default False)
        forceall (bool): force all output files to be re-created (default False)
        forcerun (list): list of files and rules that shall be re-created/re-executed (default [])
        prioritytargets (list): list of targets that shall be run with maximum priority (default [])
        stats (str): path to file that shall contain stats about the workflow execution (default None)
        printreason (bool): print the reason for the execution of each job (default False)
        printshellcmds (bool): print the shell command of each job (default False)
        printdag (bool): print the dag in the graphviz dot language (default False)
        printrulegraph (bool): print the graph of rules in the graphviz dot language (default False)
        printd3dag (bool): print a D3.js compatible JSON representation of the DAG (default False)
        nocolor (bool): do not print colored output (default False)
        quiet (bool): do not print any default job information (default False)
        keepgoing (bool): keep going upon errors (default False)
        cluster (str): submission command of a cluster or batch system to use, e.g. qsub (default None)
        cluster_config (str): configuration file for cluster options (default None)
        cluster_sync (str): blocking cluster submission command (like SGE 'qsub -sync y') (default None)
        drmaa (str): if not None use DRMAA for cluster support, str specifies native args passed to the cluster when submitting a job
        jobname (str): naming scheme for cluster job scripts (default "snakejob.{rulename}.{jobid}.sh")
        immediate_submit (bool): immediately submit all cluster jobs, regardless of dependencies (default False)
        standalone (bool): kill all processes very rudely in case of failure (do not use this if you use this API) (default False)
        ignore_ambiguity (bool): ignore ambiguous rules and always take the first possible one (default False)
        snakemakepath (str): path to the snakemake executable (default None)
        lock (bool): lock the working directory when executing the workflow (default True)
        unlock (bool): just unlock the working directory (default False)
        cleanup_metadata (bool): just cleanup metadata of output files (default False)
        force_incomplete (bool): force the re-creation of incomplete files (default False)
        ignore_incomplete (bool): ignore incomplete files (default False)
        list_version_changes (bool): list output files with changed rule version (default False)
        list_code_changes (bool): list output files with changed rule code (default False)
        list_input_changes (bool): list output files with changed input files (default False)
        list_params_changes (bool): list output files with changed params (default False)
        summary (bool): list summary of all output files and their status (default False). If no option is specified a basic summary will be output. If 'detailed' is added as an option, e.g. --summary detailed, extra info about the input and shell commands will be included
        latency_wait (int): how many seconds to wait for an output file to appear after the execution of a job, e.g. to handle filesystem latency (default 3)
        benchmark_repeats (int): number of repeated runs of a job if declared for benchmarking (default 1)
        wait_for_files (list): wait for given files to be present before executing the workflow
        list_resources (bool): list resources used in the workflow (default False)
        detailed_summary (bool): list summary of all input and output files and their status (default False)
        print_compilation (bool): print the compilation of the snakefile (default False)
        debug (bool): allow to use the debugger within rules
        notemp (bool): ignore temp file flags, e.g. do not delete output files marked as temp after use (default False)
        nodeps (bool): ignore dependencies (default False)
        keep_target_files (bool): do not adjust the paths of given target files relative to the working directory
        allowed_rules (set): restrict allowed rules to the given set; if None or empty, all rules are used
        jobscript (str): path to a custom shell script template for cluster jobs (default None)
        timestamp (bool): print time stamps in front of any output (default False)
        greediness (float): set the greediness of scheduling. This value between 0 and 1 determines how careful jobs are selected for execution. The default value (0.5 if prioritytargets are used, 1.0 else) provides the best speed and still acceptable scheduling quality.
        overwrite_shellcmd (str): a shell command that shall be executed instead of those given in the workflow. This is for debugging purposes only.
        updated_files (list): a list that will be filled with the files that are updated or created during the workflow execution
        verbose (bool): show additional debug output (default False)
        log_handler (function): redirect snakemake output to this custom log handler, a function that takes a log message dictionary (see below) as its only argument (default None). The log message dictionary for the log handler has the following entries:

            :level: the log level ("info", "error", "debug", "progress", "job_info")

            :level="info", "error" or "debug":
                :msg: the log message
            :level="progress":
                :done: number of already executed jobs
                :total: number of total jobs
            :level="job_info":
                :input: list of input files of a job
                :output: list of output files of a job
                :log: path to log file of a job
                :local: whether a job is executed locally (i.e. ignoring cluster)
                :msg: the job message
                :reason: the job reason
                :priority: the job priority
                :threads: the threads of the job

    Returns:
        bool: True if workflow execution was successful.
    """
    if updated_files is None:
        updated_files = list()

    if cluster or cluster_sync or drmaa:
        cores = sys.maxsize
    else:
        nodes = sys.maxsize

    if cluster_config:
        cluster_config = load_configfile(cluster_config)
    else:
        cluster_config = dict()

    if not keep_logger:
        setup_logger(handler=log_handler, quiet=quiet, printreason=printreason,
                     printshellcmds=printshellcmds, nocolor=nocolor,
                     stdout=dryrun, debug=verbose, timestamp=timestamp)

    if greediness is None:
        greediness = 0.5 if prioritytargets else 1.0
    elif not (0 <= greediness <= 1.0):
        logger.error("Error: greediness must be a float between 0 and 1.")
        return False

    if not os.path.exists(snakefile):
        logger.error("Error: Snakefile \"{}\" not present.".format(snakefile))
        return False
    snakefile = os.path.abspath(snakefile)

    cluster_mode = ((cluster is not None) + (cluster_sync is not None)
                    + (drmaa is not None))
    if cluster_mode > 1:
        logger.error("Error: cluster and drmaa args are mutually exclusive")
        return False
    if debug and (cores > 1 or cluster_mode):
        logger.error(
            "Error: debug mode cannot be used with more than one core or cluster execution.")
        return False

    overwrite_config = dict()
    if configfile:
        overwrite_config.update(load_configfile(configfile))
    if config:
        overwrite_config.update(config)

    if workdir:
        olddir = os.getcwd()
        if not os.path.exists(workdir):
            logger.info("Creating specified working directory {}.".format(workdir))
            os.makedirs(workdir)
        workdir = os.path.abspath(workdir)
        os.chdir(workdir)
    workflow = Workflow(snakefile=snakefile, snakemakepath=snakemakepath,
                        jobscript=jobscript, overwrite_shellcmd=overwrite_shellcmd,
                        overwrite_config=overwrite_config, overwrite_workdir=workdir,
                        overwrite_configfile=configfile, config_args=config_args,
                        debug=debug)

    if standalone:
        try:
            # set the process group
            os.setpgrp()
        except:
            # ignore: if it does not work we can still work without it
            pass

    success = True
    try:
        workflow.include(snakefile, overwrite_first_rule=True,
                         print_compilation=print_compilation)
        workflow.check()

        if not print_compilation:
            if listrules:
                workflow.list_rules()
            elif list_target_rules:
                workflow.list_rules(only_targets=True)
            elif list_resources:
                workflow.list_resources()
            else:
                # if not printdag and not printrulegraph:
                # handle subworkflows
                subsnakemake = partial(
                    snakemake, cores=cores, nodes=nodes, local_cores=local_cores,
                    resources=resources, dryrun=dryrun, touch=touch,
                    printreason=printreason, printshellcmds=printshellcmds,
                    nocolor=nocolor, quiet=quiet, keepgoing=keepgoing,
                    cluster=cluster, cluster_config=cluster_config,
                    cluster_sync=cluster_sync, drmaa=drmaa, jobname=jobname,
                    immediate_submit=immediate_submit, standalone=standalone,
                    ignore_ambiguity=ignore_ambiguity, snakemakepath=snakemakepath,
                    lock=lock, unlock=unlock, cleanup_metadata=cleanup_metadata,
                    force_incomplete=force_incomplete,
                    ignore_incomplete=ignore_incomplete, latency_wait=latency_wait,
                    benchmark_repeats=benchmark_repeats, verbose=verbose,
                    notemp=notemp, nodeps=nodeps, jobscript=jobscript,
                    timestamp=timestamp, greediness=greediness, no_hooks=no_hooks,
                    overwrite_shellcmd=overwrite_shellcmd, config=config,
                    config_args=config_args, keep_logger=True)
                success = workflow.execute(
                    targets=targets, dryrun=dryrun, touch=touch, cores=cores,
                    nodes=nodes, local_cores=local_cores,
                    forcetargets=forcetargets, forceall=forceall,
                    forcerun=forcerun, prioritytargets=prioritytargets,
                    quiet=quiet, keepgoing=keepgoing,
                    printshellcmds=printshellcmds, printreason=printreason,
                    printrulegraph=printrulegraph, printdag=printdag,
                    cluster=cluster, cluster_config=cluster_config,
                    cluster_sync=cluster_sync, jobname=jobname, drmaa=drmaa,
                    printd3dag=printd3dag, immediate_submit=immediate_submit,
                    ignore_ambiguity=ignore_ambiguity, stats=stats,
                    force_incomplete=force_incomplete,
                    ignore_incomplete=ignore_incomplete,
                    list_version_changes=list_version_changes,
                    list_code_changes=list_code_changes,
                    list_input_changes=list_input_changes,
                    list_params_changes=list_params_changes, summary=summary,
                    latency_wait=latency_wait, benchmark_repeats=benchmark_repeats,
                    wait_for_files=wait_for_files, detailed_summary=detailed_summary,
                    nolock=not lock, unlock=unlock, resources=resources,
                    notemp=notemp, nodeps=nodeps,
                    keep_target_files=keep_target_files,
                    cleanup_metadata=cleanup_metadata, subsnakemake=subsnakemake,
                    updated_files=updated_files, allowed_rules=allowed_rules,
                    greediness=greediness, no_hooks=no_hooks)
    except BrokenPipeError:
        # Ignore this exception and stop. It occurs if snakemake output is
        # piped into less and less quits before reading the whole output.
        # In such a case, snakemake shall stop scheduling and quit with error 1.
        success = False
    except (Exception, BaseException) as ex:
        print_exception(ex, workflow.linemaps)
        success = False

    if workdir:
        os.chdir(olddir)
    if workflow.persistence:
        workflow.persistence.unlock()
    if not keep_logger:
        logger.cleanup()
    return success
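# Hedged usage sketch of the API entry point above; "Snakefile" and
# "config.yaml" are placeholder paths. Note the precedence implemented above:
# the configfile is loaded first and the config dict is applied on top of it.
if __name__ == "__main__":
    ok = snakemake("Snakefile", configfile="config.yaml",
                   config={"threads": 4}, workdir="analysis", dryrun=True)
    print("dry-run succeeded:", ok)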
def validate_config(config, workflow):
    conf = load_configfile(config)
    validate_sample_defs(conf, workflow)
def main(root_dir, args):
    if len(args) < 3:
        print("python run.py unfinished config.yaml cores")
        sys.exit(1)

    unfinished_dir = args[1]
    config_file = [args[2]]
    cores = 79 if len(args) <= 3 else int(args[3])
    parallel = 10 if len(args) <= 4 else int(args[4])

    # load config
    config = load_configfile(config_file[0])

    # set default parameters; 'feishu' is also read below, so default it too
    if 'mail' not in config.keys():
        config['mail'] = False
    if 'bark' not in config.keys():
        config['bark'] = False
    if 'feishu' not in config.keys():
        config['feishu'] = False

    # check_config
    if not check_config(config):
        sys.exit(1)

    finished_dir = "finished"
    duplication_dir = "duplication"
    metadata_dir = "metadata"
    if not os.path.exists(metadata_dir):
        os.makedirs(metadata_dir)
    # finished dir
    if not os.path.exists(finished_dir):
        os.makedirs(finished_dir)
    # duplication dir
    if not os.path.exists(duplication_dir):
        os.makedirs(duplication_dir)

    dup_file = ".file_duplication.json"

    # preprocess
    sample_files = glob.glob(os.path.join(unfinished_dir, "*.txt"))
    # remove duplication before running
    sample_files = remove_duplication(sample_files, dup_file, duplication_dir)

    # select unfinished files
    db = "meta_info.sqlite3"
    if not os.path.exists(db):
        # create the database if it does not exist
        df = build_metadata_table(sample_files)
        df2 = build_sample_table(df, unfinished_dir)
        table_to_sql(df, table_name="meta", db=db)
        table_to_sql(df2, table_name="sample", db=db)
    else:
        # otherwise, update the db
        df = build_metadata_table(sample_files, table_name="meta", db=db)
        if df.shape[0] > 0 and df.shape[1] > 0:
            table_to_sql(df, table_name="meta", db=db)
            df2 = build_sample_table(df, unfinished_dir)
            table_to_sql(df2, table_name="sample", db=db)

    # get the unfinished meta files
    df = table_from_sql(table_name="meta", db=db)
    sample_files = df.loc[df['status'] == 0, 'meta_file'].to_list()
    sample_files = [os.path.join(unfinished_dir, f) for f in sample_files]

    sf = get_snakefile(root_dir, "Snakefile")

    #todo_files = []
    while len(sample_files) > 0:
        for i in range(min(parallel, len(sample_files))):
            file = sample_files.pop()
            shutil.move(file, metadata_dir)
            #todo_files.append(os.path.join(metadata_dir, os.path.basename(file)))

        run_snakemake(sf, config_file, cores, unlock=True)
        status = run_snakemake(sf, config_file, cores)

        # move the finished files to finished
        if status:
            contents = "snakemake run successfully"
            if config['bark']:
                bark_notification(config['bark_api'], contents)
            if config['feishu']:
                feishu_notification(config['feishu_api'], contents)
            finished_file = glob.glob(os.path.join(metadata_dir, "*.txt"))
            for _ in range(len(finished_file)):
                file = finished_file.pop()
                update_status(file, table_name="meta", db=db)
                if os.path.isfile(os.path.join(finished_dir, os.path.basename(file))):
                    shutil.copy2(file, finished_dir)
                    os.unlink(file)
                else:
                    shutil.move(file, finished_dir)
        else:
            contents = "snakemake run failed"
            if config['bark']:
                bark_notification(config['bark_api'], contents)
            if config['feishu']:
                feishu_notification(config['feishu_api'], contents)
            broken_file = glob.glob(os.path.join(metadata_dir, "*.txt"))
            for _ in range(len(broken_file)):
                file = broken_file.pop()
                shutil.move(file, "unfinished")
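# Invocation sketch, inferred from the argument parsing above:
#
#   python run.py unfinished config.yaml [cores] [parallel]
#
# e.g. `python run.py unfinished config.yaml 40 5` runs snakemake on batches
# of up to 5 metadata files at a time with 40 cores.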
def _create_snakemake_dag(snakefile: str,
                          configfiles: Optional[List[str]] = None,
                          **kwargs: Any) -> DAG:
    """Create ``snakemake.dag.DAG`` instance.

    The code of this function comes from the Snakemake codebase and is adapted
    to fulfil REANA's purpose of getting the needed metadata.

    :param snakefile: Path to Snakefile.
    :type snakefile: string
    :param configfiles: List of config file paths.
    :type configfiles: List
    :param kwargs: Snakemake args.
    :type kwargs: Any
    """
    overwrite_config = dict()
    if configfiles is None:
        configfiles = []
    for f in configfiles:
        # get values to override; later configfiles override earlier ones
        overwrite_config.update(load_configfile(f))
    # convert provided paths to absolute paths
    configfiles = list(map(os.path.abspath, configfiles))

    workflow = Workflow(
        snakefile=snakefile,
        overwrite_configfiles=configfiles,
        overwrite_config=overwrite_config,
    )
    workflow.include(snakefile=snakefile, overwrite_first_rule=True)
    workflow.check()

    # code copied and adapted from `snakemake.workflow.Workflow.execute()`
    # in order to build the DAG and calculate the job dependencies:
    # https://github.com/snakemake/snakemake/blob/75a544ba528b30b43b861abc0ad464db4d6ae16f/snakemake/workflow.py#L525
    def rules(items):
        return map(workflow._rules.__getitem__, filter(workflow.is_rule, items))

    if kwargs.get("keep_target_files"):

        def files(items):
            return filterfalse(workflow.is_rule, items)

    else:

        def files(items):
            relpath = (lambda f: f if os.path.isabs(f) or f.startswith("root://")
                       else os.path.relpath(f))
            return map(relpath, filterfalse(workflow.is_rule, items))

    # fall back to the first rule when no explicit targets are given
    targets = kwargs.get("targets")
    if not targets:
        targets = [workflow.first_rule] if workflow.first_rule is not None else list()

    prioritytargets = kwargs.get("prioritytargets", [])
    forcerun = kwargs.get("forcerun", [])
    until = kwargs.get("until", [])
    omit_from = kwargs.get("omit_from", [])

    priorityrules = set(rules(prioritytargets))
    priorityfiles = set(files(prioritytargets))
    forcerules = set(rules(forcerun))
    forcefiles = set(files(forcerun))
    untilrules = set(rules(until))
    untilfiles = set(files(until))
    omitrules = set(rules(omit_from))
    omitfiles = set(files(omit_from))
    targetrules = set(
        chain(
            rules(targets),
            filterfalse(Rule.has_wildcards, priorityrules),
            filterfalse(Rule.has_wildcards, forcerules),
            filterfalse(Rule.has_wildcards, untilrules),
        ))
    targetfiles = set(chain(files(targets), priorityfiles, forcefiles, untilfiles))

    dag = DAG(
        workflow,
        workflow.rules,
        targetrules=targetrules,
        targetfiles=targetfiles,
        omitfiles=omitfiles,
        omitrules=omitrules,
    )
    workflow.persistence = Persistence(dag=dag)
    dag.init()
    dag.update_checkpoint_dependencies()
    dag.check_dynamic()
    return dag
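# Hedged usage sketch ("Snakefile" and "config.yaml" are placeholder paths):
# build the DAG and walk its jobs to pull out per-rule metadata, which is the
# purpose stated in the docstring above.
if __name__ == "__main__":
    dag = _create_snakemake_dag("Snakefile", configfiles=["config.yaml"])
    for job in dag.jobs:
        print(job.name, sorted(job.input), sorted(job.output))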
def validate_config(config):
    conf = load_configfile(config)
    validate_sample_defs(conf)
def validate_config(self):
    load_configfile(self.config)
def run_workflow(workflow, working_dir, config_file, jobs, max_mem, profile,
                 dryrun, snakemake_args):
    """Run the ATLAS pipeline.

    By default all steps are executed, but a sub-workflow can be specified.
    Needs a config file and expects to find a sample table in the working
    directory. Both can be generated with 'atlas init'.

    Most snakemake arguments can be appended to the command;
    for more info see 'snakemake --help'.

    For more details, see: https://metagenome-atlas.readthedocs.io
    """
    logger.info(f"Atlas version: {__version__}")

    if config_file is None:
        config_file = os.path.join(working_dir, "config.yaml")

    if not os.path.exists(config_file):
        logger.critical(f"config-file not found: {config_file}\n"
                        "generate one with 'atlas init'")
        exit(1)

    sample_file = os.path.join(working_dir, "samples.tsv")
    if not os.path.exists(sample_file):
        logger.critical("samples.tsv not found in the working directory. "
                        "Generate one with 'atlas init'")
        exit(1)

    validate_config(config_file, workflow)

    conf = load_configfile(config_file)
    db_dir = conf["database_dir"]

    cmd = (
        "snakemake --snakefile {snakefile} --directory {working_dir} "
        "{jobs} --rerun-incomplete "
        "--configfile '{config_file}' --nolock "
        " {profile} --use-conda {conda_prefix} {dryrun} "
        " {max_mem_string} "
        " --scheduler greedy "
        " {target_rule} "
        " {args} "
    ).format(
        snakefile=get_snakefile(),
        working_dir=working_dir,
        jobs="--jobs {}".format(jobs) if jobs is not None else "",
        config_file=config_file,
        profile="" if (profile is None) else "--profile {}".format(profile),
        dryrun="--dryrun" if dryrun else "",
        args=" ".join(snakemake_args),
        target_rule=workflow if workflow != "None" else "",
        conda_prefix="--conda-prefix " + os.path.join(db_dir, "conda_envs"),
        max_mem_string=handle_max_mem(max_mem, profile),
    )
    logger.debug("Executing: %s" % cmd)
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError as e:
        # exit cleanly, without the full traceback
        logger.critical(e)
        exit(1)
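# Hedged usage sketch (in ATLAS this function is typically wrapped as a click
# command, so a direct call like this is illustrative only; paths and values
# are placeholders):
#
#   run_workflow("None", working_dir=".", config_file=None, jobs=8,
#                max_mem=64, profile=None, dryrun=True, snakemake_args=[])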