def fastq_join(forward_fname, reverse_fname, output_file,
               reorder_to=None, options=dict()):
    """Workflow function for joining (aka stitching) paired-end fastq
    files with ea-utils' ``fastq-join``. Unless the ``drop_unpaired``
    option is set to True, unpaired forward reads are concatenated to
    the joined fastq file.

    :param forward_fname: String; file name for the forward reads.
    :param reverse_fname: String; file name for the reverse reads.
    :param output_file: String; file name for the finished, joined reads.
    :param reorder_to: String; file name to reorder sequences against.
    :param options: Dictionary; interpreted as command line options to be
      passed to the wrapped fastq-join executable. No - or -- flags are
      necessary; the correct - or -- flags are inferred based on the
      length of the option. For boolean options, use the key/value
      pattern of { "my-option": "" }.

    External Dependencies
      - fastq-join: Part of ea-utils 1.1.2-806
        'https://drive.google.com/folderview?id=0B7KhouP0YeRAOTFWWGVFYkFSQjg&usp=sharing'

    """
    drop_unpaired = options.pop('drop_unpaired', False)
    default_opts = {"o": output_file}
    default_opts.update(options)
    opts = dict_to_cmd_opts(default_opts)
    cmd = "fastq-join " + opts + " " + forward_fname + " " + reverse_fname
    # fastq-join treats the -o argument as a template: a '%' is replaced
    # with "join"/"un1"/"un2"; otherwise those suffixes are appended.
    if '%' in output_file:
        renamed_output = output_file.replace("%", "join")
    else:
        renamed_output = output_file + "join"

    actions = [cmd]
    unpaired_forward = renamed_output.replace("join", "un1")
    if not drop_unpaired and reorder_to:
        actions.append(
            "sequence_re-pair -f fastq -t fastq -b %s %s %s > %s"
            % (reorder_to, renamed_output, unpaired_forward, output_file))
    elif not drop_unpaired and not reorder_to:
        actions.append("cat {} {} > {}".format(unpaired_forward,
                                               renamed_output, output_file))
    else:
        actions.append("mv %s %s" % (renamed_output, output_file))

    return {
        "name": "fastq_join: " + output_file,
        "actions": actions,
        "file_dep": [forward_fname, reverse_fname],
        "targets": [output_file]
    }
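# A minimal usage sketch with hypothetical file names. The "p" (percent
# maximum difference) and "m" (minimum overlap) keys are standard
# fastq-join flags passed straight through dict_to_cmd_opts; nothing
# executes until a runner picks up the returned task dict.
def _example_fastq_join():
    task = fastq_join("sample_R1.fastq", "sample_R2.fastq",
                      "sample_joined.fastq", options={"p": "8", "m": "6"})
    for action in task["actions"]:
        print(action)  # the fastq-join call, then the unpaired-read cat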
def featureCounts(input_sams, output_table, options=dict()): opts = { "a": settings.workflows.subread.annotations, } opts.update(options) opts['o'] = output_table cmd = ("featureCounts" + " " + dict_to_cmd_opts(opts) + " ") def run(): files = [ f for f in input_sams if os.path.exists(f) and os.stat(f).st_size > 0 ] if files: return CmdAction(cmd + " ".join(files), verbose=True).execute() else: open(output_table, 'w').close() return { "name": "featureCounts: " + output_table, "file_dep": input_sams, "targets": [output_table], "actions": [run] }
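# Sketch of the empty-input guard, using hypothetical paths: when every
# SAM in input_sams is missing or zero-length, run() short-circuits to
# an empty counts table rather than invoking featureCounts. "T" is
# featureCounts' thread-count flag.
def _example_featurecounts():
    task = featureCounts(["sampleA.sam", "sampleB.sam"], "counts.tsv",
                         options={"T": "4"})
    task["actions"][0]()  # with no real SAMs present, just touches counts.tsv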
def pick_denovo_otus(fasta_in, otutab_out, keep_tempfiles=False,
                     strand="plus", log_file=None, resume=False,
                     quiet=False, chimera_standard=None,
                     truncate_opts={}, derep_opts={}, sort_opts={},
                     cluster_opts={}, chimera_opts={}, map_opts={}):
    opts = dict(input=fasta_in, output=otutab_out, print_cmd=True)
    # Chimera reference precedence: an explicit chimera_opts db, then
    # the chimera_standard keyword, then the configured default.
    if 'db' in chimera_opts:
        s = chimera_opts['db']
    elif bool(chimera_standard) is True:
        s = chimera_standard
    else:
        s = settings.workflows.usearch.chimera_gold_standard
    opts['chimera_standard'] = s
    opts['tmp_dir'] = otutab_out + "_usearch"
    kvopts = [("truncate_opts", truncate_opts), ("derep_opts", derep_opts),
              ("sort_opts", sort_opts), ("cluster_opts", cluster_opts),
              ("chimera_opts", chimera_opts), ("map_opts", map_opts)]
    for name, value in kvopts:
        if value:
            s = " ".join('='.join(pair) for pair in value.iteritems())
            opts[name] = "'" + s + "'"
    if not log_file:
        log_file = opts['tmp_dir'] + ".log"
    opts['log_file'] = log_file
    cmd = "usearch_denovo_otus " + dict_to_cmd_opts(opts)
    targets = [otutab_out, join(opts['tmp_dir'], "nonchimeric.fa"), log_file]

    def _run():
        ret = CmdAction(cmd).execute()
        if ret is None or not issubclass(type(ret), Exception):
            if not keep_tempfiles:
                for f in os.listdir(opts['tmp_dir']):
                    if f != "nonchimeric.fa" and \
                       os.path.isfile(join(opts['tmp_dir'], f)):
                        os.remove(join(opts['tmp_dir'], f))
        else:
            for t in targets:
                if not os.path.exists(t):
                    open(t, 'w').close()
        return ret

    return {
        "name": "usearch_pick_denovo_otus: " + otutab_out,
        "actions": [_run],
        "file_dep": [fasta_in],
        "targets": targets,
        "title": usearch_rusage([fasta_in])
    }
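# Sketch of how the per-stage option dicts are serialized (hypothetical
# file names; "otu_radius_pct" is assumed to be a recognized usearch
# clustering knob): each non-empty *_opts dict collapses into a single
# quoted "key=value" string for the usearch_denovo_otus driver.
def _example_pick_denovo_otus():
    task = pick_denovo_otus("seqs.fa", "otus.txt",
                            cluster_opts={"otu_radius_pct": "3"},
                            chimera_opts={"db": "gold.fa"})
    print(task["name"])  # chimera_opts' db wins over chimera_standard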
def align(maybe_paired_fastq, output_sam, options=dict()): opts = { "unique": "", "hamming": "", "index": settings.workflows.subread.index } opts.update(options) opts['output'] = output_sam if type(maybe_paired_fastq) in (tuple, list): opts['read'], opts['read2'] = maybe_paired_fastq deps = maybe_paired_fastq else: opts['read'] = maybe_paired_fastq deps = [maybe_paired_fastq] cmd = "subread-align " + dict_to_cmd_opts(opts) def run(): if any(os.stat(f).st_size < 1 for f in deps): open(output_sam, 'w').close() else: return CmdAction(cmd, verbose=True).execute() return { "name": "subread_align: " + output_sam, "actions": [run], "file_dep": deps, "targets": [output_sam] }
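# Paired- vs. single-end dispatch in one sketch (hypothetical files): a
# tuple or list fills both --read and --read2 for subread-align, and the
# file_dep list widens to match.
def _example_subread_align():
    single = align("sample.fastq", "single.sam")
    paired = align(("sample_R1.fastq", "sample_R2.fastq"), "paired.sam")
    print(single["file_dep"])
    print(paired["file_dep"])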
def sort(input_bam, output_prefix, memory_level="768M", num_threads=1, **kwargs): """Sort a bam file by sequence name with samtools. :param input_bam: String; file name of input bam file. :param output_prefix: String; file name of output sorted bam file without the .bam suffix :keyword memory_level: String; K/M/G human readable amount of ram to give each sorting thread :keyword num_threads: Int or string; number of threads to use when sorting. """ output_file = output_prefix + ".bam" opts = {'n': "", '@': str(num_threads), 'm': memory_level} opts.update(kwargs) cmd = ("samtools sort " + " " + dict_to_cmd_opts(opts) + " " + input_bam + " " + output_prefix) return { "name": "samtools.sort: " + output_file, "file_dep": [input_bam], "actions": [cmd], "targets": [output_file] }
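# The generated action, roughly (hypothetical names; flag order depends
# on dict_to_cmd_opts):
#   samtools sort -n -@ 4 -m 768M input.bam input.sorted
def _example_samtools_sort():
    task = sort("input.bam", "input.sorted", num_threads=4)
    print(task["actions"][0])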
def sequence_pair(seqfname1, seqfname2,
                  outfname1="/dev/null", outfname2="/dev/null",
                  from_format=None, format_to="fastq", options=dict()):
    extra_options = dict_to_cmd_opts(options)
    targets = [target for target in [outfname1, outfname2]
               if target != "/dev/null"]
    if not from_format:
        # Guess the source format from the input file, not the output:
        # the output names carry the destination format's extension.
        from_format = guess_seq_filetype(seqfname1)
    pair_cmd = ("sequence_pair"
                " -f {from_format} -t {format_to}"
                " -1 {r1out} -2 {r2out} ").format(from_format=from_format,
                                                  format_to=format_to,
                                                  r1out=outfname1,
                                                  r2out=outfname2)
    pair_cmd += extra_options + " {} {}".format(seqfname1, seqfname2)
    return {
        "name": "sequence_pair: %s %s" % (outfname1, outfname2),
        "actions": [pair_cmd],
        "file_dep": [seqfname1, seqfname2],
        "targets": targets
    }
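# Sketch with hypothetical names: mates left at the /dev/null default
# are dropped from the targets list, so only requested outputs are
# tracked by the runner.
def _example_sequence_pair():
    task = sequence_pair("r1.sff", "r2.sff", outfname1="r1.fastq")
    print(task["targets"])  # ['r1.fastq'] only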
def run(pcoa_cmd=pcoa_cmd):
    # default_opts and pcl_fname come from the enclosing workflow scope;
    # a value of True (or anything falsy) acts as an "auto" placeholder
    # that is filled in from the pcl file at execution time.
    if default_opts['meta'] is True or not default_opts['meta']:
        default_opts['meta'] = last_meta_name(pcl_fname)
    if default_opts['id'] is True or not default_opts['id']:
        default_opts['id'] = sample_id(pcl_fname)
    pcoa_cmd += dict_to_cmd_opts(default_opts)
    pcoa_cmd += " " + pcl_fname + " "
    return CmdAction(pcoa_cmd, verbose=True).execute()
def demultiplex(map_fname, fasta_fname, qual_fname, output_fname,
                qiime_opts={}):
    """Workflow to demultiplex a barcoded set of 16S sequences from a
    single run. This workflow wraps the qiime split_libraries.py script.
    For information on what the split_libraries.py script does, check
    out the qiime documentation:
    - http://qiime.org/tutorials/tutorial.html#assign-samples-to-multiplex-reads
    - http://qiime.org/scripts/split_libraries.html

    :param map_fname: String; File path location of the map.txt metadata
                      file
    :param fasta_fname: String; File path to the input, multiplexed fasta
                        file
    :param qual_fname: String; File path to the qual file corresponding
                       to ``fasta_fname``.
    :param output_fname: String; File path to where the demultiplexed
                         reads will be saved in fasta format.
    :keyword qiime_opts: Dictionary; A dictionary of command line options
                         to be passed to the wrapped split_libraries.py
                         script. No - or -- flags are necessary; the
                         correct - or -- flags are inferred based on the
                         length of the option. For boolean options, use
                         the key/value pattern of { "my-option": "" }.

    External dependencies:
    - Qiime 1.8.0: https://github.com/qiime/qiime

    """
    output_dir, output_basename = os.path.split(output_fname)
    opts = dict_to_cmd_opts(qiime_opts)
    cmd = ("split_libraries.py"+
           " --map="+map_fname+
           " --fasta="+fasta_fname+
           " --qual="+qual_fname+
           " --dir-prefix="+output_dir+
           " "+opts)

    actions = [cmd]
    # split_libraries.py always writes seqs.fna; rename if another
    # basename was requested
    if output_basename != "seqs.fna":
        default_out = os.path.join(output_dir, "seqs.fna")
        actions.append("mv '%s' '%s'"%(default_out, output_fname))

    return {
        "name": "demultiplex:"+fasta_fname,
        "actions": actions,
        "file_dep": [map_fname, fasta_fname, qual_fname],
        "targets": [output_fname],
        "title": lambda t: t.name+" Estimated time=%.2f"%(
            sum(os.stat(f).st_size for f in t.file_dep)/1024./1024/5)
    }
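# Usage sketch (hypothetical paths): qiime_opts keys become
# split_libraries.py flags, e.g. "barcode-type" turns into
# --barcode-type via dict_to_cmd_opts.
def _example_demultiplex():
    task = demultiplex("map.txt", "run1.fna", "run1.qual",
                       "demux/seqs.fna",
                       qiime_opts={"barcode-type": "variable_length"})
    print(task["actions"])  # the split_libraries.py call, then the mv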
def assign_taxonomy(in_fasta, out_dir, qiime_opts={}): name = rmext(os.path.basename(in_fasta))+"_tax_assignments.txt" taxonomy_out = os.path.join(out_dir, name) default_opts = dict([ ("r", settings.workflows.sixteen.otu_refseq), ("t", settings.workflows.sixteen.otu_taxonomy), ]+list(qiime_opts.items())) cmd = ("assign_taxonomy.py -i "+in_fasta+" -o "+out_dir+ " "+dict_to_cmd_opts(default_opts)) return { "name" : "assign_taxonomy: "+taxonomy_out, "targets" : [taxonomy_out], "actions" : [cmd], "file_dep" : [default_opts['r'], default_opts['t'], in_fasta] }
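# Sketch (hypothetical path): the reference fasta and taxonomy map are
# declared in file_dep alongside the input, so swapping either database
# re-triggers this task on the next run.
def _example_assign_taxonomy():
    task = assign_taxonomy("otus/rep_set.fna", "tax_out")
    print(task["file_dep"])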
def bowtie2_align(infiles_list, output_file, **opts): """Workflow to use bowtie2 to map a list of input sequence files against a bowtie2 database. Additional keyword options are used directly as bowtie2 command-line flags. :param infiles_list: List of strings; File paths to input search queries as sequences in fastq format :param output_file: String; File path to the search results, in sam format. :keyword reference_db: String; File path to the bowtie2 reference db basename. Fed immediately into bowtie2's -x option. :keyword threads: String or int; Number of threads to use when performing the mapping. Uses bowtie2's -p option. External dependencies: - Bowtie2 2.2.1: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml Resource utilization: - Ram: 2.0-3.0G - CPU: 1 core; > 1 core depending on 'threads' option """ all_opts = { # defaults in here "reference_db": settings.workflows.alignment.kegg_bowtie2_db, "threads": 2, } all_opts.update(opts) cmd = ("bowtie2 " + " -x " + all_opts.pop('reference_db') + " -p " + str(all_opts.pop('threads')) + " -U " + ",".join(infiles_list) + " --no-head" + " --very-sensitive" + " " + dict_to_cmd_opts(all_opts) + " > " + output_file) return { "name": "bowtie2_align:" + output_file, "actions": [cmd], "file_dep": infiles_list, "targets": [output_file] }
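# Usage sketch (hypothetical paths): reference_db and threads are
# consumed by the wrapper itself; any other keyword lands on the bowtie2
# command line via dict_to_cmd_opts.
def _example_bowtie2_align():
    task = bowtie2_align(["s1.fastq", "s2.fastq"], "hits.sam",
                         reference_db="/dbs/kegg", threads=4)
    print(task["actions"][0])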
def stacked_bar_chart(biom_fname, output_dir, qiime_opts=dict()):
    """Workflow to produce stacked bar charts of biom-formatted taxonomic
    profiles using QIIME's `summarize_taxa_through_plots.py`.

    :param biom_fname: String; the file name of a single biom-formatted
                       otu table or taxonomic profile to be visualized.
    :param output_dir: String; the full path to a directory wherein the
                       summary plots and charts will be placed.
    :keyword qiime_opts: Dictionary; A dictionary of command line options
                         to be passed to the wrapped
                         summarize_taxa_through_plots.py script. No - or
                         -- flags are necessary; the correct - or --
                         flags are inferred based on the length of the
                         option. For boolean options, use the key/value
                         pattern of { "my-option": "" }.

    External dependencies
    - Qiime 1.8.0: https://github.com/qiime/qiime-deploy

    """
    cmd = ("summarize_taxa_through_plots.py "
           "-i {} -o {} ".format(biom_fname, output_dir))

    default_opts = {"force": True}
    default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)
    cmd += opts

    target = os.path.join(output_dir,
                          addtag(os.path.basename(biom_fname), "L2"))
    yield {
        "name": "stacked_bar_chart: " + output_dir,
        "actions": [cmd],
        "file_dep": [biom_fname],
        "targets": [target]
    }
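# stacked_bar_chart is a generator (note the yield), so pull its single
# task dict out with next(); paths here are hypothetical.
def _example_stacked_bar_chart():
    task = next(stacked_bar_chart("profile.biom", "plots_out"))
    print(task["targets"])  # the L2 (phylum-level) summary table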
def to_bam(input_sam, output_bam, threads=1, **kwargs): kwargs['@'] = kwargs.get("@", threads) opts = dict([("b", ""), ("o", output_bam)] + list(kwargs.iteritems())) cmd = ("samtools view " + " " + dict_to_cmd_opts(opts) + " " + input_sam) def _perfhint(task): threads = int(opts.get("@", 1)) mem = 400 # MB size_mb = os.stat(first(task.file_dep)).st_size / 1024. / 1024. rate = 1800. # MB/clock min time = 20 + (size_mb / rate) return ("{n} Estimated mem={mem:.0f} " "time={time:.0f} threads={threads:.0f}").format( n=task.name, mem=mem, time=time, threads=threads) return { "name": "samtools.to_bam: " + output_bam, "file_dep": [input_sam], "targets": [output_bam], "actions": [cmd], "title": _perfhint }
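# Usage sketch (hypothetical file): '@' is samtools' thread flag, and
# the "title" hook renders _perfhint's rough memory/time estimate for a
# scheduler to read.
def _example_to_bam():
    task = to_bam("aligned.sam", "aligned.bam", threads=4)
    print(task["actions"][0])  # samtools view -b -@ 4 -o aligned.bam ...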
def pick_otus_closed_ref(in_fasta, out_biom, out_tsv=None, non_chimeric_otu_seqs=None, denovo_otu_txt=None, sample_metadata_fname=None, taxonomy_fname=None, ref_fasta=None, keep_tempfiles=False, strand='plus', chimera_standard=None, log_file=None, resume=False, quiet=False, tmp_folder=None, usearch_closed_opts={}, denovo_opts={}): if not tmp_folder: tmp_folder = in_fasta + "_usearch" if not log_file: log_file = in_fasta + "_usearch.log" if not taxonomy_fname: taxonomy_fname = settings.workflows.sixteen.otu_taxonomy if not ref_fasta: ref_fasta = settings.workflows.usearch.otu_db if not out_tsv: out_tsv = out_biom + ".tsv" if 'db' in denovo_opts.get('chimera_opts', {}): chimera = denovo_opts['chimera_opts']['db'] elif 'chimera_standard' in denovo_opts: chimera = denovo_opts['chimera_standard'] elif bool(chimera_standard) is True: chimera = chimera_standard else: chimera = settings.workflows.usearch.chimera_gold_standard opts = dict(input=in_fasta, output=out_tsv, taxonomy=taxonomy_fname, reference=ref_fasta, strand=strand, chimera_standard=chimera, quiet=quiet, print_cmd=True, log_file=log_file, denovo_otu_table=denovo_otu_txt, resume=resume, keep_tempfiles=True, tmp_dir=tmp_folder, otu_sequences=non_chimeric_otu_seqs) kvopts = list(denovo_opts.items()) + [("closed_opts", usearch_closed_opts)] for name, value in kvopts: if value: s = " ".join('='.join(pair) for pair in value.iteritems()) opts[name] = "'" + s + "'" usearch_cmd = "uclust_closed_otus " + dict_to_cmd_opts(opts) biom_cmd = ("biom convert -i " + out_tsv + " -o " + out_biom + " --table-type='OTU Table' --process-obs-metadata=taxonomy" + " --output-metadata-id=taxonomy") if sample_metadata_fname: biom_cmd += " --sample-metadata-fp=" + sample_metadata_fname def _run(): ret = CmdAction(usearch_cmd).execute() if ret is None or not issubclass(type(ret), Exception): ret = CmdAction(biom_cmd).execute() if not keep_tempfiles: for f in os.listdir(opts['tmp_dir']): if f != "nonchimeric.fa" and \ os.path.isfile(join(opts['tmp_dir'], f)): os.remove(join(opts['tmp_dir'], f)) else: for t in targets: if not os.path.exists(t): open(t, 'w').close() return ret file_dep = [in_fasta, taxonomy_fname, ref_fasta, chimera] targets = [ out_biom, out_tsv, join(opts['tmp_dir'], "nonchimeric.fa"), log_file ] yield { "name": "usearch_pick_otus_closed_ref: " + out_biom, "targets": targets, "actions": [_run], "file_dep": file_dep, "title": usearch_rusage(file_dep) }
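# Precedence sketch for the chimera reference, with hypothetical paths:
# denovo_opts['chimera_opts']['db'] wins, then
# denovo_opts['chimera_standard'], then the chimera_standard keyword,
# then the configured default. Note this workflow is also a generator.
def _example_pick_otus_closed_ref():
    task = next(pick_otus_closed_ref("seqs.fa", "otus.biom",
                                     chimera_standard="gold.fa"))
    print(task["file_dep"])  # includes gold.fa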
def humann2(seqfile_in, output_dir, scratch=None, **opts):
    """Workflow to find pathway and gene lists grouped by organism from
    raw whole genome shotgun reads. Additional keywords are interpreted
    as command line options to be passed to the wrapped humann2
    executable. No - or -- flags are necessary; the correct - or --
    flags are inferred based on the length of the option. For boolean
    options, use the key/value pattern of { "my-option": "" }.

    :param seqfile_in: String; Path to the file to be fed into HUMAnN2.
    :param output_dir: String; Directory path to where HUMAnN2 deposits
                       its results.

    External dependencies:
    - `HUMAnN2 v0.1.9 <https://bitbucket.org/biobakery/humann2>`_

    Resource utilization:
    - Ram: 4-6G
    - Time: 1 hr

    """
    default_opts = {
        "input": seqfile_in,
        "output": os.path.abspath(output_dir),
        "o-log": os.path.join(output_dir, "humann2_log.txt"),
        "memory-use": "minimum",
        "log-level": "INFO",
        "remove-temp-output": True,
        "output-format": "tsv"
    }
    default_opts.update(opts)

    suffix = default_opts['output-format']

    def _join(s):
        file, ext = os.path.splitext(seqfile_in)
        file = file + '.' + suffix
        return new_file(addtag(file, s), basedir=default_opts['output'])

    targets = map(_join, ("genefamilies", "pathcoverage", "pathabundance"))

    if scratch:
        old_out = default_opts['output']
        default_opts.pop('output', None)
        dbs = _get_humann2_dbs(default_opts.get("chocophlan", None),
                               default_opts.get("uniref", None))
        default_opts.pop('chocophlan', None), default_opts.pop('uniref', None)
        cmd = "humann2 " + dict_to_cmd_opts(default_opts, longsep=" ")
        actions = [
            """ tdir=$(mktemp -d -p {sdir});
            cd ${{tdir}};
            mkdir -pv ${{tdir}}/dbs;
            cp -rv {dbs} ${{tdir}}/dbs/;
            {humann2} --output ${{tdir}} \
                      --chocophlan ${{tdir}}/dbs/chocophlan \
                      --uniref ${{tdir}}/dbs/uniref;
            mv -iv ${{tdir}}/*.* {final_out};
            rm -rvf ${{tdir}};
            """.format(sdir=scratch, dbs=" ".join(dbs),
                       humann2=cmd, final_out=old_out)
        ]
    else:
        actions = ["humann2 " + dict_to_cmd_opts(default_opts, longsep=" ")]

    def _perfhint(task):
        threads = int(default_opts.get('threads', 1))
        insize = os.stat(seqfile_in).st_size
        # estimated number of reads (in millions) for a fastq input file
        est_reads = insize / 4 / 10e5
        return ("{n} Estimated mem={mem:.0f} time={time:.0f}, "
                "threads={threads:.0f}").format(
                    n=task.name,
                    mem=(750 + (3.5 * log(est_reads))),
                    time=(3.5 + ((2 * est_reads) / threads)) * 60,
                    threads=threads)

    return {
        "name": "humann2:" + output_dir,
        "file_dep": [seqfile_in],
        "targets": targets,
        "actions": actions,
        "title": _perfhint
    }
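# Usage sketch (hypothetical paths): extra keywords such as threads
# become humann2 flags, and passing scratch= swaps in the mktemp action
# that stages the chocophlan/uniref databases on local disk first.
def _example_humann2():
    task = humann2("sample.fastq", "humann2_out", threads=8)
    print(task["targets"])  # genefamilies/pathcoverage/pathabundance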
def knead_data(infiles, output_basestr, scratch=None, **opts):
    """Workflow to sanitize host data and otherwise quality filter
    metagenomic reads. Input sequences are mapped against a host
    database using bowtie2; any sequences that map back to the host
    database are discarded. Additional keywords are interpreted as
    command line options to be passed to the wrapped kneaddata
    executable. No - or -- flags are necessary; the correct - or --
    flags are inferred based on the length of the option. For boolean
    options, use the key/value pattern of { "my-option": "" }.

    :param infiles: Iterable of strings; File paths to the input
                    sequences. Should be either a one-length or
                    two-length iterable. Two-length iterables are
                    treated as paired-end data.
    :param output_basestr: String; Path to the directory and base
                           filename where the output cleaned sequences
                           will be saved.

    In default_opts, "reference-db" is the location of the host
    reference database used for decontamination. It defaults to the
    workflow settings and can be overridden either on the command line
    or by editing "input/_options/decontaminate.txt" in the skeleton
    method. Refer to
    http://huttenhower.sph.harvard.edu/docs/anadama/your_own_pipeline.html
    for more information.

    External dependencies:
    - `kneaddata <https://bitbucket.org/biobakery/kneaddata>`_
    - `bowtie2 <http://bowtie-bio.sourceforge.net/index.shtml>`_

    Resource utilization:
    - RAM: 4 G

    """
    path, base = os.path.split(output_basestr)
    default_opts = {
        "output-prefix": base,
        "output": output_basestr + "_knead",
        "reference-db": settings.workflows.knead.reference_db,
    }
    default_opts.update(opts)

    db_bases = map(os.path.basename, default_opts['reference-db'])

    def _targets(nums=[None]):
        outdir = default_opts['output']
        prefix = default_opts['output-prefix']
        yield os.path.join(outdir, prefix + ".fastq")
        for num in nums:
            for db_base in db_bases:
                to_join = [prefix, db_base, num, "contam.fastq"]
                n = "_".join(str(s) for s in to_join if s)
                yield os.path.join(outdir, n)

    def _perfhint(task):
        threads = int(default_opts.get('threads', 1))
        insize = sum(os.stat(f).st_size for f in infiles)
        dbsize = sum(os.stat(f).st_size
                     for pat in default_opts['reference-db']
                     for f in glob(pat + "*"))
        return ("{n} Estimated mem={mem:.0f} time={time:.0f}, "
                "threads={threads}").format(
                    n=task.name,
                    mem=dbsize / 1024 / 1024 + (1500),
                    time=60 + (insize / 9e8 / (threads)),
                    threads=threads)

    if type(infiles) in (unicode, str):
        infiles_list = [infiles]
    else:
        infiles_list = list(infiles)

    if len(infiles_list) > 1:
        one, two = infiles_list
        default_opts['input'] = one
        default_opts['input2'] = two
        targets = list(_targets(nums=[1, 2]))
    else:
        default_opts['input'] = infiles_list[0]
        targets = list(_targets())

    if scratch:
        db_patterns = " ".join(s + "*" for s in default_opts['reference-db'])
        db_printf_cmds = " ".join([
            '--reference-db "${{tdir}}/dbs/{}"'.format(db) for db in db_bases
        ])
        refs = default_opts.pop("reference-db", None)
        knead = "kneaddata " + dict_to_cmd_opts(default_opts)
        if refs:
            default_opts['reference-db'] = refs
        cmd = """ tdir=$(mktemp -d -p {sdir});
        cd ${{tdir}};
        mkdir -pv ${{tdir}}/dbs;
        cp {db_patterns} ${{tdir}}/dbs/;
        {knead} {db_printf_cmds};
        rm -rvf ${{tdir}};
        """.format(sdir=scratch, db_patterns=db_patterns,
                   knead=knead, db_printf_cmds=db_printf_cmds)
    else:
        cmd = "kneaddata " + dict_to_cmd_opts(default_opts)

    return {
        "name": "kneaddata:" + output_basestr,
        "targets": targets,
        "file_dep": infiles_list,
        "actions": [cmd],
        "title": _perfhint,
    }
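# Usage sketch (hypothetical paths): a two-item iterable is treated as a
# paired-end run, which widens the expected per-database contaminant
# targets to one file per mate.
def _example_knead_data():
    task = knead_data(["s_R1.fastq", "s_R2.fastq"], "cleaned/s")
    print(task["targets"])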
def metaphlan2(files_list, scratch=None, **opts):
    """Workflow to perform taxonomic profiling from whole metagenome
    shotgun sequences. Additional keyword options are used directly as
    metaphlan2.py command-line flags.

    :param files_list: List of strings; File paths to input sequences,
                       in fastq format.

    External dependencies
    - Metaphlan2 @tip: https://bitbucket.org/biobakery/metaphlan2

    Resource utilization:
    - Ram: 1.5-3.0G

    """
    def_base = opts.get("output_file") or files_list[0]
    all_opts = {
        'bt2_ps': 'very-sensitive',
        'bowtie2db': settings.workflows.metaphlan2.bowtie2db,
        'mpa_pkl': settings.workflows.metaphlan2.mpa_pkl,
        "bowtie2out": new_file(addext(def_base, "bowtie2out.txt")),
        "output_file": new_file(addext(def_base, "metaphlan2"))
    }
    all_opts.update(opts)

    if 'input_type' not in all_opts:
        guessed = guess_seq_filetype(files_list[0])
        if guessed not in ('fasta', 'fastq'):
            raise ValueError("Need sequences in fasta or fastq format, "
                             "or provide keyword 'input_type'")
        all_opts['input_type'] = biopython_to_metaphlan[guessed]

    targets = [all_opts['output_file'], all_opts['bowtie2out']]
    if 'biom' in opts:
        targets.append(opts['biom'])

    cmd = starters.cat(files_list, guess_from=files_list[0])

    if scratch:
        db, pkl = all_opts['bowtie2db'], all_opts['mpa_pkl']
        all_opts.pop('bowtie2db', None), all_opts.pop('mpa_pkl', None)
        dbbase, pklbase = map(os.path.basename, (db, pkl))
        cmd += (" | metaphlan2.py" + " " + dict_to_cmd_opts(all_opts))
        actions = [
            """ tdir=$(mktemp -d -p {sdir});
            cd ${{tdir}};
            mkdir -pv ${{tdir}}/dbs;
            cp {db}* {pkl} ${{tdir}}/dbs;
            {cmd} --mpa_pkl ${{tdir}}/dbs/{pklbase} \
                  --bowtie2db ${{tdir}}/dbs/{dbbase};
            rm -rvf ${{tdir}};
            """.format(sdir=scratch, pkl=pkl, db=db, cmd=cmd,
                       pklbase=pklbase, dbbase=dbbase)
        ]
    else:
        cmd += (" | metaphlan2.py" + " " + dict_to_cmd_opts(all_opts))
        actions = [cmd]

    def _perfhint(task):
        threads = int(all_opts.get('nproc', 1))
        insize = sum(os.stat(f).st_size for f in files_list)
        return ("{n} Estimated mem={mem:.0f} time={time:.0f}, "
                "threads={threads:.0f}").format(
                    n=task.name,
                    mem=1.5 * 1024,
                    time=15 + (insize / 1.2e9 / (threads)),
                    threads=threads)

    return dict(
        name="metaphlan2:" + all_opts['output_file'],
        actions=actions,
        file_dep=files_list,
        targets=targets,
        title=_perfhint,
    )
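# Usage sketch (hypothetical paths): input_type is guessed from the
# first file's extension unless given, "nproc" is metaphlan2's thread
# flag, and a "biom" keyword adds a biom-formatted target.
def _example_metaphlan2():
    task = metaphlan2(["s1.fastq", "s2.fastq"], nproc=4,
                      biom="profile.biom")
    print(task["targets"])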
def pick_otus_open_ref(input_fname, output_dir, verbose=None, qiime_opts={}):
    """Workflow to perform open-reference OTU picking. Similar to
    closed-reference OTU picking, this workflow generates a
    biom-formatted OTU table from demultiplexed 16S reads. This workflow
    (in general terms) wraps qiime's pick_open_reference_otus.py, which
    itself wraps either uclust or usearch. Note that uclust and usearch
    require a fairly large memory footprint (1.5-2.0G in some cases).

    :param input_fname: String; File path to the input, fasta-formatted
                        16S sequences
    :param output_dir: String; Path to the directory where the output
                       OTU table will be saved as 'otu_table.biom'.
                       Other qiime-specific logs will go there, too.
    :keyword verbose: Boolean; set to True to print the commands that
                      are run as they are run
    :keyword qiime_opts: Dictionary; A dictionary of command line options
                         to be passed to the wrapped
                         pick_open_reference_otus.py script. No - or --
                         flags are necessary; the correct - or -- flags
                         are inferred based on the length of the option.
                         For boolean options, use the key/value pattern
                         of { "my-option": "" }.

    External dependencies:
    - Qiime 1.8.0: https://github.com/qiime/qiime-deploy
    - USEARCH: (only if using the usearch option)
      http://www.drive5.com/usearch/

    Resource utilization:
    - RAM: >1.5 G

    """
    output_fname = new_file("otu_table.biom", basedir=output_dir)
    revcomp_fname = new_file(
        "revcomp.fna", basedir=os.path.dirname(input_fname))

    verbose = settings.workflows.verbose if verbose is None else verbose

    default_opts = {
        "reference_fp": settings.workflows.sixteen.otu_refseq
    }
    default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)

    cmd = ("pick_open_reference_otus.py"+
           " --input_fp={}"+
           " --output_dir="+output_dir+
           " -f"+
           " "+opts)

    revcomp_cmd = ("sequence_convert"+
                   " --format=fasta"+
                   " --to=fasta "+
                   " -r"+
                   " "+input_fname+
                   " > "+revcomp_fname)

    def run(targets):
        # Try the reads as-is; fall back to the reverse complement if
        # the first pass fails or produces an empty OTU table.
        strategies.backup(
            (CmdAction(cmd.format(input_fname), verbose=verbose),
             strategies.Group(
                 CmdAction(revcomp_cmd),
                 CmdAction(cmd.format(revcomp_fname), verbose=verbose))),
            extra_conditions=[
                lambda ret, output_fname: os.stat(output_fname).st_size == 0
            ],
            output_fname=output_fname
        )

    return {
        "name": "pick_otus_open_ref:"+input_fname,
        "actions": [run],
        "targets": [output_fname],
        "file_dep": [input_fname],
    }
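# Fallback sketch (hypothetical paths): strategies.backup first runs the
# reads as-is and, only if that fails or leaves an empty OTU table,
# reverse-complements the input and retries.
def _example_pick_otus_open_ref():
    task = pick_otus_open_ref("demux/seqs.fna", "otus_out")
    print(task["targets"])  # ['otus_out/otu_table.biom']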
def demultiplex_illumina(fastq_fnames, barcode_fnames, map_fname, output_fname, verbose=True, qiime_opts={}): output_dir, output_basename = os.path.split(output_fname) default_opts = { "i": ",".join(fastq_fnames), "b": ",".join(barcode_fnames), "m": map_fname, "o": output_dir } default_opts.update(qiime_opts) opts = dict_to_cmd_opts(default_opts) cmd = "split_libraries_fastq.py " revcomp_map_fname = new_file(addtag(map_fname, "revcomp"), basedir=output_dir) revcomp_opts = default_opts.copy() revcomp_opts['m'] = revcomp_map_fname revcomp_opts = dict_to_cmd_opts(revcomp_opts) def _revcomp(): from anadama.util import deserialize_map_file, serialize_map_file from Bio.Seq import Seq def _reverse(sample): seq = Seq(sample.BarcodeSequence).reverse_complement() return sample._replace(BarcodeSequence=str(seq)) with open(map_fname) as from_map: from_samples = deserialize_map_file(from_map) serialize_map_file( ( _reverse(s) for s in from_samples ), revcomp_map_fname ) default_out = os.path.join(output_dir, "seqs.fna") output_exists = lambda *args, **kwargs: ( not os.path.exists(default_out) or not os.stat(default_out).st_size > 1 ) def run(): return strategies.backup( (CmdAction(cmd+opts, verbose=verbose), strategies.Group( PythonAction(_revcomp), CmdAction(cmd+revcomp_opts,verbose=verbose))), extra_conditions=[output_exists] ) actions = [run] if output_basename != "seqs.fna": actions.append("mv '%s' '%s'"%(default_out, output_fname)) return { "name": "demultiplex_illumina:"+output_fname, "actions": actions, "file_dep": list(fastq_fnames) + list(barcode_fnames) + [map_fname], "targets": [output_fname], "title": lambda t: t.name+" Estimated time=%.2f"%( sum(os.stat(f).st_size for f in t.file_dep)/1024./1024/5) }
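# Fallback sketch (hypothetical paths): if the first pass leaves
# seqs.fna missing or empty, the map file's barcodes are
# reverse-complemented and split_libraries_fastq.py is retried.
def _example_demultiplex_illumina():
    task = demultiplex_illumina(["run_R1.fastq"], ["run_I1.fastq"],
                                "map.txt", "demux/seqs.fna")
    print(task["name"])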