def _configure(self):
    if self.options['infer_pairs'].get('infer'):
        paired, notpaired = infer_pairs(self.raw_seq_files)
        self.raw_seq_files = paired + notpaired

    maybe_tasks = list()
    for maybe_pair in self.raw_seq_files:
        # Both tuples and lists count as forward/reverse pairs
        is_pair = isinstance(maybe_pair, (tuple, list))
        if is_pair:
            pair, tasks = maybe_convert_to_fastq(maybe_pair,
                                                 self.products_dir)
            self.paired_fastq_files.append(pair)
            maybe_tasks.extend(tasks)
        elif util.guess_seq_filetype(maybe_pair) == 'bam':
            # BAM input is split into paired fastqs plus a singleton file
            prefix = util.new_file(util.rmext(basename(maybe_pair)),
                                   basedir=self.products_dir)
            t = samtools.to_paired_fastq(maybe_pair, prefix)
            paired, single = t['targets'][:2], t['targets'][2]
            self.paired_fastq_files.append(paired)
            self.unpaired_fastq_files.append(single)
            maybe_tasks.append(t)
        else:
            single, tasks = maybe_convert_to_fastq([maybe_pair],
                                                   self.products_dir)
            self.unpaired_fastq_files.append(single[0])
            maybe_tasks.extend(tasks)

    for task in maybe_tasks:
        yield task

    for pair in self.paired_fastq_files:
        align_sam = util.new_file(_to_merged(basename(pair[0]), tag="align"),
                                  basedir=self.products_dir)
        align_sam += ".sam"
        self.align_sams.append(align_sam)
        yield subread.align(pair, align_sam,
                            self.options.get('subread_align', dict()))

    for single in self.unpaired_fastq_files:
        align_sam = util.new_file(util.addtag(basename(single), "align"),
                                  basedir=self.products_dir)
        align_sam += ".sam"
        self.align_sams.append(align_sam)
        yield subread.align(single, align_sam,
                            self.options.get('subread_align', dict()))

    for align_sam in self.align_sams:
        count_table = util.new_file(util.addtag(basename(align_sam), "count"),
                                    basedir=self.products_dir)
        self.count_tables.append(count_table)
        yield subread.featureCounts([align_sam], count_table,
                                    self.options.get('featureCounts', dict()))

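# The _configure generator above yields AnADAMA task dicts built by
# subread.align and subread.featureCounts. A minimal sketch of the
# task-dict convention these workflows follow (the same keys appear in
# the stacked_bar_chart and demultiplex_illumina tasks below); file
# names here are hypothetical:
#
#     {"name": "subread_align: sample1_align.sam",
#      "actions": ["subread-align ... -o sample1_align.sam"],
#      "file_dep": ["sample1.fastq"],
#      "targets": ["sample1_align.sam"]}
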
def _actually_du():
    # Closure over url, local_dir, local_cached, remote_size, namespace,
    # asp, dcc_user, dcc_pw, ncbi_srv, ncbi_user, ncbi_path, and
    # ncbi_keyfile from the enclosing scope.
    srv, remote_path = parse_fasp_url(url)
    bn = basename(remote_path)
    local_file = join(local_dir, bn)
    # Skip the download if a complete local copy is already cached
    skip = (bn in local_cached
            and os.stat(local_file).st_size == remote_size)
    if not skip:
        ret = asp.download_file(srv, dcc_user, dcc_pw, remote_path, local_dir)
        if not ret:
            raise Exception("Download failed: " + url)
    to_rm, files_to_upload = untar(local_file)
    for i, f in enumerate(files_to_upload):
        new_f = addtag(f, namespace)
        os.rename(f, new_f)
        files_to_upload[i] = new_f
    names_sizes = [(basename(f), os.stat(f).st_size)
                   for f in files_to_upload]
    for f in files_to_upload:
        ret = asp.upload_file(ncbi_srv, ncbi_user, None, f,
                              ncbi_path, keyfile=ncbi_keyfile)
    # Write a tab-separated manifest of uploaded file names and sizes
    with open(local_file + "." + namespace + ".complete", 'w') as f:
        for name_size in names_sizes:
            print >> f, "\t".join(map(str, name_size))
    # Clean up extracted files, deepest paths first
    for f in reversed(to_rm):
        try:
            if os.path.isdir(f):
                os.rmdir(f)
            else:
                os.remove(f)
        except OSError:
            print >> sys.stderr, "Unable to remove " + f

def split_454_style(self, seqfiles_to_split):
    tasks, demuxed = list(), list()
    self.sample_metadata = sorted(self.sample_metadata, key=firstitem)
    for sample_id, sample_group in groupby(self.sample_metadata, firstitem):
        sample_dir = join(self.products_dir, sample_id)
        sample_group = list(sample_group)
        map_fname = util.new_file("map.txt", basedir=sample_dir)
        tasks.append(
            sixteen.write_map(sample_group, sample_dir,
                              **self.options.get('write_map', dict())))
        files_list = self._filter_files_for_sample(
            seqfiles_to_split, sample_group)
        fasta_fname = util.new_file(sample_id + ".fa", basedir=sample_dir)
        qual_fname = util.new_file(sample_id + ".qual", basedir=sample_dir)
        tasks.append(
            general.fastq_split(files_list, fasta_fname, qual_fname,
                                **self.options.get('fastq_split', dict())))
        qiime_opts = self.options['demultiplex'].pop('qiime_opts', {})
        if 'barcode-type' not in qiime_opts:
            qiime_opts['barcode-type'] = _determine_barcode_type(sample_group)
        demuxed_fname = util.addtag(fasta_fname, "demuxed")
        tasks.append(
            sixteen.demultiplex(
                map_fname, fasta_fname, qual_fname, demuxed_fname,
                qiime_opts=qiime_opts,
                **self.options.get('demultiplex', dict())))
        demuxed.append(demuxed_fname)
    return demuxed, tasks

def _process_raw_demuxed_fastq_files(self):
    for fname in self.raw_demuxed_fastq_files:
        filtered_fname = util.addtag(fname, "filtered")
        opts = self.options.get('fastq_filter', {})
        opts['mangle_to'] = self._filter_samples_for_file(
            self.sample_metadata, fname)[0][0]
        yield usearch.filter(fname, filtered_fname, **opts)
        self.demuxed_fasta_files.append(filtered_fname)

def _configure(self):
    if self.otu_tables:
        merged_name = util.addtag(self.otu_tables[0], "merged")
        merged_file = util.new_file(merged_name, basedir=self.products_dir)
        yield sixteen.merge_otu_tables(self.otu_tables, name=merged_file)
        meta_biom_name = util.addtag(merged_file, "meta")
        yield biom.add_metadata(merged_file, meta_biom_name,
                                self._get_or_create_sample_metadata())
        self.merged_otu_tables.append(meta_biom_name)

    for otu_table in self.merged_otu_tables:
        barchart_path = util.new_file(otu_table + "_barcharts",
                                      basedir=self.products_dir)
        yield visualization.stacked_bar_chart(
            otu_table, barchart_path,
            **self.options.get('stacked_bar_chart', {}))

        tsv_filename = otu_table + ".tsv"
        yield association.biom_to_tsv(otu_table, tsv_filename)
        nice_tsv_filename = util.addtag(tsv_filename, 'maaslin')
        yield association.qiime_to_maaslin(tsv_filename, nice_tsv_filename)
        pcl_filename = otu_table + ".pcl"
        yield association.merge_otu_metadata(
            nice_tsv_filename,
            self._get_or_create_sample_metadata(),
            pcl_filename)
        self.pcl_files.append(pcl_filename)

    for pcl_file in self.pcl_files:
        yield visualization.breadcrumbs_pcoa_plot(
            pcl_file, pcl_file + "_pcoa_plot.png",
            CoordinatesMatrix=pcl_file + "_pcoa_coords.txt",
            **self.options.get('breadcrumbs_pcoa_plot', {}))

def maybe_stitch(maybe_pairs, products_dir,
                 barcode_files=list(), drop_unpaired=False):
    pairs, singles = split_pairs(maybe_pairs)
    tasks = list()
    barcodes = list()

    if not pairs:
        return singles, barcode_files, tasks

    pairs = sorted(pairs, key=firstitem)
    barcode_files = sorted(barcode_files)
    for pair, maybe_barcode in izip_longest(pairs, barcode_files):
        (forward, reverse), maybe_tasks = maybe_convert_to_fastq(
            pair, products_dir)
        tasks.extend(maybe_tasks)
        output = util.new_file(_to_merged(forward), basedir=products_dir)
        singles.append(output)
        if maybe_barcode and drop_unpaired:
            tasks.append(
                general.fastq_join(forward, reverse, output,
                                   options={'drop_unpaired': drop_unpaired}))
            filtered_barcode = util.new_file(
                util.addtag(maybe_barcode, "filtered"),
                basedir=products_dir)
            pairtask = general.sequence_pair(
                maybe_barcode, output,
                outfname1=filtered_barcode,
                options={"inner_join": "right"})
            barcodes.append(filtered_barcode)
            tasks.append(pairtask)
        else:
            tasks.append(
                general.fastq_join(forward, reverse, output, maybe_barcode,
                                   {'drop_unpaired': drop_unpaired}))
            barcodes.append(maybe_barcode)
    return singles, barcodes, tasks

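# A hedged usage sketch for maybe_stitch; file names are hypothetical.
# Paired entries are stitched into single reads, and any barcode files
# are kept in step with their stitched outputs:
#
#     maybe_pairs = [("s1_R1.fastq", "s1_R2.fastq"), "s2.fastq"]
#     singles, barcodes, tasks = maybe_stitch(
#         maybe_pairs, "products", barcode_files=["s1_I1.fastq"])
#     for t in tasks:
#         yield t  # e.g. from within a pipeline's _configure
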
def stacked_bar_chart(biom_fname, output_dir, qiime_opts=dict()):
    """Workflow to produce stacked bar charts of biom-formatted taxonomic
    profiles using QIIME's `summarize_taxa_through_plots.py`.

    :param biom_fname: String; the file name of a single biom-formatted
                       otu table or taxonomic profile to be visualized.
    :param output_dir: String; the full path to a directory wherein the
                       summary plots and charts will be placed.
    :keyword qiime_opts: Dictionary; a dictionary of command line options
                         to be passed to the wrapped
                         summarize_taxa_through_plots.py script. No - or
                         -- flags are necessary; the correct - or -- flags
                         are inferred based on the length of the option.
                         For boolean options, use the key/value pattern
                         of { "my-option": "" }.

    External dependencies:
      - Qiime 1.8.0: https://github.com/qiime/qiime-deploy
    """
    cmd = ("summarize_taxa_through_plots.py "
           "-i {} -o {} ".format(biom_fname, output_dir))

    default_opts = {"force": True}
    default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)
    cmd += opts

    target = os.path.join(output_dir,
                          addtag(os.path.basename(biom_fname), "L2"))
    yield {
        "name": "stacked_bar_chart: " + output_dir,
        "actions": [cmd],
        "file_dep": [biom_fname],
        "targets": [target],
    }

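# Hedged usage sketch: stacked_bar_chart is a generator that yields a
# single task dict, so a pipeline re-yields it. File names and options
# here are illustrative only:
#
#     for task in stacked_bar_chart("otu_table.biom", "plots",
#                                   qiime_opts={"mapping_fp": "map.txt"}):
#         yield task
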
def _join(s):
    # Closure over seqfile_in, suffix, and default_opts from the
    # enclosing workflow function.
    file, ext = os.path.splitext(seqfile_in)
    file = file + '.' + suffix
    return new_file(addtag(file, s), basedir=default_opts['output'])

def picrust(file, output_dir=None, verbose=True, **opts):
    """Workflow to predict metagenome functional content from 16S OTU tables.

    :param file: String; input OTU table.
    :keyword tab_in: Boolean; True if the input is a tabulated file
                     (default: False)
    :keyword tab_out: Boolean; True if the output file is to be tabulated
                      (default: False)
    :keyword gg_version: String; the greengenes version to be used
                         (default: most recent version)
    :keyword t: String; option to use a different type of prediction
                (default: KO)
    :keyword with_confidence: Boolean; set to True to output confidence
                              intervals (default: False)
    :keyword custom: String; specify a file containing a custom trait to
                     predict metagenomes

    External dependencies:
      - PICRUSt: Version 1.0.0,
        http://picrust.github.io/picrust/install.html#install
    """
    norm_out = new_file(addtag(file, "normalized_otus"), basedir=output_dir)
    predict_out = new_file(addtag(file, "picrust"), basedir=output_dir)

    all_opts = {'tab_in'          : 0,
                'tab_out'         : 0,
                'gg_version'      : '',
                't'               : '',
                'with_confidence' : 0,
                'custom'          : '',
                'drop_unknown'    : True}
    all_opts.update(opts)
    drop_unknown = all_opts.pop("drop_unknown", True)
    _copy_fname = settings.workflows.picrust.copy_number

    def _drop_unknown():
        # Remove OTUs absent from PICRUSt's copy-number table before
        # normalization; the original table is kept under an
        # "unfiltered" tag.
        import os
        import gzip
        import json
        from biom.table import DenseOTUTable
        from biom.parse import (OBS_META_TYPES,
                                parse_biom_table,
                                parse_classic_table_to_rich_table)
        idx = set([row.strip().split('\t')[0]
                   for row in gzip.open(_copy_fname)])
        filter_func = lambda a, otu_id, c: str(otu_id) in idx
        tmpfile = file + "_tmp.biom"
        with open(file) as f, open(tmpfile, 'w') as f_out:
            try:
                table = parse_biom_table(f)
            except Exception:
                table = parse_classic_table_to_rich_table(
                    f, None, None, OBS_META_TYPES['taxonomy'],
                    DenseOTUTable)
            table = table.filterObservations(filter_func)
            json.dump(table.getBiomFormatObject("AnADAMA"), f_out)
        os.rename(file, addtag(file, "unfiltered"))
        os.rename(tmpfile, file)

    cmd1 = ("normalize_by_copy_number.py "
            + "-i %s"
            + " -o " + norm_out)
    if all_opts['gg_version']:
        cmd1 += " -g " + all_opts['gg_version']
    if all_opts['tab_in']:
        cmd1 += " -f"

    cmd2 = ("predict_metagenomes.py "
            + "-i %s"
            + " -o " + predict_out)
    if all_opts['gg_version']:
        cmd2 += " -g " + all_opts['gg_version']
    if all_opts['tab_out']:
        cmd2 += " -f"
    if all_opts['t']:
        cmd2 += " -t " + all_opts['t']
    if all_opts['with_confidence']:
        cmd2 += " --with_confidence"
    if all_opts['custom']:
        cmd2 += " -c " + all_opts['custom']

    converted = addtag(file, "json")
    format_cmd = CmdAction('biom convert --table-type="OTU table"'
                           ' --header-key taxonomy --to-json'
                           ' -i {} -o {} '.format(file, converted),
                           verbose=verbose)

    def run(targets):
        # Try to run without converting to json; if that fails,
        # convert first, then run on the json-converted biom file.
        if os.stat(file).st_size < 1:  # empty input: just touch the targets
            for target in targets:
                open(target, "w").close()
            return True
        return strategies.backup(
            (strategies.Group(
                CmdAction(cmd1 % file, verbose=verbose),
                CmdAction(cmd2 % norm_out, verbose=verbose)),
             strategies.Group(
                 format_cmd,
                 CmdAction(cmd1 % converted, verbose=verbose),
                 CmdAction(cmd2 % norm_out, verbose=verbose))))

    actions = [run]
    if drop_unknown:
        actions = [_drop_unknown, run]

    def _rusage(task):
        msg = task.name + " Estimated mem={mem} time={time} threads=1"
        s = os.stat(list(task.file_dep)[0]).st_size
        return msg.format(mem=100 + (s / 1024.),
                          time=100 + (s * 2.5e-4))

    return dict(name     = "picrust:" + predict_out,
                actions  = actions,
                file_dep = [file],
                targets  = [predict_out, norm_out],
                title    = _rusage)

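# Hedged usage sketch for picrust; paths and option values are
# hypothetical. The returned dict is a single AnADAMA task; pass
# drop_unknown=False to skip pre-filtering OTUs against the
# copy-number table:
#
#     task = picrust("otu_table.biom", output_dir="picrust_out",
#                    gg_version="13_5", with_confidence=1)
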
def demultiplex_illumina(fastq_fnames, barcode_fnames, map_fname,
                         output_fname, verbose=True, qiime_opts={}):
    output_dir, output_basename = os.path.split(output_fname)
    default_opts = {
        "i": ",".join(fastq_fnames),
        "b": ",".join(barcode_fnames),
        "m": map_fname,
        "o": output_dir
    }
    default_opts.update(qiime_opts)
    opts = dict_to_cmd_opts(default_opts)

    cmd = "split_libraries_fastq.py "

    revcomp_map_fname = new_file(addtag(map_fname, "revcomp"),
                                 basedir=output_dir)
    revcomp_opts = default_opts.copy()
    revcomp_opts['m'] = revcomp_map_fname
    revcomp_opts = dict_to_cmd_opts(revcomp_opts)

    def _revcomp():
        # Fallback: rewrite the map file with reverse-complemented
        # barcodes, then retry demultiplexing.
        from anadama.util import deserialize_map_file, serialize_map_file
        from Bio.Seq import Seq

        def _reverse(sample):
            seq = Seq(sample.BarcodeSequence).reverse_complement()
            return sample._replace(BarcodeSequence=str(seq))

        with open(map_fname) as from_map:
            from_samples = deserialize_map_file(from_map)
            serialize_map_file((_reverse(s) for s in from_samples),
                               revcomp_map_fname)

    default_out = os.path.join(output_dir, "seqs.fna")
    # Extra condition for the backup strategy: only fall back to the
    # reverse-complement retry if the default output is missing or
    # effectively empty.
    output_exists = lambda *args, **kwargs: (
        not os.path.exists(default_out)
        or not os.stat(default_out).st_size > 1
    )

    def run():
        return strategies.backup(
            (CmdAction(cmd + opts, verbose=verbose),
             strategies.Group(
                 PythonAction(_revcomp),
                 CmdAction(cmd + revcomp_opts, verbose=verbose))),
            extra_conditions=[output_exists])

    actions = [run]
    if output_basename != "seqs.fna":
        actions.append("mv '%s' '%s'" % (default_out, output_fname))

    return {
        "name": "demultiplex_illumina:" + output_fname,
        "actions": actions,
        "file_dep": list(fastq_fnames) + list(barcode_fnames) + [map_fname],
        "targets": [output_fname],
        "title": lambda t: t.name + " Estimated time=%.2f" % (
            sum(os.stat(f).st_size for f in t.file_dep) / 1024. / 1024 / 5)
    }

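# Hedged usage sketch for demultiplex_illumina; file names are
# hypothetical. The barcode-type key follows the same qiime_opts
# convention used by the 454-style demultiplex step above:
#
#     task = demultiplex_illumina(
#         ["run1_R1.fastq"], ["run1_I1.fastq"], "map.txt",
#         "products/demuxed.fna",
#         qiime_opts={"barcode-type": "golay_12"})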