def action(args):
    rp = Refpkg(args.refpkg)
    rp.load_db()
    cursor = rp.db.cursor()
    ranks = args.ranks.split(',')

    with tempfile.NamedTemporaryFile() as tmp_db:
        taxtable_db = Taxdb(sqlite3.connect(tmp_db.name))
        taxtable_db.create_tables()
        reader = csv.DictReader(args.infile)
        # use the public DictReader.fieldnames property; the callable is
        # evaluated lazily by insert_from_taxtable
        taxtable_db.insert_from_taxtable(lambda: reader.fieldnames, reader)
        cursor.execute('ATTACH DATABASE ? AS tt', (tmp_db.name,))

        writer = csv.writer(args.outfile)
        writer.writerow(('tax_id', 'intersection_rank'))
        # for each tax_id in the taxtable, report an ancestor whose rank is
        # among `ranks` (empty string if there is none)
        cursor.execute("""
            SELECT tax_id, COALESCE(itaxa.rank, "")
            FROM tt.taxa
                 LEFT JOIN (SELECT child AS tax_id, rank
                            FROM tt.parents
                                 JOIN taxa ON tax_id = parent
                                 JOIN ranks USING (rank)
                            WHERE rank IN (%s)
                            ORDER BY child, rank_order ASC) itaxa
                 USING (tax_id)
            GROUP BY tax_id
        """ % ', '.join('?' * len(ranks)), ranks)
        writer.writerows(cursor)
        args.outfile.flush()
        test_output(args.infile.name, args.outfile.name, ranks)
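# A minimal invocation sketch for action() above (not part of the original
# module). In the real tool `args` is built by argparse; the refpkg path,
# file names, and rank list below are placeholders for illustration only.
import argparse

example_args = argparse.Namespace(
    refpkg='my.refpkg',                        # path to a reference package
    ranks='class,order,family,genus,species',  # comma-delimited rank names
    infile=open('taxtable.csv'),               # taxtable CSV to intersect
    outfile=open('intersections.csv', 'w'))    # destination CSV
action(example_args)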
def align_and_merge(env, refpkg, qseqs, outdir=None, options=None, nproc=1):
    """
    Align sequences in ``qseqs`` and merge with the reference alignment.

    * env - Environment instance.
    * refpkg - path to a reference package directory.
    * qseqs - unaligned query sequences in fasta format.
    * outdir - optional output directory; saves files to same directory
      as qseqs if unspecified.
    * options - flags for cmalign [default infernal.CMALIGN_FLAGS]
    * nproc - number of processors to use for ``cmalign``.

    Returns (sto, scores, merged)

    Example::

        from bioscons.pplacer import align_and_merge
        env.AddMethod(align_and_merge, "align_and_merge")
        sto, scores, merged = env.align_and_merge(
            refpkg='my.refpkg', qseqs='myseqs.fasta')
    """

    if not hasattr(env, 'cmalign_method'):
        env.AddMethod(cmalign_method, 'cmalign_method')
    if not hasattr(env, 'cmmerge_method'):
        env.AddMethod(cmmerge_method, 'cmmerge_method')

    pkg = Refpkg(refpkg, create=False)
    profile = pkg.file_abspath('profile')
    ref_sto = pkg.file_abspath('aln_sto')

    # align query sequences against the reference profile
    sto, scores = env.cmalign_method(
        profile=profile,
        fasta=qseqs,
        nproc=nproc,
        options=options or CMALIGN_FLAGS,
        outdir=outdir)

    # merge the query alignment with the reference alignment
    merged = env.cmmerge_method(
        profile, ref_sto, sto,
        outname=rename(sto, '_merged.sto'),
        options=options or CMALIGN_FLAGS,
        outdir=outdir)

    if outdir and outdir != '.':
        Clean(merged, Dir(outdir))

    return Flatten([sto, scores, merged])
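# `rename` above is not defined in this excerpt. A minimal stand-in
# consistent with its use here -- replace the extension of a (possibly
# SCons Node) path with a new suffix -- might look like the following;
# this is an assumption, not the actual bioscons implementation.
import os

def rename(path, suffix):
    """rename('q.sto', '_merged.sto') -> 'q_merged.sto'"""
    base, _ = os.path.splitext(str(path))
    return base + suffix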
def action(args):
    rp = Refpkg(args.refpkg, create=False)
    rp.load_db()
    cursor = rp.db.cursor()
    ranks = args.ranks.split(',')

    with tempfile.NamedTemporaryFile() as tmp_db:
        taxtable_db = Taxdb(sqlite3.connect(tmp_db.name))
        taxtable_db.create_tables()
        reader = csv.DictReader(args.infile)
        # use the public DictReader.fieldnames property
        taxtable_db.insert_from_taxtable(lambda: reader.fieldnames, reader)
        cursor.execute('ATTACH DATABASE ? AS tt', (tmp_db.name,))

        writer = csv.writer(args.out)
        writer.writerow(('tax_id', 'intersection_rank'))
        # one row per (tax_id, intersecting rank), most specific rank first
        cursor.execute("""
            SELECT tax_id, COALESCE(itaxa.rank, "")
            FROM tt.taxa
                 LEFT JOIN (SELECT child AS tax_id, rank_order, rank
                            FROM tt.parents
                                 JOIN taxa ON tax_id = parent
                                 JOIN ranks USING (rank)
                            WHERE rank IN (%s)) itaxa
                 USING (tax_id)
            ORDER BY tax_id, rank_order DESC
        """ % ', '.join('?' * len(ranks)), ranks)

        if not args.all_ranks:
            # keep only the most specific intersecting rank per tax_id
            cursor = filter_ranks(cursor)
        writer.writerows(cursor)
        args.out.flush()
        test_output(args.infile.name, args.out.name, ranks)
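# `filter_ranks` is not shown in this excerpt. Since the query above sorts
# rows by tax_id with rank_order descending, a plausible implementation (an
# assumption about the real helper, not its actual source) keeps only the
# first -- i.e. most specific -- matching row per tax_id:
import itertools
import operator

def filter_ranks(rows):
    for _, group in itertools.groupby(rows, operator.itemgetter(0)):
        yield next(group)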
@contextlib.contextmanager
def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Context manager yielding a temporary reference package for a
    collection of aligned sequences.

    Builds a tree with FastTree, creates a reference package, yields.
    """
    sequences = list(sequences)
    with ntf(prefix='fasttree-', suffix='.log') as log_fp, \
         ntf(prefix='fasttree-', suffix='.tre') as tree_fp, \
         tempdir(prefix='refpkg') as refpkg_dir:
        log_fp.close()

        fasttree(sequences, log_path=log_fp.name, output_fp=tree_fp,
                 gtr=True, threads=threads)
        tree_fp.close()

        rp = Refpkg(refpkg_dir(name), create=True)
        rp.update_metadata('locus', '')
        rp.update_phylo_model('FastTree', log_fp.name)
        rp.update_file('tree', tree_fp.name)

        # FASTA and Stockholm alignment
        with ntf(suffix='.fasta') as f:
            SeqIO.write(sequences, f, 'fasta')
            f.close()
            rp.update_file('aln_fasta', f.name)
        with ntf(suffix='.sto') as f:
            SeqIO.write(sequences, f, 'stockholm')
            f.close()
            rp.update_file('aln_sto', f.name)

        logging.debug("Reference package written to %s", rp.path)
        yield rp
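# Usage sketch for as_refpkg() (the file name is a placeholder). The input
# must already be aligned, since the sequences are written unchanged into
# both alignment files of the package; the temporary package directory is
# removed when the block exits.
from Bio import SeqIO

with as_refpkg(SeqIO.parse('aligned.fasta', 'fasta')) as rp:
    print(rp.path)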
workdir = args.workdir
try:
    os.makedirs(workdir)
except OSError as e:
    if e.errno != errno.EEXIST:
        raise

if not args.disable_cleanup:
    @atexit.register
    def cleanup_workdir():
        shutil.rmtree(workdir, ignore_errors=True)

classif_db = os.path.join(workdir, 'classifications.sqlite')
index_refpkg = os.path.join(args.hrefpkg, 'index.refpkg')
index = Refpkg(index_refpkg)
index_rank = index.metadata('index_rank')
classif_rank = args.classification_rank or index_rank
index_counts = os.path.join(args.hrefpkg, 'index-%s.counts' % (classif_rank,))

log.info('performing initial classification at %s', classif_rank)
silently_unlink(classif_db)
logging_check_call(
    [args.rppr, 'prep_db', '--sqlite', classif_db, '-c', index_refpkg])
logging_check_call(
    [args.guppy, 'classify', '--sqlite', classif_db, '-c', index_refpkg,
     '--classifier', 'nbc', '--nbc-rank', classif_rank, '--no-pre-mask',
     '--nbc-sequences', args.query_seqs, '--nbc-counts', index_counts,
     '-j', str(args.ncores)])
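# `silently_unlink` and `logging_check_call` are not defined in this
# excerpt. Plausible minimal implementations matching their use above
# (assumptions, not the project's actual helpers; `log` is assumed to be
# the module-level logging.Logger used in the surrounding code):
import errno
import os
import subprocess

def silently_unlink(path):
    """Remove `path`, ignoring the case where it does not exist."""
    try:
        os.unlink(path)
    except OSError as e:
        if e.errno != errno.ENOENT:
            raise

def logging_check_call(cmd, **kwargs):
    """Log `cmd`, then run it, raising CalledProcessError on failure."""
    log.info('running: %s', ' '.join(cmd))
    subprocess.check_call(cmd, **kwargs)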
def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file, output_dir='.',
                  index_rank='order', train_file=None, test_file=None):
    """
    Build a reference package containing all descendants of tax_id from an
    index reference package.
    """
    with util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp, \
         util.ntf(prefix='aln_sto', suffix='.sto') as sto_fp, \
         util.ntf(prefix='aln_fasta', suffix='.fasta') as fasta_fp, \
         util.ntf(prefix='tree', suffix='.tre') as tree_fp, \
         util.ntf(prefix='tree', suffix='.stats') as stats_fp, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp:

        # Subset taxonomy
        n = full_tax.get_node(tax_id)
        descendants = set(i.tax_id for i in n)
        assert descendants
        n.write_taxtable(tax_fp)
        tax_fp.close()

        # Subset seq_info
        w = csv.DictWriter(seq_info_fp, seqinfo[0].keys(),
                           quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        rows = [i for i in seqinfo if i['tax_id'] in descendants]
        sinfo = {i['seqname']: i for i in rows}

        # Choose sequences, divide into train and test sets
        chosen = choose_sequence_ids(n, rows, index_rank=index_rank)
        keep_seq_ids = set()
        train_seq_ids = set()
        test_seq_ids = set()
        for keep, rest in chosen:
            keep_seq_ids |= frozenset(keep)
            l = len(rest)
            if l >= 2 * PER_TAXON:
                train_seq_ids |= frozenset(rest[:l // 2])
                test_seq_ids |= frozenset(rest[l // 2:])

        # Picked
        rows = [sinfo[i] for i in keep_seq_ids]
        w.writerows(rows)
        seq_info_fp.close()

        # Fetch sequences
        with tempfile.NamedTemporaryFile() as tf:
            wrap.esl_sfetch(sequence_file, keep_seq_ids, tf)
            # Rewind
            tf.seek(0)
            sequences = list(SeqIO.parse(tf, 'fasta'))

        logging.info("Tax id %s: %d sequences", tax_id, len(sequences))

        if len(set(str(i.seq) for i in sequences)) == 1:
            logging.warn("Skipping %s: only 1 unique sequence string", tax_id)
            return None

        # No sense in building with one sequence
        if len(sequences) < 2:
            logging.warn("Skipping: %d sequences.", len(sequences))
            return None

        # Extract training & test seqs
        if train_file:
            logging.info("%d training sequences", len(train_seq_ids))
            wrap.esl_sfetch(sequence_file, train_seq_ids, train_file)
        if test_file:
            logging.info("%d test sequences", len(test_seq_ids))
            wrap.esl_sfetch(sequence_file, test_seq_ids, test_file)

        # Cmalign
        aligned = wrap.cmalign(sequences, output=sto_fp)
        aligned = list(aligned)
        assert aligned

        # Tree
        wrap.fasttree(aligned, log_path=stats_fp.name, output_fp=tree_fp,
                      threads=1, gtr=True)
        tree_fp.close()
        sto_fp.close()

        SeqIO.write(aligned, fasta_fp, 'fasta')
        fasta_fp.close()

        rp = Refpkg(os.path.join(output_dir, tax_id + '.refpkg'), create=True)
        rp.start_transaction()
        rp.update_file('aln_sto', sto_fp.name)
        rp.update_file('aln_fasta', fasta_fp.name)
        rp.update_file('tree', tree_fp.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        try:
            rp.update_phylo_model('FastTree', stats_fp.name)
        except:
            print >> sys.stderr, stats_fp.read()
            raise
        rp.update_file('profile', wrap.CM)
        rp.commit_transaction()

        util.require_executable('rppr')
        rp.reroot()

        return rp.path
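# Call sketch for tax_id_refpkg() above. The tax ids and file names are
# placeholders, and `full_tax` / `seqinfo` are assumed to have been loaded
# elsewhere as whatever taxonomy and seq_info objects the real module uses.
hrefpkg_paths = []
for tid in ['1239', '1224']:  # e.g. Firmicutes, Proteobacteria
    path = tax_id_refpkg(tid, full_tax, seqinfo, 'sequences.fasta',
                         output_dir='hrefpkgs', index_rank='order')
    if path:  # None when too few distinct sequences were found
        hrefpkg_paths.append(path)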
def build_index_refpkg(hrefpkg_paths, sequence_file, seqinfo, taxonomy,
                       dest='index.refpkg', **meta):
    """
    Build an index.refpkg from a set of hrefpkgs
    """
    # Clear taxonomy
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    def sequence_names(f):
        with open(f) as fp:
            r = csv.DictReader(fp)
            for i in r:
                yield i['seqname']

    hrefpkgs = (Refpkg(i, create=False) for i in hrefpkg_paths)
    seqinfo_files = (i.open_resource('seq_info') for i in hrefpkgs)

    # Add seqinfo
    for f in seqinfo_files:
        with f:
            taxonomy.populate_from_seqinfo(f)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()
    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
         util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
         util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        tf.close()

        # Seqinfo file
        r = (i for i in seqinfo if i['seqname'] in sequence_ids)
        w = csv.DictWriter(seq_info_fp, seqinfo[0].keys(),
                           lineterminator='\n',
                           quoting=csv.QUOTE_NONNUMERIC)
        w.writeheader()
        w.writerows(r)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)
        for k, v in meta.items():
            rp.update_metadata(k, v)
        rp.commit_transaction()

    return rp, sequence_ids
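# Wiring sketch: feed the per-clade refpkgs from the tax_id_refpkg() sketch
# above into build_index_refpkg(). Extra keyword arguments land in the
# package metadata, which is how downstream code reads 'index_rank' back
# out (see index.metadata('index_rank') below); values are placeholders.
rp, indexed_ids = build_index_refpkg(
    hrefpkg_paths, 'sequences.fasta', seqinfo, full_tax,
    dest='hrefpkgs/index.refpkg', index_rank='order')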
else:
    workdir = args.workdir
    try:
        os.makedirs(workdir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

if not args.disable_cleanup:
    @atexit.register
    def cleanup_workdir():
        shutil.rmtree(workdir, ignore_errors=True)

classif_db = os.path.join(workdir, 'classifications.sqlite')
index_refpkg = os.path.join(args.hrefpkg, 'index.refpkg')
index = Refpkg(index_refpkg)
index_rank = index.metadata('index_rank')
classif_rank = args.classification_rank or index_rank
index_counts = os.path.join(args.hrefpkg, 'index-%s.counts' % (classif_rank,))

log.info('performing initial classification at %s', classif_rank)
silently_unlink(classif_db)
logging_check_call(
    [args.rppr, 'prep_db', '--sqlite', classif_db, '-c', index_refpkg])
logging_check_call(
    [args.guppy, 'classify', '--sqlite', classif_db, '-c', index_refpkg,
     '--classifier', 'nbc', '--nbc-rank', classif_rank, '--no-pre-mask',
     '--nbc-sequences', args.query_seqs, '--nbc-counts', index_counts,
     '-j', str(args.ncores)])

with open(os.path.join(args.hrefpkg, 'index.csv'), 'rU') as fobj:
    refpkg_map = dict(csv.reader(fobj))
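# `index.csv` is loaded directly into a dict, so it is presumably a
# two-column, header-less CSV. A plausible example of its contents (an
# assumption based on how refpkg_map is used, not a real file):
#
#     543,543.refpkg
#     1239,1239.refpkg
#
# i.e. a map from the tax_id assigned by the initial nbc classification
# to the per-clade refpkg used for subsequent placement.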