def build_index_refpkg(hrefpkg_paths, sequence_file, seqinfo, taxonomy,
                       dest='index.refpkg', **meta):
    """
    Build an index.refpkg from a set of hrefpkgs.

    Works on a deep copy of ``taxonomy`` (the caller's object is not
    modified): every node's ``sequence_ids`` is reset, the copy is
    repopulated from the ``seq_info`` resource of each hrefpkg, and
    lineages left without sequences are pruned. The sequences that remain
    are fetched from ``sequence_file`` and stored, together with the
    matching seq_info rows and the taxonomy table, in a new reference
    package at ``dest``.

    :param hrefpkg_paths: iterable of paths to existing reference packages
        (each is opened with ``create=False``).
    :param sequence_file: sequence source handed to ``wrap.esl_sfetch``.
    :param seqinfo: indexable collection of dict rows, each with a
        ``seqname`` key. The keys of the first row are used as the CSV
        header, so an empty ``seqinfo`` raises ``IndexError``.
    :param taxonomy: taxonomy tree; iterated node by node and deep-copied.
    :param dest: path of the reference package to create.
    :param meta: additional key/value pairs stored as refpkg metadata.
    :returns: tuple ``(refpkg, sequence_ids)`` where ``sequence_ids`` is the
        frozenset of sequence ids included in the index.
    """
    # Work on a private copy and clear any sequence assignments it carries.
    taxonomy = copy.deepcopy(taxonomy)
    for node in taxonomy:
        node.sequence_ids = set()

    # Add seqinfo from each hrefpkg, one at a time, closing each resource
    # as soon as it has been consumed.
    for path in hrefpkg_paths:
        f = Refpkg(path, create=False).open_resource('seq_info')
        with f:
            taxonomy.populate_from_seqinfo(f)

    # Remove lineages without sequences
    taxonomy.prune_unrepresented()

    sequence_ids = frozenset(taxonomy.subtree_sequence_ids())

    with util.ntf(prefix='aln_fasta', suffix='.fasta') as tf, \
            util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp, \
            util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp:
        wrap.esl_sfetch(sequence_file, sequence_ids, tf)
        # Each temp file is closed once written; only its name is used below.
        tf.close()

        # Seqinfo file: keep only the rows for sequences in the index.
        rows = (i for i in seqinfo if i['seqname'] in sequence_ids)
        writer = csv.DictWriter(seq_info_fp, seqinfo[0].keys(),
                                lineterminator='\n',
                                quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        writer.writerows(rows)
        seq_info_fp.close()

        taxonomy.write_taxtable(tax_fp)
        tax_fp.close()

        # Register all files and metadata in a single transaction.
        rp = Refpkg(dest, create=True)
        rp.start_transaction()
        rp.update_file('aln_fasta', tf.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        rp.update_file('profile', wrap.CM)

        for k, v in meta.items():
            rp.update_metadata(k, v)

        rp.commit_transaction()

    return rp, sequence_ids
def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Yield a temporary reference package built from aligned sequences.

    A tree is inferred with FastTree, then a reference package named
    ``name`` is created inside a temporary directory and populated with the
    tree, the FastTree log (as the phylo model) and the alignment in both
    FASTA and Stockholm formats. The package is yielded while the temporary
    files and directory are still open.
    """
    # NOTE(review): no decorator is visible on this definition; as written it
    # is a plain generator function. Presumably it is meant to be wrapped
    # with contextlib.contextmanager -- confirm against the original module.
    records = list(sequences)

    with ntf(prefix='fasttree-', suffix='.log') as log_fp, \
            ntf(prefix='fasttree-', suffix='.tre') as tree_fp, \
            tempdir(prefix='refpkg') as refpkg_dir:
        # Only the log file's path is needed; FastTree is given the name.
        log_fp.close()

        fasttree(records, log_path=log_fp.name, output_fp=tree_fp,
                 gtr=True, threads=threads)
        tree_fp.close()

        package = Refpkg(refpkg_dir(name), create=True)
        package.update_metadata('locus', '')
        package.update_phylo_model('FastTree', log_fp.name)
        package.update_file('tree', tree_fp.name)

        # FASTA and Stockholm alignment, written in that order.
        alignment_outputs = (
            ('.fasta', 'fasta', 'aln_fasta'),
            ('.sto', 'stockholm', 'aln_sto'),
        )
        for suffix, seq_format, resource_key in alignment_outputs:
            with ntf(suffix=suffix) as aln_fp:
                SeqIO.write(records, aln_fp, seq_format)
                aln_fp.close()
                package.update_file(resource_key, aln_fp.name)

        logging.debug("Reference package written to %s", package.path)
        yield package
def tax_id_refpkg(tax_id, full_tax, seqinfo, sequence_file, output_dir='.',
                  index_rank='order', train_file=None, test_file=None):
    """
    Build a reference package containing all descendants of tax_id from an
    index reference package.

    The taxonomy and seq_info are subset to the node for ``tax_id``,
    sequences are selected with ``choose_sequence_ids``, aligned with
    ``wrap.cmalign``, and a tree is built with ``wrap.fasttree``. The
    result is written to ``<output_dir>/<tax_id>.refpkg`` and rerooted.

    :param tax_id: tax id of the node to build for; also used (by string
        concatenation) as the refpkg file name stem.
    :param full_tax: taxonomy providing ``get_node(tax_id)``.
    :param seqinfo: indexable collection of dict rows with ``seqname`` and
        ``tax_id`` keys. The keys of the first row become the CSV header,
        so an empty ``seqinfo`` raises ``IndexError``.
    :param sequence_file: sequence source handed to ``wrap.esl_sfetch``.
    :param output_dir: directory in which the refpkg is created.
    :param index_rank: forwarded to ``choose_sequence_ids``.
    :param train_file: if truthy, destination passed to ``wrap.esl_sfetch``
        for the training sequences.
    :param test_file: if truthy, destination passed to ``wrap.esl_sfetch``
        for the test sequences.
    :returns: path of the created reference package, or ``None`` when the
        selection has fewer than two sequences or only one unique sequence
        string.
    """
    with util.ntf(prefix='taxonomy', suffix='.csv') as tax_fp, \
            util.ntf(prefix='aln_sto', suffix='.sto') as sto_fp, \
            util.ntf(prefix='aln_fasta', suffix='.fasta') as fasta_fp, \
            util.ntf(prefix='tree', suffix='.tre') as tree_fp, \
            util.ntf(prefix='tree', suffix='.stats') as stats_fp, \
            util.ntf(prefix='seq_info', suffix='.csv') as seq_info_fp:

        # Subset taxonomy
        node = full_tax.get_node(tax_id)
        descendants = set(i.tax_id for i in node)
        assert descendants
        node.write_taxtable(tax_fp)
        tax_fp.close()

        # Subset seq_info
        writer = csv.DictWriter(seq_info_fp, seqinfo[0].keys(),
                                quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        rows = [i for i in seqinfo if i['tax_id'] in descendants]
        sinfo = {i['seqname']: i for i in rows}

        # Choose sequences, divide into train and test sets
        chosen = choose_sequence_ids(node, rows, index_rank=index_rank)
        keep_seq_ids = set()
        train_seq_ids = set()
        test_seq_ids = set()
        for keep, rest in chosen:
            keep_seq_ids |= frozenset(keep)
            n_rest = len(rest)
            if n_rest >= 2 * PER_TAXON:
                # Floor division keeps the slice index an int regardless of
                # whether true division is in effect.
                half = n_rest // 2
                train_seq_ids |= frozenset(rest[:half])
                test_seq_ids |= frozenset(rest[half:])

        # Picked
        rows = [sinfo[i] for i in keep_seq_ids]
        writer.writerows(rows)
        seq_info_fp.close()

        # Fetch sequences
        with tempfile.NamedTemporaryFile() as tf:
            wrap.esl_sfetch(sequence_file, keep_seq_ids, tf)
            # Rewind
            tf.seek(0)
            sequences = list(SeqIO.parse(tf, 'fasta'))
        logging.info("Tax id %s: %d sequences", tax_id, len(sequences))

        if len(set(str(i.seq) for i in sequences)) == 1:
            logging.warning("Skipping %s: only 1 unique sequence string",
                            tax_id)
            return None

        # No sense in building with one sequence
        if len(sequences) < 2:
            logging.warning("Skipping: %d sequences.", len(sequences))
            return None

        # Extract training & test seqs
        if train_file:
            logging.info("%d training sequences", len(train_seq_ids))
            wrap.esl_sfetch(sequence_file, train_seq_ids, train_file)
        if test_file:
            logging.info("%d test sequences", len(test_seq_ids))
            wrap.esl_sfetch(sequence_file, test_seq_ids, test_file)

        # Cmalign
        aligned = wrap.cmalign(sequences, output=sto_fp)
        aligned = list(aligned)
        assert aligned

        # Tree
        wrap.fasttree(aligned, log_path=stats_fp.name, output_fp=tree_fp,
                      threads=1, gtr=True)
        tree_fp.close()
        sto_fp.close()
        SeqIO.write(aligned, fasta_fp, 'fasta')
        fasta_fp.close()

        rp = Refpkg(os.path.join(output_dir, tax_id + '.refpkg'), create=True)
        rp.start_transaction()
        rp.update_file('aln_sto', sto_fp.name)
        rp.update_file('aln_fasta', fasta_fp.name)
        rp.update_file('tree', tree_fp.name)
        rp.update_file('seq_info', seq_info_fp.name)
        rp.update_file('taxonomy', tax_fp.name)
        try:
            rp.update_phylo_model('FastTree', stats_fp.name)
        except Exception:
            # Dump the FastTree stats file to stderr before re-raising so
            # the failure can be diagnosed from the log contents.
            sys.stderr.write('%s\n' % (stats_fp.read(),))
            raise
        rp.update_file('profile', wrap.CM)
        rp.commit_transaction()

        # reroot() is only attempted once rppr is confirmed to be available.
        util.require_executable('rppr')
        rp.reroot()

        return rp.path