def run_anc(input_dict, run_id = None):
    """Run the terminal-tree / ancestral-reconstruction pipeline for a taxon.

    Steps, in order: fetch the NCBI node for ``input_dict['taxid']``, collect
    sequence nodes below it via ``getBTOL().investigatePhylum``, convert them
    to records with ``seq_recs``, align them with ``align_seqnodes``, build a
    tree with ``phyml.tree``, run ``paml.run_paml`` and parse its rst file
    with ``paml.rst_parser``.

    Parameters
      input_dict : mapping that must contain the keys 'rank_name', 'taxid'
                   and 'aliname' (only 'taxid' is actually used here).
      run_id     : identifier forwarded to ``phyml.tree`` and
                   ``paml.run_paml``; must be truthy.

    Returns
      dict with keys 'anc_tree', 'anc_align', 'term_tree' and 'term_align'.

    Raises
      AssertionError if ``run_id`` is falsy (including the default ``None``).
      KeyError if one of the three required keys is missing.
    """
    assert run_id

    # All three keys are read up front, so a missing one raises KeyError
    # before any work is done.  rank_name and aliname are not used further.
    rank_name = input_dict['rank_name']
    taxon_id = input_dict['taxid']
    aliname = input_dict['aliname']

    btol = getBTOL()
    phylum_node = ncbi.get_node(taxon_id)
    seqnodes = btol.investigatePhylum(p_node = phylum_node)
    records, seqelts, seqtuples = seq_recs(seqnodes)

    term_align = align_seqnodes(records)
    term_tree = phyml.tree(term_align, run_id = run_id)

    rst_path = paml.run_paml(term_tree, term_align, run_id = run_id)
    anc_tree = paml.rst_parser(rst_path)

    # One record per internal node; the node's 'probs' entry is stored under
    # the 'scores' annotation.
    anc_align = []
    for internal in anc_tree.get_nonterminals():
        record = SeqRecord(internal.m['seq'],
                           id = None,
                           name = internal.name,
                           annotations = {'scores': internal.m['probs']})
        anc_align.append(record)

    return {'anc_tree': anc_tree,
            'anc_align': anc_align,
            'term_tree': term_tree,
            'term_align': term_align}
def run(**kwargs):
    """Run the tree / ancestral-reconstruction pipeline from keyword options.

    ``kwargs`` is passed through ``mem.sr`` to build the arguments of
    ``getBTOL`` and is also forwarded unchanged to ``investigatePhylum``.
    The resulting sequence nodes are converted with ``seq_recs``, aligned
    with ``align_seqnodes``, turned into a tree by ``phyml.tree`` and fed to
    ``paml.run_paml``; the rst file is parsed by ``paml.rst_parser``.

    Returns
      ``((tree, anc_tree), (align, anc_alignment))`` where ``anc_alignment``
      is a list holding one ``SeqRecord`` per non-terminal of ``anc_tree``.
    """
    btol = getBTOL(**mem.sr(kwargs))
    seqnodes = btol.investigatePhylum(**kwargs)
    records, seqelts, seqtuples = seq_recs(seqnodes)

    term_align = align_seqnodes(records)
    term_tree = phyml.tree(term_align)

    rst_path = paml.run_paml(term_tree, term_align)
    anc_tree = paml.rst_parser(rst_path)

    # One record per internal node; the node's 'probs' entry is stored under
    # the 'scores' annotation.
    anc_align = []
    for internal in anc_tree.get_nonterminals():
        record = SeqRecord(internal.m['seq'],
                           id = None,
                           name = internal.name,
                           annotations = {'scores': internal.m['probs']})
        anc_align.append(record)

    trees = (term_tree, anc_tree)
    alignments = (term_align, anc_align)
    return trees, alignments
def get_ml_ancestor_tree(tree, ali, run_id):
    """Infer ancestral sequences with PAML and return the annotated tree.

    Runs ``paml.run_paml`` on ``tree`` and ``ali``, parses the resulting rst
    file with ``paml.rst_parser`` and then post-processes the parsed tree:

      * every terminal receives the ``m`` attribute of the same-named
        terminal of ``tree`` (first match wins if names repeat);
      * every non-terminal has 'T' replaced by 'U' in ``m['seq'].seq``.

    Parameters
      tree   : tree whose terminals carry an ``m`` attribute.
      ali    : alignment handed to ``paml.run_paml`` together with ``tree``.
      run_id : identifier forwarded to ``paml.run_paml``.

    Returns
      The tree produced by ``paml.rst_parser``, modified in place.

    Raises
      IndexError if a terminal of the parsed tree has no same-named terminal
      in ``tree``.
    """
    # Parenthesised single-argument form prints the same text whether
    # ``print`` is a statement or a function.
    print('Running ancestor inference in PAML')

    # Run PAML and parse its output.
    rstfile = paml.run_paml(tree, ali, run_id = run_id)
    anc_tree = paml.rst_parser(rstfile)

    # Index the input terminals by name once instead of rescanning the whole
    # terminal list for every terminal of the new tree.  setdefault keeps the
    # first node seen for a name, matching a "take the first match" lookup.
    source_by_name = {}
    for old_term in tree.get_terminals():
        source_by_name.setdefault(old_term.name, old_term)

    # Copy the old terminal annotations onto the new tree.
    for term in anc_tree.get_terminals():
        if term.name not in source_by_name:
            raise IndexError(
                'no terminal named {0!r} in the input tree'.format(term.name))
        term.m = source_by_name[term.name].m

    # Replace Ts with Us in the inferred sequences of the internal nodes.
    for node in anc_tree.get_nonterminals():
        node.m['seq'].seq = node.m['seq'].seq.replace('T', 'U')

    return anc_tree
def get_consensus(rfid = 'RF00',
                  mweight = .5,
                  refseq_method = 'root',
                  sp_method = 'sample',
                  aff_type = 'pairs',
                  reset = True,
                  do_plot = False,
                  run_id = 'CONS_TEST'):
    """Annotate an Rfam family tree and return its exemplar structures.

    The family ``rfid`` is loaded with ``rfam.get_fam``.  Each terminal of
    the family tree is given an ``m`` dict holding the matching alignment
    record ('seq') and a list of ones of the same length ('probs').  A
    reference sequence is then selected with ``subtree_refseq`` and ungapped
    with ``rutils.ungapped_seq``, after which the function returns the result
    of ``family_exemplar_structs``.

    NOTE(review): everything after that ``return`` is unreachable.  It is a
    per-clade PAML / structure-counting pass that depends on names never
    assigned in this function (``exemplar_structs``, ``inds``, ``pca_vecs``,
    ``title``).  It is kept for reference; its statement nesting was
    reconstructed from a whitespace-collapsed source and should be confirmed
    before it is re-enabled.

    Parameters
      rfid          : Rfam family identifier.
      mweight       : not used in the body.
      refseq_method : ``method`` argument for ``subtree_refseq``; also
                      forwarded to ``family_exemplar_structs``.
      sp_method     : forwarded to ``family_exemplar_structs``.
      aff_type      : forwarded to ``family_exemplar_structs``.
      reset         : not used in the body.
      do_plot       : only referenced by commented-out plotting code.
      run_id        : only used by the unreachable section.

    Returns
      Whatever ``family_exemplar_structs`` returns.

    Raises
      AttributeError if a terminal name does not contain an
      underscore-delimited field; ValueError if that field is not the name of
      any alignment record.
    """
    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    # Compiled once rather than once per terminal.
    term_id_pattern = re.compile('_([^_]*)_')
    for n in tree.get_terminals():
        term_id = term_id_pattern.search(n.name).group(1)
        this_seq = ali[ali_ids.index(term_id)]
        n.m = {'seq': this_seq,
               'probs': [1] * len(this_seq)}

    #if do_plot : rplots.plot_clusters(inds,{'pca embedding':pca_vecs},title = title,plot3d = True)

    big_refnode, big_refseq = subtree_refseq(tree, method = refseq_method)
    ungapped_ref = rutils.ungapped_seq(big_refseq, rfid)

    #pca_vecs,exemplar_structs =
    return family_exemplar_structs(rfid,
                                   sp_method = sp_method,
                                   refseq_method = refseq_method,
                                   aff_type = aff_type,
                                   )

    # ------------------------------------------------------------------
    # NOTE(review): unreachable from here on (see docstring).
    # ------------------------------------------------------------------
    struct_profiles = infernal.profiles(ungapped_ref, exemplar_structs, run_id)
    clades = split_tree(tree)

    # One independent empty list per (clade, structure) for each signature.
    all_vecs = dict(
        (key, [[[] for i in range(len(struct_profiles))]
               for j in range(len(clades))])
        for key in ('all_time', 'all_mut', 'fiftyfifty'))

    aamuts, aatimes, aairr, aagaps = [], [], [], []
    for idx_clade, c in enumerate(clades):
        if len(c.get_terminals()) < 3:
            print('SKIPPPING CUZ SUBTREE TOO SMALL')
            continue

        # Skip clades in which some sequence name occurs more than once.
        c_ids = [n.m['seq'].name for n in c.get_terminals()]
        copies = [len(list(g)) for k, g in it.groupby(sorted(c_ids))]
        if len(nonzero(greater(copies, 1))[0]) > 0:
            print('SKIPPING CUZ THERE ARE TWO COPIES OF SOME F*****G SEQUENCE IN TREE')
            continue

        all_muts, all_times, all_gaps, all_irr = [], [], [], []
        # print('') emits a single blank line whether ``print`` is a
        # statement or a function.
        print('')
        print('Clade: {0}'.format(idx_clade))

        for idx_struct, struct_info in enumerate(
                zip(struct_profiles, exemplar_structs)):
            struct_profile, ex_struct = struct_info

            # Old alignments.
            calis = ba.MultipleSeqAlignment(
                [n.m['seq'] for n in c.get_terminals()])
            # New alignments and reference structure.
            c_new_ali, stk, struct = infernal.alignment(calis,
                                                        struct_profile, rfid)
            # Reference structure pairs.
            pairs = rutils.stk_pairs(struct)
            if len(pairs) != len(ex_struct):
                raise Exception()

            cterms = c.get_terminals()
            for i2, ct in enumerate(cterms):
                lilid = 'N{0}'.format(i2)
                ct.name = lilid
                ct.m['str_seq'] = c_new_ali[i2]
                ct.m['str_seq'].id = lilid
                ct.m['probs'] = ones(len(c_new_ali[i2]))

            # Build a tree.
            tr = phy.BaseTree.Tree(c)

            # Run PAML.  The structure index is positional field 1; using
            # field 0 for both parts would give every structure of a clade
            # the same run id.
            paml_run_id = 'ali_anc_c{0:04}_s{1:03}'.format(idx_clade,
                                                           idx_struct)
            rstfile = paml.run_paml(tr, c_new_ali, run_id = paml_run_id)
            anc_tree = paml.rst_parser(rstfile)

            # Label extant and internal nodes with sequences.
            for term in anc_tree.get_terminals():
                # Terminals have old (rfam) alis and new (infernal) alis.
                # A list is built explicitly so that indexing works
                # regardless of what ``filter`` returns.
                term.m = [x for x in cterms if x.name == term.name][0].m
            for node in anc_tree.get_nonterminals():
                # Internals only have new alis: m['str_seq'] is m['seq'].
                node.m['str_seq'] = node.m['seq']
                node.m['str_seq'].seq = \
                    node.m['str_seq'].seq.replace('T', 'U')

            subtree = anc_tree

            # Evaluate all of the structs on the first pass
            # to have access to mean frequencies of different
            # mutational types in the final score computation.
            refnode, refseq = subtree_refseq(subtree, method = refseq_method)
            muts, times, gaps, irresolvables = subtree_count_struct(subtree,
                                                                    pairs)
            all_muts.append(muts)
            all_times.append(times)
            all_gaps.append(gaps)
            all_irr.append(irresolvables)

        compute_signatures(all_vecs, idx_clade,
                           all_muts, all_times,
                           exemplar_structs, ungapped_ref)
        aamuts.append(all_muts)
        aatimes.append(all_times)
        aairr.append(all_irr)
        aagaps.append(all_gaps)

    # NOTE(review): inds, pca_vecs and title are not assigned anywhere in
    # this function; their assignments appear to have been commented out.
    outputs = {
        'all_vecs': all_vecs,
        'all_muts': aamuts,
        'all_times': aatimes,
        'exemplar_structs': exemplar_structs,
        'reference_seq': ungapped_ref,
        'thermo_ex_inds': inds,
        'thermo_embedding': pca_vecs,
        'title': title,
        'thermo_aff_type': aff_type,
        'tree': tree,
        'run_id': run_id
        }

    # The with-block closes the file even if pickling fails.  The mode is
    # left as 'w' so existing readers of these files keep working.
    out_path = cfg.dataPath('cs874/runs/{0}.pickle'.format(run_id))
    with open(out_path, 'w') as out_file:
        pickle.dump(outputs, out_file)
    return(outputs)