def family_clustered_suboptimals(rfid, plots = True, num = 5000, min_count = 2,
                                 n_countsorted = 10, n_esorted = 10,
                                 draw = False, cluster_type = 'just_list',
                                 savename = None):
    '''
    Sample suboptimal structures for a family's reference sequence,
    collapse identical ones and select exemplar structures from them.

    Steps:
      1) Load the family with rfam.get_fam and attach to every tree
         terminal a dict `m` holding its alignment record ('seq') and a
         list of ones of the same length ('probs').
      2) Take the reference sequence from subtree_refseq, ungap it and
         request `num` structures from suboptimals(sp_method = 'sample').
      3) Convert each structure with pairs_stk, group equal results and
         keep one structure per group together with the group size.
      4) Hand the unique structures and their counts to the selector
         chosen by `cluster_type`.
      5) If `draw` is set, render the selected structures and save the
         figure under figs/RNAfoldz/exemplars_<savename>.ps.

    kwds:
      rfid           Rfam family id.
      plots          [True]         Not used by this function.
      num            [5000]         Number of structures requested from
                                    suboptimals.
      min_count      [2]            Not used by this function.
      n_countsorted  [10]           Not used by this function.
      n_esorted      [10]           Not used by this function.
      draw           [False]        Forwarded to the selector; also
                                    enables the final figure.
      cluster_type   ['just_list']  'full_clustering' or 'just_list'.
      savename       [None]         Figure name suffix; defaults to rfid.

    returns:
      None on every path.
      NOTE(review): the selected structures/energies are never returned,
      and the 'full_clustering' branch returns before the drawing step.
      Confirm whether a return value was intended.

    raises:
      ValueError if no structures are produced (the zip(*...) unpacking
      needs at least one group) or if a terminal id is missing from the
      alignment names.
    '''
    if savename is None:
        savename = rfid

    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    # The alignment id is taken from the first '_<id>_' segment of the
    # terminal name. Compile the pattern once instead of per terminal.
    term_id_re = re.compile('_([^_]*)_')
    for node in tree.get_terminals():
        match = term_id_re.search(node.name)
        if match and '/' in match.group(1):
            this_seq = ali[ali_ids.index(match.group(1))]
        else:
            # Names without a usable id get an empty placeholder.
            this_seq = []
        node.m = {'seq': this_seq, 'probs': [1] * len(this_seq)}

    big_refnode, big_refseq = subtree_refseq(tree)
    seq = ungapped_seq(big_refseq, rfid)
    structs = suboptimals(seq, sp_method = 'sample', name = rfid, n = num)

    # Collapse duplicate structures: sort (index, stk) pairs by stk, group
    # equal stks, and keep the first index of each group plus its size.
    # sorted() is stable, so the representative is the lowest index.
    stks = [pairs_stk(s, len(seq)) for s in structs]
    stk_srt = sorted(enumerate(stks), key = lambda x: x[1])
    stk_groups = [list(g) for k, g in it.groupby(stk_srt, key = lambda x: x[1])]
    stk_unq, struct_counts = zip(*[(g[0][0], len(g)) for g in stk_groups])
    structs = [structs[elt] for elt in stk_unq]

    if cluster_type == 'full_clustering':
        final_structs, final_energies = select_exemplars_from_clustering(
            structs, struct_counts, seq, draw = draw)
        return
    elif cluster_type == 'just_list':
        final_structs, final_energies = select_exemplars_from_list(
            structs, struct_counts, seq, draw = draw)

    if draw:
        # Drawing is best-effort: a failure must not abort the caller, but
        # the cause is reported instead of being silently discarded.
        try:
            print('DRAWING final subopts')
            verts = struct_verts(final_structs, seq, rfid)
            show_subopts(final_structs, verts, final_energies)
            f = plt.gcf()
            f.savefig(cfg.dataPath(
                'figs/RNAfoldz/exemplars_{0}.ps'.format(savename)))
        except Exception as e:
            print('EXCEPTION! {0}'.format(e))
def get_consensus(rfid = 'RF00', mweight = .5, refseq_method = 'root',
                  sp_method = 'sample', aff_type = 'pairs', reset = True,
                  do_plot = False, run_id = 'CONS_TEST'):
    '''
    Label the family tree with its alignment sequences and return the
    exemplar structures of the family.

    As written, the function returns the result of
    family_exemplar_structs(...) directly. Everything below that return
    (per-clade realignment through infernal, ancestral reconstruction
    through paml, signature computation and pickling of the outputs) is
    unreachable and is kept only as parked work.

    kwds:
      rfid           ['RF00']       Rfam family id passed to rfam.get_fam.
      mweight        [.5]           Not used by this function.
      refseq_method  ['root']       Forwarded as `method` to subtree_refseq
                                    and as `refseq_method` to
                                    family_exemplar_structs.
      sp_method      ['sample']     Forwarded to family_exemplar_structs.
      aff_type       ['pairs']      Forwarded to family_exemplar_structs.
      reset          [True]         Not used by this function.
      do_plot        [False]        Not used (its plotting call is
                                    commented out).
      run_id         ['CONS_TEST']  Only read by the unreachable section.

    returns:
      Whatever family_exemplar_structs returns.

    raises:
      AttributeError if a terminal name has no '_<id>_' segment.
      ValueError if the extracted id is not among the alignment names.
    '''
    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    # The alignment id is the first '_<id>_' segment of the terminal name.
    # Compile the pattern once instead of per terminal.
    term_id_re = re.compile('_([^_]*)_')
    for node in tree.get_terminals():
        term_id = term_id_re.search(node.name).group(1)
        this_seq = ali[ali_ids.index(term_id)]
        node.m = {'seq': this_seq, 'probs': [1] * len(this_seq)}

    #if do_plot : rplots.plot_clusters(inds,{'pca embedding':pca_vecs},title = title,plot3d = True)

    big_refnode, big_refseq = \
        subtree_refseq(tree, method = refseq_method)
    ungapped_ref = rutils.ungapped_seq(big_refseq, rfid)

    #pca_vecs,exemplar_structs =
    return family_exemplar_structs(rfid,
                                   sp_method = sp_method,
                                   refseq_method = refseq_method,
                                   aff_type = aff_type,
                                   )

    # NOTE(review): unreachable from here on because of the return above.
    # The section also reads `exemplar_structs`, `inds`, `pca_vecs` and
    # `title`, none of which is bound in this function any more, so it
    # cannot be re-enabled by just removing the return. Loop nesting was
    # reconstructed while reformatting; confirm before reviving.
    struct_profiles = infernal.profiles(ungapped_ref, exemplar_structs, run_id)
    clades = split_tree(tree)

    def empty_grid():
        # One independent empty list per (clade, structure) cell.
        return [[[] for i in range(len(struct_profiles))]
                for j in range(len(clades))]

    all_vecs = {'all_time': empty_grid(),
                'all_mut': empty_grid(),
                'fiftyfifty': empty_grid()}

    aamuts, aatimes, aairr, aagaps = [], [], [], []
    for idx_clade, c in enumerate(clades):
        if len(c.get_terminals()) < 3:
            print('SKIPPPING CUZ SUBTREE TOO SMALL')
            continue
        c_ids = [n.m['seq'].name for n in c.get_terminals()]
        # Skip clades in which any sequence name occurs more than once.
        if len(set(c_ids)) < len(c_ids):
            print('SKIPPING CUZ THERE ARE TWO COPIES OF SOME F*****G SEQUENCE IN TREE')
            continue

        all_muts, all_times, all_gaps, all_irr = [], [], [], []
        print('')
        print('Clade: {0}'.format(idx_clade))
        for idx_struct, struct_info in enumerate(
                zip(struct_profiles, exemplar_structs)):
            struct_profile, ex_struct = struct_info

            #OLD ALIGNMENTS
            calis = ba.MultipleSeqAlignment(
                [n.m['seq'] for n in c.get_terminals()])
            #NEW ALIGNMENTS AND REF STRUCTURE
            c_new_ali, stk, struct = infernal.alignment(calis,
                                                        struct_profile, rfid)
            #REF STRUCTURE PAIRS
            pairs = rutils.stk_pairs(struct)
            if len(pairs) != len(ex_struct):
                raise Exception(
                    'realigned structure has {0} pairs, exemplar has {1}'
                    .format(len(pairs), len(ex_struct)))

            # Rename terminals to N<i> and attach the realigned records.
            cterms = c.get_terminals()
            for i2, ct in enumerate(cterms):
                lilid = 'N{0}'.format(i2)
                ct.name = lilid
                ct.m['str_seq'] = c_new_ali[i2]
                ct.m['str_seq'].id = lilid
                ct.m['probs'] = ones(len(c_new_ali[i2]))

            #BUILD A TREE
            tr = phy.BaseTree.Tree(c)

            #RUN PAML
            # The second field must use index 1; with '{0:03}' the struct
            # index was ignored and every structure of a clade shared one
            # run id.
            paml_run_id = 'ali_anc_c{0:04}_s{1:03}'.format(idx_clade, idx_struct)
            rstfile = paml.run_paml(tr, c_new_ali, run_id = paml_run_id)
            anc_tree = paml.rst_parser(rstfile)

            #Label extent and internal nodes with sequences.
            for term in anc_tree.get_terminals():
                #Terminals have old (rfam) alis and new (infernal) alis
                term.m = [ct for ct in cterms if ct.name == term.name][0].m
            for node in anc_tree.get_nonterminals():
                #Internals only have new alis. m['seq'] = m['str_seq']
                node.m['str_seq'] = node.m['seq']
                node.m['str_seq'].seq = node.m['str_seq'].seq.replace('T', 'U')
            subtree = anc_tree

            #Evaluate all of the structs on the first pass
            #to have access to mean frequencies of different
            #mutational types in the final score computation
            refnode, refseq = subtree_refseq(subtree, method = refseq_method)
            muts, times, gaps, irresolvables = subtree_count_struct(subtree, pairs)
            all_muts.append(muts)
            all_times.append(times)
            all_gaps.append(gaps)
            all_irr.append(irresolvables)

        compute_signatures(all_vecs, idx_clade,
                           all_muts, all_times,
                           exemplar_structs, ungapped_ref)

        aamuts.append(all_muts)
        aatimes.append(all_times)
        aairr.append(all_irr)
        aagaps.append(all_gaps)

    outputs = {
        'all_vecs': all_vecs,
        'all_muts': aamuts,
        'all_times': aatimes,
        'exemplar_structs': exemplar_structs,
        'reference_seq': ungapped_ref,
        'thermo_ex_inds': inds,
        'thermo_embedding': pca_vecs,
        'title': title,
        'thermo_aff_type': aff_type,
        'tree': tree,
        'run_id': run_id
        }

    # Pickle to a binary handle that is closed deterministically.
    with open(cfg.dataPath('cs874/runs/{0}.pickle'.format(run_id)), 'wb') as fh:
        pickle.dump(outputs, fh)
    return(outputs)
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
                   draw_distances = draw_all_easy,
                   draw_clusters = draw_all_easy,
                   draw_single_cluster = draw_all_hard):
    '''
    Cluster the sequences of an Rfam family by pairwise distance and
    return the alignment records grouped by cluster.

    1) Optionally (draw_distances) compute both distance matrices with
       seq_dists (tree = True and tree = False), regress one on the
       other and save a scatter plot of the two.

    2) Fetch the distance matrix through mem.getOrSet(setDistances, ...)
       and cut it into clusters with
       maxclust_dists(dists, k = 5, method = 'complete'); the labels are
       shifted down by one.

    3) Optionally (draw_clusters) plot the sequences in the first two
       principal components of the distance matrix, colored by cluster.

    4) Group sequence indices by cluster label and return, per cluster,
       the corresponding alignment records.

    kwds:
      rfid                 ['RF00167']  Rfam family id.
      reset                [True]       Forwarded to mem.getOrSet.
      tree                 [True]       Documented as: use a tree or just
                                        a levenshtein distance to get
                                        distances for init clustering.
                                        NOTE(review): the value is not
                                        consulted; the family tree loaded
                                        from rfam.get_fam is what gets
                                        passed as `tree` to setDistances.
                                        Confirm which was intended.
      draw_distances       Save the distance comparison figure.
      draw_clusters        Save the clustered PCA figure.
      draw_single_cluster  Not used by this function.

    NOTE(review): earlier documentation mentioned a `struct_align`
    keyword; no such parameter exists in the signature.

    returns:
      A list with one entry per cluster, each entry being the list of
      alignment records (ali[idx]) assigned to that cluster.
    '''
    # The local is named fam_tree so the `tree` keyword is not clobbered;
    # what is passed on below is unchanged.
    ali, fam_tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        dists_t = seq_dists(ali, rfid, tree = True)
        dists_l = seq_dists(ali, rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()

        # lin[2] is the third field of the linregress result, squared here.
        lin = linregress(dtf, dlf)
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. \nBioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10], textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10], textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')
        ax.scatter(dtf, dlf, 100)

        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)

    dists = mem.getOrSet(setDistances, ali = ali, tree = fam_tree,
                         run_id = rfid, register = rfid,
                         on_fail = 'compute', reset = reset)

    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    clusters -= 1

    if draw_clusters:
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists)

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10], textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10], textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')
        ax.scatter(pca_vecs[:,0], pca_vecs[:,1], 20, color = colors)

        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)

    #now take the largest cluster and do the analysis.
    # Group (sequence index, label) pairs by label. The groups are
    # materialised once so every later lookup sees the same order and no
    # indexing into dict.values() is needed.
    by_label = sorted(enumerate(clusters), key = lambda x: x[1])
    cgrps = dict((k, list(g))
                 for k, g in it.groupby(by_label, key = lambda x: x[1]))
    cgrp_members = list(cgrps.values())

    cbig = argmax([len(x) for x in cgrp_members])
    cluster_seqs = [elt[0] for elt in cgrp_members[cbig]]
    csize = len(cluster_seqs)

    # Disabled figure for the largest cluster, kept for reference.
    # NOTE(review): the second annotation passes two format arguments but
    # its string has a single placeholder.
    if 0:
        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists)
        colors = [ct[1] if elt in cluster_seqs else ct[0]
                  for elt in range(len(pca_vecs))]

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. \nPC0 component for chosen cluster.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10], textcoords = 'offset pixels')
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n - csize),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10], textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')

        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0], dists[s,:],
                       200 * exp(-(dists[s,:] / .5) **2),
                       color = colors, alpha = .2)

        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)

    clusters_final = [[elt[0] for elt in members] for members in cgrp_members]
    seqs_final = [[ali[idx] for idx in clust] for clust in clusters_final]
    return seqs_final