Esempio n. 1
0
def family_clustered_suboptimals(rfid, plots = True, num = 5000, min_count = 2,
                                 n_countsorted = 10, n_esorted = 10, 
                                 draw = False, cluster_type = 'just_list',
                                 savename = None):
    '''
Sample suboptimal structures for the reference sequence of an Rfam
family, merge identical structures and select exemplars from them.

The alignment and tree are fetched with rfam.get_fam(rfid). Every
tree terminal receives an ``m`` dict with its alignment record
('seq') and a list of ones of the same length ('probs'). Terminals
whose name does not contain an '_<id>_' token with a '/' in the id
get an empty sequence instead.

   kwds:
     rfid          Family id handed to the rfam / folding helpers.
     num           [5000]  Passed as ``n`` to suboptimals().
     draw          [False] Forwarded to the exemplar selector; with
                           cluster_type 'just_list' the exemplars
                           are also drawn and saved as a .ps file.
     cluster_type  ['just_list'] 'just_list' or 'full_clustering'.
     savename      [None]  Used in the figure file name; falls back
                           to rfid.
     plots, min_count, n_countsorted, n_esorted
                           Accepted but not used by this function.

Returns None; the only outputs are the annotations left on the tree
terminals and the optional figure.
'''
    import traceback

    if savename is None:
        savename = rfid
    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    # Compiled once instead of on every iteration.
    id_pattern = re.compile('_([^_]*)_')
    for terminal in tree.get_terminals():
        match = id_pattern.search(terminal.name)
        if match and '/' in match.group(1):
            this_seq = ali[ali_ids.index(match.group(1))]
        else:
            # No usable alignment id in the terminal name.
            this_seq = []
        terminal.m = {'seq': this_seq,
                      'probs': [1] * len(this_seq)}

    big_refnode, big_refseq = subtree_refseq(tree)
    ungapped_ref = ungapped_seq(big_refseq, rfid)
    seq = ungapped_ref
    structs = suboptimals(ungapped_ref, sp_method = 'sample', name = rfid, n = num)

    # Collapse identical structures: sort (index, stk) pairs by their stk
    # representation, group equal ones, keep the first index of each group
    # together with the group size.
    stks = [pairs_stk(s, len(seq)) for s in structs]
    stk_srt = sorted(enumerate(stks), key = lambda x: x[1])
    stk_groups = [list(g) for k, g in it.groupby(stk_srt, key = lambda x: x[1])]
    stk_unq, struct_counts = zip(*[(g[0][0], len(g)) for g in stk_groups])
    structs = [structs[elt] for elt in stk_unq]

    if cluster_type == 'full_clustering':
        final_structs, final_energies = select_exemplars_from_clustering(
            structs, struct_counts, seq, draw = draw)
        # NOTE(review): this branch returns before the drawing step below,
        # unlike 'just_list' -- confirm that this is intended.
        return
    elif cluster_type == 'just_list':
        final_structs, final_energies = select_exemplars_from_list(
            structs, struct_counts, seq, draw = draw)
    # Any other cluster_type leaves final_structs unbound; with draw set,
    # the resulting NameError is reported by the handler below.

    if draw:
        # Drawing is best effort: a failure must not abort the caller, but
        # the traceback is printed so the cause is not lost.
        try:
            print('DRAWING final subopts')
            verts = struct_verts(final_structs, seq, rfid)
            show_subopts(final_structs, verts, final_energies)
            f = plt.gcf()
            f.savefig(cfg.dataPath('figs/RNAfoldz/exemplars_{0}.ps'.format(savename)))
        except Exception:
            print("EXCEPTION!")
            traceback.print_exc()
Esempio n. 2
0
def get_consensus(rfid = 'RF00', mweight = .5, 
                  refseq_method = 'root', sp_method = 'sample',
                  aff_type = 'pairs',  reset = True,
                  do_plot = False,  run_id = 'CONS_TEST'):
    '''
Annotate the family tree with alignment records and return the
exemplar structures of the family.

The alignment and tree are fetched with rfam.get_fam(rfid). Every
tree terminal receives an ``m`` dict with its alignment record
('seq') and a list of ones of the same length ('probs'). A terminal
name without an '_<id>_' token raises AttributeError, and an id that
is missing from the alignment raises ValueError.

   kwds:
     rfid           Family id handed to the rfam helpers.
     refseq_method  ['root']   Forwarded to subtree_refseq() and
                               family_exemplar_structs().
     sp_method      ['sample'] Forwarded to
                               family_exemplar_structs().
     aff_type       ['pairs']  Forwarded to
                               family_exemplar_structs().
     run_id         ['CONS_TEST'] Only read by the unreachable part
                               of the function (see NOTE below).
     mweight, reset, do_plot
                               Accepted but not used.

Returns whatever family_exemplar_structs() returns.
'''
    ali, tree, infos = rfam.get_fam(rfid)
    ali_ids = [a.name for a in ali]

    # Compiled once instead of on every iteration.
    id_pattern = re.compile('_([^_]*)_')
    for terminal in tree.get_terminals():
        term_id = id_pattern.search(terminal.name).group(1)
        this_seq = ali[ali_ids.index(term_id)]
        terminal.m = {'seq': this_seq,
                      'probs': [1] * len(this_seq)}

    big_refnode, big_refseq = \
        subtree_refseq(tree, method = refseq_method)
    ungapped_ref = rutils.ungapped_seq(big_refseq, rfid)

    # NOTE(review): the function returns here unconditionally, so nothing
    # below this statement runs. The remainder also reads names that are
    # never bound in this function (exemplar_structs, inds, pca_vecs,
    # title); it is kept for reference until the pipeline is re-enabled.
    return family_exemplar_structs(rfid,
                                   sp_method = sp_method,
                                   refseq_method = refseq_method,
                                   aff_type = aff_type,
                                   )

    # ------------------------------------------------------------------
    # Unreachable from here on (see NOTE above).
    # ------------------------------------------------------------------
    struct_profiles = infernal.profiles(ungapped_ref, exemplar_structs, run_id)

    clades = split_tree(tree)
    # One empty list per (clade, structure profile) pair for each key.
    all_vecs = dict(
        (key, [[[] for i in range(len(struct_profiles))]
               for j in range(len(clades))])
        for key in ('all_time', 'all_mut', 'fiftyfifty'))

    aamuts, aatimes, aairr, aagaps = [], [], [], []
    for idx_clade, c in enumerate(clades):
        if len(c.get_terminals()) < 3:
            print('SKIPPPING CUZ SUBTREE TOO SMALL')
            continue
        c_ids = [n.m['seq'].name for n in c.get_terminals()]
        # Skip clades in which some sequence name occurs more than once.
        if len(set(c_ids)) < len(c_ids):
            print('SKIPPING CUZ THERE ARE TWO COPIES OF SOME F*****G SEQUENCE IN TREE')
            continue
        all_muts, all_times, all_gaps, all_irr = [], [], [], []
        print('')
        print('Clade: {0}'.format(idx_clade))
        for idx_struct, struct_info in enumerate(zip(struct_profiles, exemplar_structs)):
            struct_profile, ex_struct = struct_info

            #OLD ALIGNMENTS
            calis = ba.MultipleSeqAlignment(
                [n.m['seq'] for n in c.get_terminals()])
            #NEW ALIGNMENTS AND REF STRUCTURE
            c_new_ali, stk, struct = infernal.alignment(calis, struct_profile, rfid)
            #REF STRUCTURE PAIRS
            pairs = rutils.stk_pairs(struct)
            if len(pairs) != len(ex_struct):
                raise Exception('pair count of the realigned structure '
                                'differs from the exemplar structure')

            cterms = c.get_terminals()
            for i2, ct in enumerate(cterms):
                lilid = 'N{0}'.format(i2)
                ct.name = lilid
                ct.m['str_seq'] = c_new_ali[i2]
                ct.m['str_seq'].id = lilid
                ct.m['probs'] = ones(len(c_new_ali[i2]))

            #BUILD A TREE
            tr = phy.BaseTree.Tree(c)

            #RUN PAML
            # The second field must take idx_struct; the original format
            # string used index 0 twice and so repeated idx_clade.
            paml_run_id = 'ali_anc_c{0:04}_s{1:03}'.format(idx_clade, idx_struct)
            rstfile = paml.run_paml(tr, c_new_ali, run_id = paml_run_id)
            anc_tree = paml.rst_parser(rstfile)

            #Label extent and internal nodes with sequences.
            for term in anc_tree.get_terminals():
                #Terminals have old (rfam) alis and new (infernal) alis
                term.m = [x for x in cterms if x.name == term.name][0].m
            for node in anc_tree.get_nonterminals():
                #Internals only have new alis. m['seq'] = m['str_seq']
                node.m['str_seq'] = node.m['seq']
                node.m['str_seq'].seq = node.m['str_seq'].seq.replace('T', 'U')
            subtree = anc_tree

            #Evaluate all of the structs on the first pass
            #to have access to mean frequencies of different
            #mutational types in the final score computation
            refnode, refseq = subtree_refseq(subtree, method = refseq_method)
            muts, times, gaps, irresolvables = subtree_count_struct(subtree, pairs)
            all_muts.append(muts)
            all_times.append(times)
            all_gaps.append(gaps)
            all_irr.append(irresolvables)

        compute_signatures(all_vecs, idx_clade,
                           all_muts, all_times,
                           exemplar_structs, ungapped_ref)

        aamuts.append(all_muts)
        aatimes.append(all_times)
        aairr.append(all_irr)
        aagaps.append(all_gaps)
    outputs = {
        'all_vecs': all_vecs,
        'all_muts': aamuts,
        'all_times': aatimes,
        'exemplar_structs': exemplar_structs,
        'reference_seq': ungapped_ref,
        'thermo_ex_inds': inds,
        'thermo_embedding': pca_vecs,
        'title': title,
        'thermo_aff_type': aff_type,
        'tree': tree,
        'run_id': run_id,
        }

    # Binary mode plus a context manager: the handle is closed even when
    # pickling fails.
    with open(cfg.dataPath('cs874/runs/{0}.pickle'.format(run_id)), 'wb') as handle:
        pickle.dump(outputs, handle)
    return outputs
Esempio n. 3
0
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
        draw_distances = draw_all_easy,
        draw_clusters = draw_all_easy,
        draw_single_cluster = draw_all_hard):
    '''
Split the sequences of an rfam family into clusters and return the
alignment records of every cluster.

1) Fetch the alignment and tree with rfam.get_fam(rfid) and obtain a
   distance matrix through mem.getOrSet(setDistances, ...).

2) Cut the distances into clusters with
   maxclust_dists(dists, k = 5, method = 'complete').

3) Return, for each cluster, the list of its alignment records.

   kwds:
     rfid            ['RF00167'] Family id.
     reset           [True]  Forwarded to mem.getOrSet.
     tree            [True]  Currently not consulted; see the
                             NOTE(review) in the body.
     draw_distances          When true, save a scatter plot of tree
                             distances against levenshtein distances.
     draw_clusters           When true, save a scatter plot of the
                             clusters in the first two principal
                             components of the distance matrix.
     draw_single_cluster     Accepted but not used.

Returns a list with one entry per cluster; each entry is the list of
alignment records belonging to that cluster.
'''
    # NOTE(review): the original unpacked the family tree into ``tree`` and
    # thereby overwrote the keyword argument of the same name, so the flag
    # never had any effect. The tree now lives in its own local; whether
    # setDistances should receive the flag instead needs to be confirmed.
    ali, fam_tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        dists_t = seq_dists(ali, rfid, tree = True)
        dists_l = seq_dists(ali, rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        lin = linregress(dtf, dlf)
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10], textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10], textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')

        ax.scatter(dtf, dlf, 100)

        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)

    dists = mem.getOrSet(setDistances, ali = ali, tree = fam_tree, run_id = rfid,
                         register = rfid,
                         on_fail = 'compute',
                         reset = reset)

    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    # Shift the labels down by one (presumably from 1-based to 0-based so
    # they can index the colour table below -- TODO confirm).
    clusters -= 1

    if draw_clusters:

        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists)

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10], textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10], textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')

        ax.scatter(pca_vecs[:,0], pca_vecs[:,1], 20, color = colors)

        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)

    # Group (sequence index, label) pairs by label. groupby only merges
    # adjacent items, hence the sort on the same key.
    by_label = sorted(enumerate(clusters), key = lambda x: x[1])
    cgrps = dict((k, list(g))
                 for k, g in it.groupby(by_label, key = lambda x: x[1]))
    # Materialised once: a list can be indexed on both Python 2 and 3 and
    # the dict order is used consistently for everything below.
    groups = list(cgrps.values())

    # Largest cluster; only needed by the disabled plot below.
    cbig = argmax([len(x) for x in groups])
    cluster_seqs = [elt[0] for elt in groups[cbig]]
    csize = len(cluster_seqs)

    # Disabled diagnostic plot, kept for reference.
    # NOTE(review): the second annotate text has a single placeholder but
    # receives two arguments, so ``n - csize`` is never shown.
    if 0:

        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists)
        colors = [ct[1] if elt in cluster_seqs else ct[0] for elt in range(len(pca_vecs))]

        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. PC0 component for chosen cluster.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10], textcoords = 'offset pixels')
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n  - csize),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10], textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')

        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0], dists[s,:], 200 * exp(-(dists[s,:] / .5) **2), color = colors, alpha = .2)

        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)

    clusters_final = [[elt[0] for elt in members] for members in groups]
    seqs_final = [[ali[idx] for idx in clust] for clust in clusters_final]
    return seqs_final