def get_reinitz_data(**kwargs):
    '''Return one row-window of the Reinitz expression tables.

    keywords:
      ofs          [0]      index of the window of ``nuc_count`` rows to take.
      plot_coords  [False]  save a scatter of the two coordinate columns.
      plot_vals    [False]  save a scatter of the last column against the
                            first coordinate column.

    All keywords are also forwarded to ``datafiles`` through ``mem.rc``.

    returns:
      (coords, values) -- two dicts keyed like the mapping that
      ``datafiles`` returns; ``coords`` holds columns 1:3 and ``values``
      the last column of the selected rows.
    '''
    offset = kwargs.get('ofs', 0)
    want_coord_plot = kwargs.get('plot_coords', False)
    want_value_plot = kwargs.get('plot_vals', False)

    # The result is not used below; the call itself is retained unchanged.
    idm = id_map()
    tables = datafiles(**mem.rc(kwargs))

    # I'm not sure exactly how this dataset works but each nucleus has a
    # bunch of numbers (column 0) that appear to be monotonically
    # increasing.  Only the first instance is taken: the number of distinct
    # column-0 entries in one table gives the window length.
    first_column = dict((name, tbl[:, 0]) for name, tbl in tables.iteritems())
    nuc_count = len(set(first_column.values()[2]))
    window = slice(nuc_count * offset, nuc_count * (offset + 1))

    values = dict((name, tbl[window, -1]) for name, tbl in tables.iteritems())
    coords = dict((name, tbl[window, 1:3]) for name, tbl in tables.iteritems())

    def _scatter_all(pick_y, stem):
        # One scatter series per key; x is always the first coordinate column.
        fig = myplots.fignum(1, (8, 8))
        axes = fig.add_subplot(111)
        palette = mycolors.getct(len(values))
        for idx, name in enumerate(values.keys()):
            axes.scatter(coords[name][:, 0][::1],
                         pick_y(name)[::1],
                         10,
                         edgecolor='none',
                         alpha=.25,
                         c=palette[idx],
                         label=name)
        fig.savefig(myplots.figpath(stem.format(offset)))

    # To check the basic consistency of the data, enable the plot routines.
    # Only the first occurrence of each nucleus is used right now.
    if want_coord_plot:
        _scatter_all(lambda name: coords[name][:, 1],
                     'reinitz_exprdata_coords_nuc_offset={0}')
    if want_value_plot:
        _scatter_all(lambda name: values[name],
                     'reinitz_exprdata_ap_vals_nuc_offset={0}')

    return coords, values
def show_subopts(structs, polys, energies):
    '''Draw a grid of structures ordered by energy and shaded red by energy.

    structs   -- accepted for interface compatibility; not read here.
    polys     -- indexable by an integer index array (one entry per energy).
    energies  -- one number per entry of ``polys``.
    '''
    order = argsort(energies)
    e_arr = array(energies)

    # Rescale energies to [0, 1] and use the result as the red channel.
    unit = (e_arr - min(e_arr)) / (max(e_arr) - min(e_arr))
    shades = [1., 0., 0.] * unit[:, newaxis]

    plf2 = myplots.fignum(7, (10, 10))
    rplots.grid_rnas(polys[order],
                     colors=shades[order],
                     size=(8, 8),
                     dims=[180, 50])
def cluster_2_show(clusters, polys):
    '''Draw a grid of structures grouped by cluster, one colour per cluster.

    clusters -- array of cluster labels, one per entry of ``polys``.
    polys    -- indexable by an integer index array.
    '''
    order = argsort(clusters)

    # One palette entry for each distinct label.
    labels = set(clusters)
    palette = mycolors.getct(len(labels))
    color_of = dict((label, palette[pos]) for pos, label in enumerate(labels))

    plf2 = myplots.fignum(8, (10, 10))
    ordered_colors = [color_of[label] for label in clusters[order]]
    rplots.grid_rnas(polys[order],
                     colors=ordered_colors,
                     size=(5, 5),
                     dims=[180, 50])
def show_rna_structs(xvals, yvals, structs, energies, pfracs, rname, rtype, ns,
                     rfid, figsize, colors, seq, n, selection_type, vert_idxs):
    '''Draw the selected structures at their (x, y) positions and save the figure.

    Only ``xvals``, ``yvals``, ``structs['structs']``, ``colors``, ``seq``,
    ``rfid``, ``figsize``, ``rname``, ``rtype``, ``n``, ``selection_type``
    and ``vert_idxs`` are read; the remaining parameters are accepted but
    unused.  The output path comes from the module-level ``figfile``
    template.

    returns:
      ``vert_idxs`` unchanged.
    '''
    chosen = [structs['structs'][idx] for idx in vert_idxs]
    verts = rutils.struct_verts(chosen, seq, rfid)

    fig = myplots.fignum(3, figsize)
    axes = fig.add_subplot(111)
    myplots.padded_limits(axes, xvals, yvals, .2)

    halo_width = 10
    for order, outline in enumerate(verts):
        src = vert_idxs[order]
        dims = [30]
        # Three passes per structure: an opaque white halo low in the
        # z-order, a translucent halo at this structure's own z-level, and
        # finally the coloured stroke on top.
        layers = (
            {'linewidth': halo_width, 'color': 'white', 'alpha': 1,
             'zorder': 1.1},
            {'linewidth': halo_width, 'color': 'white', 'alpha': .8,
             'zorder': order + 2},
            {'linewidth': 2, 'color': colors[src], 'zorder': order + 2},
        )
        for pkw in layers:
            rplots.show_rna([xvals[src], yvals[src]], outline,
                            dims=dims, pkw=pkw)

    axes.set_ylabel('mutation score')
    axes.set_xlabel('free energy (-kCal)')
    axes.annotate('''Suboptimal foldings, positioned by energy and a mutation based evolutionary score. Color indicates a second score from paired BL.''',
                  [0, 1],
                  xycoords='axes fraction',
                  xytext=[10, -10],
                  textcoords='offset pixels',
                  va='top')

    stem = '{3}_frac_silent_{0}_{1}{2}'.format(rname, selection_type, n, rtype)
    fig.savefig(figfile.format(stem))
    return vert_idxs
def show_rna_structs(xvals, yvals, structs, energies, pfracs, rname, rtype, ns,
                     rfid, figsize, colors, seq, n, selection_type, vert_idxs):
    '''Plot the chosen RNA structures over the energy/score plane.

    Reads ``structs['structs']`` at each index in ``vert_idxs``, converts
    them with ``rutils.struct_verts`` and draws each one at
    ``(xvals[i], yvals[i])`` in ``colors[i]``.  ``energies``, ``pfracs``
    and ``ns`` are part of the signature but are not read.  The figure is
    saved through the module-level ``figfile`` template.

    returns:
      ``vert_idxs`` unchanged.
    '''
    picked_structs = [structs['structs'][k] for k in vert_idxs]
    verts = rutils.struct_verts(picked_structs, seq, rfid)

    fig = myplots.fignum(3, figsize)
    axes = fig.add_subplot(111)
    myplots.padded_limits(axes, xvals, yvals, .2)

    shadow_width = 10
    for rank, shape_verts in enumerate(verts):
        point = vert_idxs[rank]
        dims = [30]
        top = rank + 2
        # White underlay first (fixed low z-order), then a softer white
        # shadow and the coloured line, both at this structure's z-level.
        passes = [
            {'linewidth': shadow_width, 'color': 'white', 'alpha': 1,
             'zorder': 1.1},
            {'linewidth': shadow_width, 'color': 'white', 'alpha': .8,
             'zorder': top},
            {'linewidth': 2, 'color': colors[point], 'zorder': top},
        ]
        for line_kw in passes:
            rplots.show_rna([xvals[point], yvals[point]], shape_verts,
                            dims=dims, pkw=line_kw)

    axes.set_ylabel('mutation score')
    axes.set_xlabel('free energy (-kCal)')
    axes.annotate('''Suboptimal foldings, positioned by energy and a mutation based evolutionary score. Color indicates a second score from paired BL.''',
                  [0, 1],
                  xycoords='axes fraction',
                  xytext=[10, -10],
                  textcoords='offset pixels',
                  va='top')

    name = '{3}_frac_silent_{0}_{1}{2}'.format(rname, selection_type, n, rtype)
    fig.savefig(figfile.format(name))
    return vert_idxs
def view2():
    """Scatter every numeric input parameter against the run's improve ratio.

    Every tenth "mcmc" entry of ``batch/outputs`` is loaded (inputs and
    outputs).  One axes per input parameter is stacked vertically, each
    showing the parameter value against ``improve_ratio`` with a small
    random jitter (2% of the data range) on both axes.  String-valued
    parameters get an empty, titled axes.  The figure is written to
    ``figs/soheil/broad_run0_psplits.ps``.

    Returns:
        The list of loaded input dicts (empty when no runs were found).

    Changes from the previous version:
      * Jitter was applied with ``list += ndarray``, which *extends* the
        list with the noise values instead of adding them element-wise, so
        the plot contained the raw points plus a cloud of pure-noise
        points.  The values are now converted to arrays first.
      * A stray unconditional ``raise Exception()`` made the final
        ``return`` unreachable; it has been removed.
      * An empty run directory no longer fails on ``inps[0]``.
    """
    files = [l for l in os.listdir(cfg.dataPath("batch/outputs")) if "mcmc" in l]
    ids = [l[0:10] for l in files][::10]
    inps = [butils.load_data(i, "input") for i in ids]
    outs = [butils.load_data(i, "output") for i in ids]

    # A filter on improve_ratio (> .2) used to be applied here; at present
    # every loaded run is kept.
    if not inps:
        return inps

    def _jitter(vals):
        # Element-wise jitter of 2% of the range.  ``random`` is the
        # module-level name that provides ``rand`` (numpy.random).
        vals = array(vals)
        spread = vals.max() - vals.min()
        return vals + random.rand(*shape(vals)) * spread / 50

    params = list(inps[0].keys())
    height = 1.0 / len(params)
    ratios = [elt.get("improve_ratio") for elt in outs]

    f = myplots.fignum(1, (8, 8))
    for i, p in enumerate(params):
        ax = f.add_axes([0.05, i * height, 0.9, height], title=p)
        xvals = [elt.get(p) for elt in inps]
        if isinstance(xvals[0], str):
            # Non-numeric parameter: leave the axes empty.
            continue
        ax.scatter(_jitter(xvals), _jitter(ratios))
        ax.annotate(p, [0, 0], xycoords="axes fraction", ha="left", va="bottom")

    f.savefig(cfg.dataPath("figs/soheil/broad_run0_psplits.ps"))
    return inps
def tree_similarity(dist1, dist2, run_id,criterion = 'knn', k = 6):
    '''Compare two distance matrices through their k-nearest-neighbour sets.

    For every row the ``k`` nearest columns (skipping the first argsort
    entry) are taken in each matrix.  The figure shows the absolute
    difference of the paired distances on the left and a scatter of
    ``dist1`` against ``dist2`` neighbour distances on the right, annotated
    with the regression R-squared and the mean Jaccard index of the
    neighbour sets.  The figure is saved under ``figs/gpm2``.

    dist1, dist2 -- square 2-D arrays indexed ``[i, j]``.
    run_id, k    -- both appear in the output file name.
    criterion    -- only 'knn' does anything; any other value returns
                    immediately.
    '''
    if criterion != 'knn':
        return

    n_query = len(dist1)
    near1 = argsort(dist1, 1)[:, 1:k + 1]
    near2 = argsort(dist2, 1)[:, 1:k + 1]

    unions = [set(a) | set(b) for a, b in zip(near1, near2)]
    overlaps = [set(a) & set(b) for a, b in zip(near1, near2)]

    # For each row, the (dist1, dist2) pair for every neighbour found in
    # either matrix.
    pair_dists = [array([[dist1[row, nb], dist2[row, nb]] for nb in nbs])
                  for row, nbs in enumerate(unions)]

    # Take the first k distances (ranked by the smaller of the two).
    n_disagreements = [len(pd) - k for pd in pair_dists]
    pair_dists = array([sorted(pd, key=lambda pr: min(pr))[:k]
                        for pd in pair_dists])

    frac_diffs = [abs(diff(pd, 1).flatten()) / mean(pd, 1) for pd in pair_dists]
    abs_diffs = [abs(diff(pd, 1).flatten()) for pd in pair_dists]

    palette = mycolors.getct(n_query)
    fig = myplots.fignum(4, (10, 8))

    left = fig.add_axes([.05, .08, .25, .87])
    seismic.seismic(abs_diffs, ax=left, colors=palette)

    jaccard = mean([float(len(common)) / float(len(both))
                    for common, both in zip(overlaps, unions)])

    right = fig.add_axes([.34, .08, .6, .87])
    for row, pd in enumerate(pair_dists):
        right.scatter(pd[:, 0], pd[:, 1], 20, alpha=.5, color=palette[row])

    fit = linregress(pair_dists[:, :, 0].flatten(),
                     pair_dists[:, :, 1].flatten())
    rsquared = fit[2] ** 2

    right.annotate('NN dists for multi/struct-aligned trees.\nK = {0}'.format(k),
                   [0, 1],
                   xycoords='axes fraction',
                   va='top',
                   xytext=[10, -10],
                   textcoords='offset pixels')
    right.annotate('R-Squared: {0:3.3}\nJaccard Index: {1:3.3}'.format(rsquared, mean(jaccard)),
                   [1, 0],
                   xycoords='axes fraction',
                   ha='right',
                   xytext=[-10, 10],
                   textcoords='offset pixels')
    right.set_xlabel('Muscle aligned tree distances')
    right.set_ylabel('Struct algined tree distances')

    datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_dists_{0}_k{1}.tiff'.format(run_id, k))
    fig.savefig(datafile)
def show_errors(errors, staysames, improves, gnames):
    """Plot smoothed squared errors next to a stacked worse/same/better chart.

    Left axes: median-filtered (window 51) squared error traces for the
    first five entries of ``errors[0]``.  Right axes: a stacked
    ``seismic`` plot of the fractions that get worse, stay the same and
    improve.  ``gnames`` is accepted but not read.  The figure is saved via
    the module-level ``figtemplate``.
    """
    from scipy.signal import medfilt

    figtitle = "show_errors"
    fig = myplots.fignum(3, (12, 6))

    trace_ax = fig.add_axes([0.05, 0.05, 0.25, 0.9])
    for err_block in errors[0:1]:
        for trace in err_block.flatten()[:5]:
            squared = trace.flatten() ** 2
            trace_ax.plot(medfilt(squared, 51))

    # Whatever neither stays the same nor improves must have become worse.
    get_worse = 1 - (array(staysames) + array(improves))

    stack_ax = fig.add_axes([0.3, 0.05, 0.65, 0.9])
    stacked = squeeze([get_worse, staysames, improves])
    seismic.seismic(
        stacked,
        stacked=True,
        colors=[[1, 0, 0], [0, 0, 0], [0, 0, 1]],
        ax=stack_ax,
        linewidth=10,
        label_y=False,
    )

    fig.savefig(figtemplate.format(figtitle))
def check_network(net_name = 'binding', dataset_name = 'reinitz', data_ofs = 4, max_edges = -1, node_restriction = 'reinitz'):
    '''Histogram the expression correlation across the edges of a network.

    net_name         -- 'binding', 'unsup', 'logistic' (looked up in
                        ``comp.get_graphs()``) or 'clusters' (built by
                        ``get_soheil_network``).
    dataset_name     -- 'reinitz', 'bdtnp' or 'tc'.
    data_ofs         -- offset / column selecting the data slice.
    max_edges        -- forwarded to ``get_soheil_network``; also appears in
                        the title and file name.
    node_restriction -- with the 'tc' dataset, 'reinitz' keeps only the keys
                        that are present in the Reinitz data.

    Raises ``Exception`` for an unknown dataset or network name.  The
    figure is written through ``myplots.figpath``.
    '''
    reinitz_keys = set(get_reinitz_data()[1].keys())

    # --- expression values per node -------------------------------------
    if dataset_name == 'reinitz':
        coords, values = get_reinitz_data(ofs = data_ofs)
    elif dataset_name == 'bdtnp':
        data = nio.getBDTNP()
        meta = nio.getBDTNP(misc = True)
        values = dict((gene, rec['vals'][:, data_ofs])
                      for gene, rec in data.iteritems())
        coords = array([meta[axis]['vals'][:, data_ofs] for axis in ('x', 'y')])
    elif dataset_name == 'tc':
        data = nio.getTC()
        if node_restriction == 'reinitz':
            data = dict((gene, rec) for gene, rec in data.iteritems()
                        if gene in reinitz_keys)
        values = data
    else:
        raise Exception('data set {0} not yet implemented'.format(dataset_name))

    # --- network ---------------------------------------------------------
    nets = comp.get_graphs()
    graph_key = {'binding': 'bn', 'unsup': 'unsup', 'logistic': 'logistic'}
    if net_name in graph_key:
        network = nets[graph_key[net_name]]
    elif net_name == 'clusters':
        network = get_soheil_network(max_edges = max_edges,
                                     node_restriction = values.keys())
    else:
        raise Exception('type not implemented: {0}'.format(net_name))

    nodes = values.keys()
    nodes_allowed = set(nodes)

    fig = myplots.fignum(1, (8, 8))
    axes = fig.add_subplot(111)

    # Targets of each node, restricted to nodes that have expression data.
    targets = {}
    for node in nodes:
        if node in network:
            targets[node] = nodes_allowed.intersection(network[node].keys())
        else:
            targets[node] = []

    bins = linspace(-1, 1, 20)
    edges = [(src, dst) for src, dsts in targets.iteritems() for dst in dsts]

    # Correlation coefficient per edge, dropping undefined (NaN) ones.
    all_corrs = [corrcoef(values[tf], values[tg])[0, 1] for tf, tg in edges]
    ccofs = [c for c in all_corrs if not isnan(c)]

    count, kde = make_kde(ccofs)
    axes.hist(ccofs, bins, label = net_name)
    counts = histogram(ccofs, bins)
    # Scale the density estimate to the tallest histogram bar.
    axes.fill_between(bins, kde(bins) * max(counts[0]),
                      label = net_name, zorder = 1, alpha = .5)

    detail = '\n{2} data (data offset={0})\n(net_name={1})\n(max_edges={3})'.format(
        data_ofs, net_name, dataset_name, max_edges)
    myplots.maketitle(axes,
                      'edge correlations kde for {0}'.format(detail),
                      subtitle = 'n_edges = {0}'.format(len(edges)))
    axes.legend()

    stem = 'network_edge_corrs_data_ofs={0}_net={1}_expr={2}_max_edges={3}'.format(
        data_ofs, net_name, dataset_name, max_edges)
    fig.savefig(myplots.figpath(stem))
def srt_heatmap(net = 3, all_module = False):
    '''Sort a GO-term x module matrix, dump it as text and save a heatmap.

    net        -- network number; forwarded to ``load`` and used in the
                  output file names (net 3 gets a taller figure).
    all_module -- when False the row whose GO term is the literal string
                  'all' is removed from the plot (the text dump always
                  contains every row).  ``ValueError`` is raised if no such
                  row exists.

    Files written (under ``cfg.dataPath``):
      daniel/heatmaps_sorted/hm_net<net>.txt
      daniel/heatmaps_sorted/hm_net<net>_<with_all|no_all>.pdf
    File read:
      daniel/go_accession_name_map.txt -- tab-separated ``accession<TAB>name``.

    Both files are now handled with ``with`` so the handles are closed even
    on error (the name-map handle was previously never closed), the
    deprecated ``xreadlines`` is replaced by plain file iteration, and
    unused temporaries were dropped.
    '''
    import compbio.projects.bsort.bsort as bs

    arr, cols, rows = load(net = net,
                           max_go_modules = 15,
                           min_go_size = 5,
                           min_module_size = 10)
    arr2_510, srts = bs.run0(arr = arr, itr = 2, meth = 'moment')
    arr2_510 = arr2_510[:, ::-1]

    # A sort permutation is matched to columns or rows by its length alone.
    csrts = [s for s in srts if len(s) == len(cols)]
    rsrts = [s for s in srts if len(s) == len(rows)]
    for c in csrts:
        cols = cols[c]
    for r in rsrts:
        rows = rows[r]

    txt_path = cfg.dataPath('daniel/heatmaps_sorted/hm_net{0}.txt'.format(net))
    with open(txt_path, 'w') as out:
        out.write('FORMAT: L1 :GO Terms (Columns), L2: Modules (Rows), L3+ Pvals thresholded between .01, .001\n')
        out.write('\t'.join([str(elt) for elt in cols]) + '\n')
        out.write('\t'.join([str(squeeze(elt)) for elt in rows]) + '\n')
        for row in arr2_510:
            out.write('\t'.join(['{0}'.format(elt) for elt in row]) + '\n')

    f = myplots.fignum(3, (8, 40)) if net == 3 else myplots.fignum(3, (8, 10))
    ax = f.add_axes([.4, .05, .55, .9], aspect = 'auto')

    goterms = [str(elt) for elt in cols]
    if not all_module:
        idx_omit = goterms.index('all')
        arr2_510 = vstack((arr2_510[:idx_omit], arr2_510[idx_omit + 1:]))
        goterms = goterms[:idx_omit] + goterms[idx_omit + 1:]

    # Human-readable GO names; accessions without an entry keep their id.
    gotexts = {}
    with open(cfg.dataPath('daniel/go_accession_name_map.txt')) as name_map:
        for line in name_map:
            k, v = line.split('\t')
            gotexts[k] = v
    row_labels = [gotexts.get(g, g).strip() for g in goterms]

    ax.set_yticks(arange(len(goterms)) + .25)
    ax.set_yticklabels(row_labels, size = '4')
    ax.set_xticks([])
    ax.set_xlabel('Modules')
    ax.set_xticks(arange(len(rows)) + .25)
    ax.set_xticklabels([str(int(r[0])) for r in rows],
                       rotation = 90, size = 'small')

    im = ax.imshow(arr2_510[:, :] * 4 + 2,
                   cmap = plt.get_cmap('OrRd'),
                   aspect = 'auto',
                   interpolation = 'nearest')
    plt.colorbar(im)

    pdf_name = 'daniel/heatmaps_sorted/hm_net{0}_{1}.pdf'.format(
        net, 'with_all' if all_module else 'no_all')
    f.savefig(cfg.dataPath(pdf_name))
def show_conservation(fidx = 0, reset = False):
    '''Plot one family's structures in PCA space, coloured by a conservation metric.

    fidx  -- index into the module-level ``flist`` of family numbers.
    reset -- forwarded to both ``mem.getOrSet`` calls.

    Also reads the module-level names ``do_make_subopts``, ``figsize`` and
    ``figfile``.  Nothing is returned; figures are written with
    ``f.savefig``.
    '''
    fnum = flist[fidx]
    rfid = 'RF{0:05}'.format(fnum)
    print rfid
    # Family number 50 is the only one given the 'riboswitch' type.
    if fnum ==50:
        ftype = 'riboswitch'
    else:
        ftype = 'all'
    # ``out`` is not read again in this function.
    out = mem.getOrSet(setFamData, **mem.rc({}, reset =reset, on_fail = 'compute', hardcopy = False, register = 'fdat'+rfid, ftype = ftype, rfid = rfid))
    mvals, tvals, structs = mem.getOrSet(setTree, **mem.rc({},reset = reset, on_fail = 'compute', hardcopy = True, register = 'st'+rfid, rfid = rfid, ftype = ftype))
    # NOTE(review): show_paired_v_energy as it appears in this file returns
    # None when its ``all_times`` argument is empty, which would make this
    # unpacking fail -- confirm the sutils version behaves the same way.
    idxs, tidx = sutils.show_paired_v_energy(rfid,rfid,mvals,tvals,structs,ftype)
    all_pairs = structs['structs']
    all_energies = structs['energies']
    # Per-index selections; none of these four lists is read again below.
    pints,eints, mints, tints = [structs['structs'][i] for i in idxs],\
        [ structs['energies'][i] for i in idxs],\
        [ mvals[tidx][i] for i in idxs],\
        [ tvals[tidx][i] for i in idxs]
    seq = structs['seq']
    if do_make_subopts:
        # Optional extra figure: a grid of 400 suboptimal foldings.
        subopts = rutils.suboptimals(seq, n = 400)
        verts = rutils.struct_verts(subopts, seq, rfid)
        f = myplots.fignum(4,figsize)
        rplots.grid_rnas(verts, dims = [40])
        f.savefig(figfile.format('{0}_grid_rnas'.\
            format(rfid)))
    aff = rutils.struct_affinity_matrix(all_pairs, len(seq))
    pca = rutils.project_structs(all_pairs, ptype ='pca', affinities = aff, n_comp = 3)
    for metric in ['n_comp']:# ['frac_silent','frac_paired','n_comp']:
        # One colour value per structure: the mean of the first item
        # returned by sutils.metric.
        scolors = []
        for i in range(len(tvals[tidx])):
            m_silent, pidxs, frac_good = sutils.metric( mvals[tidx][i],tvals[tidx][i], mtype = metric)
            scolors.append(mean(m_silent))
        # Rescale to [0, 1] and map onto the red channel.
        scolors = myplots.rescale(scolors, [0.,1.])[:,newaxis] * array([1.,0.,0.])
        f = myplots.fignum(4,figsize)
        ax = f.add_subplot(111)
        xvals, yvals = pca[:,:2].T
        myplots.padded_limits(ax, xvals, yvals)
        ax.scatter(xvals,yvals,300,linewidth = 1, edgecolor = 'black', color = scolors)
        # Highlight the structures in ``idxs``: black disc, slightly smaller
        # white disc (leaving a ring), then the coloured marker on top.
        ax.scatter(pca[idxs,0],pca[idxs,1], 2100 ,alpha = 1, color = 'black')
        ax.scatter(pca[idxs,0],pca[idxs,1], 2000 ,alpha = 1, color = 'white')
        ax.scatter(pca[idxs,0],pca[idxs,1], 400 ,alpha = 1, color = scolors[idxs], )
        ax.annotate('''Conservation metric: {0} Projected onto C=2 Principal 
Components'''.format(metric), [0,1],xycoords = 'axes fraction', va = 'top', xytext = [10,-10],textcoords='offset points')
        f.savefig(figfile.format('{0}_pca_{1}'.\
            format(rfid, metric)))
def view3():
    """Exploratory plot of model terms for the first eligible "mcmc" run.

    Lists ``batch/tmp``, keeps runs whose input ``out_iter_num`` is greater
    than 2, and for the first such run scatters (cluster-ranked model index,
    factor) points.  The function always ends by raising ``Exception`` --
    this looks like a deliberate debugging stop; nothing is saved or
    returned.

    NOTE(review): the source was flattened, so the placement of the two
    trailing ``raise`` statements (one inside the loop, one after it) is a
    reconstruction -- confirm against the original file.
    """
    files = [l for l in os.listdir(cfg.dataPath("batch/tmp")) if "mcmc" in l]
    fpaths = [os.path.join(cfg.dataPath("batch/tmp"), f) for f in files]
    ids = [l[0:10] for l in files]
    inps = [butils.load_data(i, "input") for i in ids]
    # Keep only runs with out_iter_num > 2.
    idxs_good = nonzero(greater([elt.get("out_iter_num") for elt in inps], 2))[0]
    inps = [inps[i] for i in idxs_good]
    fpaths = [fpaths[i] for i in idxs_good]
    fig = myplots.fignum(3, (35, 15))
    ax = fig.add_axes([0, 0, 1, 1])
    for f, inp in zip(fpaths, inps):
        # Cannot trigger after the filter above (which requires > 2).
        if inp["out_iter_num"] == 2:
            continue
        print inp["filename"]
        data = sio.loadmat(f)
        import compbio.utils.colors as mycolors

        ct = mycolors.getct(len(data["gene_names"]))
        # Flatten one nesting level per step: model -> terms -> factors.
        term_list = [list(it.chain(*mod)) for mod in data["model"]]
        fac_list = [list(it.chain(*t)) for t in term_list]
        xvals, yvals, colors, rads = [], [], [], []
        for i, terms in enumerate(term_list):
            for j, term in enumerate(terms):
                for k, fact in enumerate(term):
                    # One point per member of the term, all at (i, fact);
                    # radii decrease so the markers nest concentrically.
                    xvals.extend([i] * len(term))
                    yvals.extend([fact] * len(term))
                    colors.extend([ct[c] for c in sorted(term)])
                    rads.extend(((arange(1, len(term) + 1) ** 2) * 50)[::-1])
        # Indicator matrix: vecs[i, f] = 1 when factor f occurs in model i.
        vecs = zeros((len(fac_list), len(fac_list)))
        for i, fl in enumerate(fac_list):
            # NOTE: this rebinds ``f``, the outer loop's file path.
            for f in fl:
                vecs[i, f] = 1
        # plt.imshow(vecs)
        # ax1 = fig.add_subplot(121)
        # ax2 = fig.add_subplot(122)
        import hcluster

        clusters = hcluster.fclusterdata(vecs, 1.1, criterion="inconsistent", method="complete")
        # ax1.imshow(vecs)
        # ax2.imshow(vecs[argsort(clusters)])
        # raise Exception()
        # Rank of each model once the models are ordered by cluster label.
        csrt = argsort(argsort(clusters))
        xvals2 = [csrt[x] for x in xvals]
        # raise Exception()
        plt.scatter(xvals2, yvals, rads, color=colors)
        raise Exception()
    raise Exception()
def view4_show0(cnames, xvals, gvals, yvals, colors, rads, l_info, gnum=59):
    """Plot the motif circles belonging to one gene, ordered by cluster rank.

    ``xvals``, ``yvals``, ``gvals``, ``colors`` and ``rads`` are parallel
    nested sequences (one inner sequence per group); they are flattened and
    restricted to the entries whose gene value equals ``gnum``.  ``cnames``
    and ``l_info`` are accepted but not read.  The figure is saved through
    the module-level ``figtemplate``; nothing is returned.
    """
    seen = set()
    offs_mag = 0.3
    xofs, xnew, yofs, ynew = [], [], [], []  # [xv for xv in xvals], [yv for yv in yvals]
    for v in zip(xvals, yvals, gvals):
        # Position of the group is taken from its first element.
        xy0 = v[0][0], v[1][0]
        xy = tuple([x for x in xy0])
        # check to see if the current xy has been seen.
        # if so increment until unique.
        while 1:
            if xy in seen:
                xy = tuple([elt + offs_mag for elt in xy])
            else:
                break
        # Repeat the (possibly shifted) position, and the shift itself,
        # once per element of the group.
        xnew.append([xy[0] for i in range(len(v[0]))])
        ynew.append([xy[1] for i in range(len(v[0]))])
        xofs.append([xy[0] - xy0[0] for i in range(len(v[0]))])
        yofs.append([xy[1] - xy0[1] for i in range(len(v[0]))])
        # Only groups of the requested gene reserve their position.
        if v[2][0] != gnum:
            continue
        else:
            seen.add(xy)
    # Flat indices of all entries that belong to gene ``gnum``.
    g_equal = nonzero(equal([x for x in it.chain(*gvals)], gnum))[0]
    if len(g_equal) == 0:
        print "G {0} appears to not be in the list".format(gnum)
    gset = set(g_equal)
    xvals_old = xvals
    yvals_old = yvals
    xvals = xnew
    yvals = ynew
    # Flatten every parallel sequence and keep only the selected entries.
    xvals = array(list(it.chain(*xvals)))[g_equal]
    yvals = array(list(it.chain(*yvals)))[g_equal]
    xvals_old = array(list(it.chain(*xvals_old)))[g_equal]
    yvals_old = array(list(it.chain(*yvals_old)))[g_equal]
    xofs = array(list(it.chain(*xofs)))[g_equal]
    yofs = array(list(it.chain(*yofs)))[g_equal]
    colors = array(list(it.chain(*colors)))[g_equal]
    rads = array(list(it.chain(*rads)))[g_equal]
    # Indicator matrix over the original (x, y) values, which are used as
    # integer indices here.
    vecs = zeros((max(xvals_old) + 1, max(yvals_old) + 1))
    for x, y in zip(xvals_old, yvals_old):
        vecs[x, y] = 1.0
    # import hcluster
    # clusters = hcluster.fclusterdata(vecs,.1,criterion='inconsistent',method = 'complete' )
    import mlpy

    HC = mlpy.HCluster(method="euclidean", link="complete")
    clusts = HC.compute(vecs)
    k = 15
    # Cut the hierarchy at the k-th height from the end.
    cut = HC.cut(HC.heights[-k])
    cut_s = sort(cut)
    # Rank of every row when rows are ordered by cluster label.
    crank = argsort(argsort(cut))
    fig = myplots.fignum(3, (35, 15))
    ax = fig.add_axes([0, 0, 1, 1])
    clst_mems = [cut_s[c] for c in crank]
    # One grey level per distinct cluster label.
    clust_colors = ones(3) * linspace(0.2, 0.9, len(set(cut)))[:, newaxis]
    ax.scatter(crank, ones(len(crank)) * 0.5, 1000, color=clust_colors[clst_mems])
    ax.scatter(crank[xvals_old] + xofs, yvals, rads, color=colors)
    # NOTE(review): the annotation is formatted with the literals "gene"
    # and 100, not with ``gnum`` or the real cluster count.
    ax.annotate(
        "Functional motifs for gene: {0}\nIn {1} clusters".format("gene", 100),
        [0, 1],
        xycoords="axes fraction",
        va="top",
    )
    figname = "gene_{0}_motif_recurrence_circles".format(gnum)
    fig.savefig(figtemplate.format(figname))
# NOTE(review): the statements below have no enclosing ``def`` in view and
# use ``fnums``, which is not defined here -- this looks like the body of a
# function whose header was lost.  Confirm before relying on it.
#
# Load one temporary .mat file per run number and scatter "stay_same"
# against "improve_ratio" with a little random jitter.
outputs = [sio.loadmat(cfg.dataPath("batch/tmp/mcmc_{0:05}_tmp001.mat".format(num))) for num in fnums]
douts = []
for output in outputs:
    try:
        # Unpack the MATLAB struct into a plain dict keyed by field name.
        o00 = output["out_struct"][0][0]
        dout = dict([(k, o00[i]) for i, k in enumerate([elt[0] for elt in o00.dtype.descr])])
        douts.append(dout)
    except Exception, e:
        # Outputs that cannot be unpacked are skipped silently.
        continue
ss, ir = array([(squeeze(o["stay_same"]), squeeze(o["improve_ratio"])) for o in douts]).T
# Jitter so coincident points remain distinguishable.
ss += random.rand(*shape(ss)) / 100
ir += random.rand(*shape(ir)) / 100
f = myplots.fignum(1, (8, 8))
f.clear()
ax = f.add_subplot(111)
ax.set_xlabel("Stay Same")
ax.set_ylabel("Improve Ratio")
plt.scatter(ss, ir, 5)


# NOTE(review): this definition stops after loading the inputs and outputs
# (no plotting, no return) and looks truncated.  If it is live at module
# level it rebinds the name ``view2``.
def view2():
    files = [l for l in os.listdir(cfg.dataPath("batch/outputs")) if "mcmc" in l]
    ids = [l[0:10] for l in files]
    # Every tenth run only.
    ids = ids[::10]
    inps = [butils.load_data(i, "input") for i in ids]
    outs = [butils.load_data(i, "output") for i in ids]
def _terminal_distances(tree, index_of):
    '''Return the matrix of pairwise distances between the terminals of ``tree``.

    ``index_of`` maps a terminal's ``name`` to its row/column index.
    '''
    dists = zeros((len(index_of), len(index_of)))
    terminals = tree.get_terminals()
    for n1 in terminals:
        for n2 in terminals:
            dists[index_of[n1.name], index_of[n2.name]] = tree.distance(n1, n2)
    return dists


def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    '''Compare trees built from MUSCLE and from structural (cmalign) alignments.

    For every pair of alignments a BioNJ tree is built from each, the
    pairwise terminal distances are compared with ``tree_similarity``
    (once over all neighbours and once with k = 6), and both trees are
    drawn as graph embeddings into one figure saved under ``figs/gpm2``.

    seqs, profiles -- forwarded to ``setAlignments``.
    run_id         -- used for cache registers and output file names.
    reset          -- forwarded to ``mem.getOrSet``.

    Fix: the "all neighbours" comparison used ``k = len(sdists - 1)``, which
    subtracts 1 from the matrix and so evaluates to ``len(sdists)``; it is
    now ``len(sdists) - 1`` (every terminal except the query itself).  This
    changes the ``k`` shown in that figure's annotation and file name.
    '''
    print('computing alignments...')
    print('  ...using muscle')
    malis, mrefs, mpairs =\
        mem.getOrSet(setAlignments,
                     **mem.rc({},
                              seqs = seqs, profiles = profiles,
                              run_id = run_id, ali_type = 'muscle',
                              reset = reset,
                              on_fail = 'compute',
                              register = 'tuali_musc_{0}'.format(run_id)))
    print('  ...using cmalign.')
    salis, srefs, spairs =\
        mem.getOrSet(setAlignments,
                     **mem.rc({},
                              seqs = seqs, profiles = profiles,
                              run_id = run_id, ali_type = 'struct',
                              reset = reset,
                              on_fail = 'compute',
                              register = 'tuali__struct_{0}'.format(run_id)))

    print('  ...making trees.')
    for idx, alis in enumerate(zip(malis, salis)):
        m, s = alis
        mtree = phyml.tree(m, run_id, bionj = True)
        stree = phyml.tree(s, run_id, bionj = True)

        # Sequence id -> matrix index, taken from the MUSCLE alignment order.
        maps = dict([(elt.id, i) for i, elt in enumerate(m)])
        mdists = _terminal_distances(mtree, maps)
        sdists = _terminal_distances(stree, maps)

        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k = len(sdists) - 1)
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k = 6)

        f = myplots.fignum(4, (8, 10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        import networkx
        for t, sp, ttype in zip([mtree, stree], [211, 212], ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            # NOTE(review): pygraphviz_layout / node_labels /
            # discard_old_labels are used exactly as before; they look tied
            # to the networkx version this code was written against.
            Gi = networkx.convert_node_labels_to_integers(G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            # Only terminals (names present in ``maps``) get a visible marker.
            networkx.draw(G, posn, labels = dict([(n, '') for n in G.nodes()]),
                          node_size = [100 if n.name in maps else 0 for n in G.nodes()],
                          width = 1, edge_color = 'black',
                          ax = a,
                          node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()])

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                       [0, 1], xycoords = 'axes fraction', va = 'top',
                       xytext = [10, 0], textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                       [1, 0], xycoords = 'axes fraction', ha = 'right',
                       xytext = [-10, 10], textcoords = 'offset pixels')

        #phylo.draw_graphviz(  mtree,  label_func = lambda x: '',
        #                      node_color = [ct[maps.get(n.name, -1)] for n in G.nodes()] +\
        #                          [ct[0] for n in mtree.get_nonterminals()], axes = ax)

        datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')
def show_paired_v_energy(rname,rfid, all_muts, all_times, structs,rtype):
    '''Scatter structures by energy against a mutation score; save two figures.

    The tree whose mean ``total_time_res`` is largest is chosen as the
    focus tree.  For it, two scatter plots (energy against a "silent"
    mutation score) are saved through the module-level ``figfile``
    template: one coloured by the 'frac_silent' metric, one by
    'frac_paired'.  Depending on the module-level flags ``draw_single`` and
    ``draw_many``, selected structures are then drawn with
    ``show_rna_structs``.

    returns:
      (idxs, focus_tree), or None when ``all_times`` is empty.
      NOTE(review): callers that unpack two values will fail on the None
      case -- confirm this is intended.
    '''
    if all_times == {}:
        return
    # Per-tree means over all structures; resolved_frac and total_lens are
    # computed but not read again below.
    resolved_frac = [ mean(list(it.chain(*[s_times['frac_resolved'] for s_times in t_times.values()]))) for t_times in all_times.values()]
    total_lens = [mean(list(it.chain(*[s_times['total_time'] for s_times in t_times.values()]))) for t_times in all_times.values()]
    total_lens_res = [mean(list(it.chain(*[s_times['total_time_res'] for s_times in t_times.values()]))) for t_times in all_times.values()]
    focus_tree = all_times.keys()[argmax(total_lens_res)]
    muts = all_muts[focus_tree]
    times = all_times[focus_tree]
    ns = len(muts.keys())
    # Work on a shallow copy so the caller's dict is not modified; keep only
    # the first ``ns`` energies and structures.
    s2 = dict(structs)
    s2['energies'] = s2['energies'][:ns]
    s2['structs'] = s2['structs'][:ns]
    structs = s2
    energies = structs['energies']
    # ---- figure 1: coloured by the 'frac_silent' metric -----------------
    f = myplots.fignum(3,figsize)
    xvals, yvals , pfracs, ugfracs = [], [], [], []
    for i, vals in enumerate(zip(muts.values(),times.values())):
        mvals, tvals = vals
        xvals.append( energies[i])
        frac_ug = metric(mvals, tvals, 'frac_ug')[0]
        pfrac,pinds,frac_good = metric(mvals, tvals, 'frac_silent')
        sfrac = metric(mvals, tvals, 'frac_silent')[0]
        ugfracs.append(frac_ug)
        pfracs.append( mean(pfrac))
        yvals.append( mean(sfrac)*frac_good)
    # Rescale the colour values to [0, 1] and map them to the green channel.
    colors = array(pfracs)
    colors = (colors - min(colors)) /(max(colors) - min(colors))
    colors = colors[:,newaxis] * [0,1,0]
    ax = f.add_subplot(111)
    ax.scatter(xvals,yvals,array(ugfracs) * 200, color = colors)
    ax.set_ylabel('mutation score')
    ax.set_xlabel('free energy (-kCal)')
    ax.annotate('''Evaluated structures positioned by energy and a mutation based evolutionary score. Color indicates fractional frequency of double mutants. 
Radius indicates percentage of ungapped base pairs.''' , [0,1],xycoords = 'axes fraction', xytext = [10,-10], textcoords='offset pixels', va = 'top')
    myplots.padded_limits(ax, xvals, yvals, .2)
    f.savefig(figfile.format('{1}_frac_double_{0}'.format(rname,rtype)))
    f.clear()
    # These three assignments are overwritten after the next loop before
    # ``colors`` is read again.
    colors = array(pfracs)
    colors = (colors - min(colors)) /(max(colors) - min(colors))
    colors = colors[:,newaxis] * [1,0,0]
    # ---- figure 2: coloured by the 'frac_paired' metric -----------------
    f = myplots.fignum(3,figsize)
    xvals, yvals , pfracs, ugfracs = [], [], [], []
    for i, vals in enumerate(zip(muts.values(),times.values())):
        mvals, tvals = vals
        xvals.append( energies[i])
        frac_ug = metric(mvals, tvals, 'frac_ug')[0]
        pfrac,pinds,frac_good = metric(mvals, tvals, 'frac_paired')
        sfrac = metric(mvals, tvals, 'frac_silent')[0]
        ugfracs.append(frac_ug)
        pfracs.append( mean(pfrac)*frac_good)
        yvals.append( mean(sfrac)*frac_good)
    # Same rescaling, this time onto the red channel.
    colors = array(pfracs)
    colors = (colors - min(colors)) /(max(colors) - min(colors))
    colors = colors[:,newaxis] * [1,0,0]
    ax = f.add_subplot(111)
    ax.scatter(xvals,yvals,array(ugfracs) * 200, color = colors)
    ax.set_ylabel('mutation score')
    ax.set_xlabel('free energy (-kCal)')
    ax.annotate('''Evaluated structures positioned by energy and a mutation based evolutionary score. Color indicates a second score from paired BL. 
Radius indicates percentage of ungapped base pairs.''' , [0,1], xycoords = 'axes fraction', xytext = [10,-10], textcoords='offset pixels', va = 'top')
    myplots.padded_limits(ax, xvals, yvals, .2)
    f.savefig(figfile.format('{1}_frac_silent_{0}'.format(rname,rtype)))
    seq = structs['seq']
    # The returned indices always come from this (4, 'both') selection.
    n, selection_type = [4,'both']
    idxs = get_interesting_inds(xvals, yvals, structs, energies, pfracs, rname, rtype, ns,rfid,figsize, colors, seq,n,selection_type)
    if draw_single:
        show_rna_structs(xvals, yvals, structs, energies, pfracs, rname, rtype, ns,rfid,figsize, colors, seq,n,selection_type, idxs)
    if draw_many:
        # Additional drawings for other selections; their indices are not
        # returned.
        for n, selection_type in \
            [[5,'ptime'],[5,'energy'],[ns,'energy']]:
            m_idxs = get_interesting_inds(xvals, yvals, structs, energies, pfracs, rname, rtype, ns,rfid,figsize, colors, seq,n,selection_type)
            show_rna_structs(xvals, yvals, structs, energies, pfracs, rname, rtype, ns,rfid,figsize, colors, seq,n,selection_type, m_idxs)
    return idxs,focus_tree
def old_clusters():
    '''Plot per-cluster variance against cut height and mark a preferred cut.

    Works entirely on module-level state: ``HC`` (a hierarchical clustering
    with ``heights`` and ``cut``), ``cvecs`` (the clustered vectors) and the
    flag ``do_rnd``.  The upper axes shows the size-normalised variance of
    every cluster at each cut height; the lower axes shows between- and
    within-cluster variance against the number of clusters and marks the
    maximum of their ratio.  Nothing is returned or saved.
    '''
    plf = myplots.fignum(6, (8,8))
    plf.clear()
    ax = plf.add_subplot(211)
    # Lower bound on the linkage heights that are considered.
    if do_rnd:
        hstart = .15
    else:
        hstart = 3.
    all_vars = []
    all_vars_n = []
    all_clusters = []
    ks = []
    all_Bvars, all_Wvars = [], []
    # Log of the heights above hstart: the last len(cvecs)/2 of them, minus
    # the final one.
    # NOTE(review): ``-len(cvecs)/2`` is an integer only under Python 2
    # division -- confirm the interpreter version.
    theights = log(HC.heights[greater(HC.heights,hstart)][-len(cvecs)/2:][:-1])
    for xval, h in enumerate(theights):
        clustering = HC.cut(exp(h))
        casrt = argsort(clustering)
        csrtd = clustering[casrt]
        # cluster label -> array of (member index, label) rows.
        d = dict([(k,array(list(g))) for k, g in it.groupby(zip(casrt,csrtd), key = lambda x: x[1])])
        lens = array([len(v) for v in d.values()],float)
        nlens = lens / max(lens)
        cmeans = array([mean(cvecs[idxs[:,0],:],0) for idxs in d.values()])
        # Within-cluster term: sum over clusters of the mean squared
        # deviation from the cluster mean.
        Wvars = np.sum(array([np.mean( (cvecs[idxs[:,0],:] - cmeans[i]) **2 ) for i, idxs in enumerate(d.values())]))
        # Between-cluster term: size-weighted squared deviation of the
        # cluster means from the overall mean.
        Bvars = np.sum(lens[:,newaxis]* ( (cmeans - mean(cvecs,0)) **2) )
        ks.append(len(d))
        all_Wvars.append(Wvars)
        all_Bvars.append(Bvars)
        cluster_vars =array([ sum(var(cvecs[idxs[:,0],:],0)) for idxs in d.values() ])
        cluster_vars_n =array([ sum(var(cvecs[idxs[:,0],:],0)) for idxs in d.values() ])/(lens)
        all_clusters.append([cvecs[idxs[:,0]] for idxs in d.values() ])
        all_vars.append(cluster_vars)
        all_vars_n.append(cluster_vars_n)
        # Shade of green by the rank of the cluster's size.
        colors = array(argsort(argsort(lens)),float)/len(lens)
        ax.scatter(0*(cluster_vars) + h , cluster_vars_n, 20, color = array(array([0.,1.,0.]) * colors[:,newaxis]))
    ax2 = plf.add_subplot(212)
    all_Bvars = array(all_Bvars)
    all_Wvars = array(all_Wvars)
    # Hard-coded switch: only the ``else`` branch below runs.
    density_based = False
    HC_based = True
    if density_based:
        #ax3 = plf.add_subplot(212,frameon = False)
        #COMPUTE A COALESCENCE RATE OF CLUSTERS
        #(divide the pde for heights by clustering size)
        from scipy.stats import gaussian_kde
        data = (theights)
        density = gaussian_kde(data)
        density_rate = 1. / array([len(v) for v in all_vars])
        xs = (theights)
        density.covariance_factor = lambda : .25
        density._compute_covariance()
        yvals , colors = array([density(xs), density(xs) * density_rate, [sum(v/[len(c) for c in cs] ) for v,cs in zip(all_vars,all_clusters) ]]),\
            [[1,0,0],[0,1,0],[0,0,1]]
        # Normalise each curve to a maximum of 1.
        yvals /= np.max(yvals,1)[:,newaxis]
        xvals = (xs) + zeros(len(yvals))[:,newaxis]
        #for i in range(len(yvals)):
        #    ax2.plot(xvals[i],yvals[i], color = colors[i])
        dens_n = yvals[1]
        #dens_n[greater(dens_n,percentile(dens_n,60))] = percentile(dens_n,60)
        #dens_n[less(dens_n, percentile(dens_n,.25))] = percentile(dens_n,25)
        diff = dens_n - yvals[2]
        ax2.plot(xvals[0], yvals[1] - yvals[2], linewidth = 10)
        import scipy.signal as ss
        #diff =ss.medfilt(diff,3)
        ax2.plot(xvals[0], diff, linewidth = 5, color = 'orange')
        mpt = argmin(diff)
        #m#pt = len(yvals[0]) - 3
        ax2.scatter([xvals[0][mpt]],diff[mpt], 200, color = 'red')
        ax2.plot(xvals[0],yvals[1])
        #ax2.plot(histogram(log(HC.heights[greater(HC.heights,hstart)]))[1][:-1],
        #         histogram(log(HC.heights[greater(HC.heights,hstart)]))[0])
        #raise Exception()
    else:
        ax2.plot(ks,all_Bvars)
        ax2.plot(ks,all_Wvars)
        # (between / within) divided by ((k - 1) / (n - k)), with k clusters
        # and n = len(cvecs) vectors.
        hcfun = array(all_Bvars) / array(all_Wvars) /\
            ((array(ks,float) -1) / array(float(len(cvecs)) - array(ks,float)))
        hcfun = nan_to_num(hcfun)
        mpt = argmax(hcfun)
        a3 = plf.add_subplot(212, frameon = False)
        a3.plot(ks, hcfun)
        # Mark the cluster count at which the ratio peaks on both curves.
        ax2.scatter([ks[mpt]]*2, [all_Bvars[mpt],all_Wvars[mpt]], 200, color = 'orange')
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True, draw_distances = draw_all_easy, draw_clusters = draw_all_easy, draw_single_cluster = draw_all_hard):
    '''
    Cluster the sequences of an Rfam family and return them grouped by cluster.

    1) Compute clusters using a distance measure derived either from
       phyml or from a simple Levenshtein distance.

       kwds:
         tree          [True]  Use a tree or just a Levenshtein distance
                               to get distances for the initial clustering.
                               NOTE: the value passed in is overwritten by
                               the tree returned from rfam.get_fam before
                               it is read.

    2) Choose a cluster of well related sequences and, for this cluster,
       compute an alignment (for each structure using phase, or for
       sequences using MUSCLE).
       NOTE(review): no alignment is computed in this function as written,
       and it has no ``struct_align`` keyword -- this step appears to
       describe a caller.

    other kwds:
      rfid                 Rfam family id.
      reset                forwarded to mem.getOrSet.
      draw_distances       save a Levenshtein-vs-tree distance scatter.
      draw_clusters        save a PCA scatter coloured by cluster.
      draw_single_cluster  accepted but not read in this function.

    returns:
      a list with one entry per cluster, each a list of the alignment
      records belonging to that cluster.
    '''
    # ``rutils`` is bound here but not used below.
    rutils = utils
    ali, tree, infos = rfam.get_fam(rfid)
    n = len(ali)
    if draw_distances:
        dists_t = seq_dists(ali,rfid, tree = True)
        dists_l = seq_dists(ali,rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        lin = linregress(dtf, dlf)
        rsquared = lin[2]**2
        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths', [0,1], xycoords = 'axes fraction', va = 'top', xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared), [1,0], xycoords = 'axes fraction', ha = 'right', xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')
        ax.scatter(dtf, dlf, 100)
        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)
    dists = mem.getOrSet(setDistances, ali = ali, tree = tree, run_id = rfid, register = rfid, on_fail = 'compute', reset = reset)
    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    # Shift the labels down by one so they can index the colour table.
    clusters -= 1
    if draw_clusters:
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists)
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.', [0,1], xycoords = 'axes fraction', va = 'top', xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)), [1,0], xycoords = 'axes fraction', ha = 'right', xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')
        ax.scatter(pca_vecs[:,0],pca_vecs[:,1], 20, color = colors)
        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)
    #now take the largest cluster and do the analysis.
    # cluster label -> list of (sequence index, label) pairs.
    cgrps = dict([ (k, list(g)) for k , g in it.groupby(\
        sorted( list(enumerate(clusters)),key = lambda x: x[1]), key = lambda x: x[1])])
    cbig = argmax([len(x) for x in cgrps.values()])
    cluster_seqs = [ elt[0] for elt in cgrps.values()[cbig] ]
    csize = len(cluster_seqs)
    # ``seqs`` (the largest cluster) is not part of the return value.
    seqs =[ali[c] for c in cluster_seqs]
    # Disabled diagnostic plot (``if 0`` never runs).
    if 0:
        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists)
        colors =[ct[1] if elt in cluster_seqs else ct[0] for elt in range(len(pca_vecs))]
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. PC0 component for chosen cluster.', [0,1], xycoords = 'axes fraction', va = 'top', xytext = [10,-10],textcoords = 'offset pixels')
        # NOTE(review): two values are passed but the template has only one
        # placeholder, so the second value is never shown.
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n - csize), [1,0], xycoords = 'axes fraction', ha = 'right', xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')
        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0],dists[s,:] ,200 *exp(-(dists[s,:] / .5) **2), color = colors, alpha = .2)
        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)
    # Sequence indices, then the alignment records, for every cluster.
    clusters_final = [ [ elt[0] for elt in cgrps.values()[i] ] for i in range(len(cgrps.values()))]
    seqs_final = [ [ ali[idx] for idx in clust ] for clust in clusters_final]
    return seqs_final
def show_paired_v_energy(rname, rfid, all_muts, all_times, structs, rtype):
    """Scatter-plot mutation scores against structure energies for one tree.

    The tree whose mean ``'total_time_res'`` (averaged over every entry of
    that tree in ``all_times``) is largest becomes the "focus tree".  Each of
    its structures is scored with the module-level ``metric`` function and two
    figures are saved through the module-level ``figfile`` template:

    * ``'{rtype}_frac_double_{rname}'`` -- green channel, coloured by the
      mean ``'frac_silent'`` metric.
    * ``'{rtype}_frac_silent_{rname}'`` -- red channel, coloured by the mean
      ``'frac_paired'`` metric weighted by the fraction of good entries.

    The values from the second pass are then handed to
    ``get_interesting_inds`` and, depending on the module-level flags
    ``draw_single`` / ``draw_many``, to ``show_rna_structs``.

    Args:
        rname: Name inserted into the saved figure file names.
        rfid: Identifier forwarded unchanged to the selection/drawing helpers.
        all_muts: Mapping from tree key to a mapping of per-structure
            mutation data (passed to ``metric``).
        all_times: Mapping from tree key to a mapping of per-structure timing
            dicts; each must provide an iterable under ``'total_time_res'``.
        structs: Mapping with at least ``'energies'``, ``'structs'`` and
            ``'seq'``.  It is copied, so the caller's object is not modified.
        rtype: Tag inserted into the saved figure file names.

    Returns:
        ``None`` when ``all_times`` is empty, otherwise the tuple
        ``(idxs, focus_tree)`` where ``idxs`` is the result of
        ``get_interesting_inds`` for ``n=4``, ``selection_type='both'``.
    """
    if not all_times:
        return None

    # Materialise the keys so indexing works on dict views as well as lists;
    # keys() and values() iterate in the same order.
    tree_keys = list(all_times.keys())
    total_lens_res = [
        mean(list(it.chain(*[s_times['total_time_res']
                             for s_times in t_times.values()])))
        for t_times in all_times.values()
    ]
    focus_tree = tree_keys[argmax(total_lens_res)]

    muts = all_muts[focus_tree]
    times = all_times[focus_tree]
    ns = len(muts)

    # Work on a shallow copy truncated to the structures that have mutation
    # data, leaving the caller's mapping untouched.
    structs = dict(structs)
    structs['energies'] = structs['energies'][:ns]
    structs['structs'] = structs['structs'][:ns]
    energies = structs['energies']

    double_caption = (
        'Evaluated structures positioned by energy and a mutation based '
        'evolutionary score. Color indicates fractional frequency of double '
        'mutants. \n'
        'Radius indicates percentage of ungapped base pairs.')
    paired_caption = (
        'Evaluated structures positioned by energy and a mutation based '
        'evolutionary score. Color indicates a second score from paired BL. \n'
        'Radius indicates percentage of ungapped base pairs.')

    # First figure.
    # NOTE(review): the file name and caption say "double" but the colour
    # metric requested here is 'frac_silent' -- confirm this is intended.
    f = myplots.fignum(3, figsize)
    xvals, yvals, pfracs, ugfracs = _collect_mutation_scores(
        muts, times, energies, 'frac_silent', False)
    colors = _normalized_colors(pfracs, [0, 1, 0])
    _draw_score_scatter(f, xvals, yvals, ugfracs, colors, double_caption)
    f.savefig(figfile.format('{1}_frac_double_{0}'.format(rname, rtype)))
    f.clear()

    # Second figure; its values are the ones used for structure selection.
    f = myplots.fignum(3, figsize)
    xvals, yvals, pfracs, ugfracs = _collect_mutation_scores(
        muts, times, energies, 'frac_paired', True)
    colors = _normalized_colors(pfracs, [1, 0, 0])
    _draw_score_scatter(f, xvals, yvals, ugfracs, colors, paired_caption)
    f.savefig(figfile.format('{1}_frac_silent_{0}'.format(rname, rtype)))

    seq = structs['seq']
    # Arguments shared by every selection / drawing call below.
    common_args = (xvals, yvals, structs, energies, pfracs, rname, rtype,
                   ns, rfid, figsize, colors, seq)

    n, selection_type = 4, 'both'
    select_args = common_args + (n, selection_type)
    idxs = get_interesting_inds(*select_args)
    if draw_single:
        show_rna_structs(*(select_args + (idxs,)))

    if draw_many:
        for n, selection_type in [[5, 'ptime'], [5, 'energy'], [ns, 'energy']]:
            select_args = common_args + (n, selection_type)
            m_idxs = get_interesting_inds(*select_args)
            show_rna_structs(*(select_args + (m_idxs,)))

    return idxs, focus_tree


def _collect_mutation_scores(muts, times, energies, color_key, weight_color):
    """Gather per-structure plot values: energy, score, colour score, size.

    ``color_key`` selects the metric used for the colour score; when
    ``weight_color`` is true that score is multiplied by the metric's
    ``frac_good`` value.  Returns ``(xvals, yvals, pfracs, ugfracs)``.
    """
    xvals, yvals, pfracs, ugfracs = [], [], [], []
    for i, (mvals, tvals) in enumerate(zip(muts.values(), times.values())):
        xvals.append(energies[i])
        frac_ug = metric(mvals, tvals, 'frac_ug')[0]
        pfrac, _, frac_good = metric(mvals, tvals, color_key)
        sfrac = metric(mvals, tvals, 'frac_silent')[0]
        ugfracs.append(frac_ug)
        color_score = mean(pfrac)
        if weight_color:
            color_score = color_score * frac_good
        pfracs.append(color_score)
        yvals.append(mean(sfrac) * frac_good)
    return xvals, yvals, pfracs, ugfracs


def _normalized_colors(scores, rgb):
    """Rescale ``scores`` to [0, 1] and spread them over the ``rgb`` channels."""
    vals = array(scores)
    lo, hi = min(vals), max(vals)
    scaled = vals - lo
    # When every score is identical the span is zero; keep the all-zero
    # result instead of dividing 0 by 0 and producing NaN colours.
    if hi - lo != 0:
        scaled = scaled / (hi - lo)
    return scaled[:, newaxis] * rgb


def _draw_score_scatter(fig, xvals, yvals, ugfracs, colors, caption):
    """Draw one energy-vs-score scatter with labels and caption on ``fig``."""
    ax = fig.add_subplot(111)
    # Marker area scales with the 'frac_ug' metric value of each structure.
    ax.scatter(xvals, yvals, array(ugfracs) * 200, color=colors)
    ax.set_ylabel('mutation score')
    ax.set_xlabel('free energy (-kCal)')
    ax.annotate(caption, [0, 1],
                xycoords='axes fraction',
                xytext=[10, -10],
                textcoords='offset pixels',
                va='top')
    myplots.padded_limits(ax, xvals, yvals, .2)