def show_clustered_expr(tge, tfe, tgnames, tfnames, nrml=True, fig=8):
    """Plot target and TF expression series on two separate figures.

    Figure ``fig`` receives one line per entry of ``tge`` and figure
    ``fig + 1`` one line per entry of ``tfe``.  Each figure gets a colour
    legend built from the matching name list and a title; the title has
    '(Normalized)' appended when ``nrml`` is true.
    """
    target_fig = plt.figure(fig)
    tf_fig = plt.figure(fig + 1)
    target_fig.clear()
    tf_fig.clear()
    target_ax = target_fig.add_subplot(111)
    tf_ax = tf_fig.add_subplot(111)
    target_palette = colors.getct(len(tgnames))
    tf_palette = colors.getct(len(tfnames))

    suffix = '(Normalized)' if nrml else ''
    panels = (
        (target_fig, target_ax, tge, target_palette, tgnames,
         'Target Expression Levels'),
        (tf_fig, tf_ax, tfe, tf_palette, tfnames,
         'TF Expression Levels'),
    )
    for figure, axes, series, palette, names, heading in panels:
        # Index explicitly: series and palette are only assumed to support
        # len() and integer indexing.
        for idx in range(len(series)):
            axes.plot(series[idx], color=palette[idx])
        myplots.color_legend(figure, palette, names, ax=axes, pos=4)
        myplots.maketitle(axes, heading + suffix)
def run(domain_name='X', projection_name='Y8'):
    """Plot the clusters of one domain and score their overlap with tissues.

    Loads ``prob2.mat``, colours every point of ``domain_name`` by its
    cluster id, scatters the points in the first two coordinates of both
    ``domain_name`` and ``projection_name``, saves the figure and prints how
    many same-label index pairs the clustering shares with
    ``tissue_category``.

    Args:
        domain_name: one of 'X', 'XV', 'Y', 'Y8', 'Y12'.  Selects the data
            and the cluster ids (read from 'ids_<domain_name>').
        projection_name: domain whose first two columns are drawn in the
            second panel.

    Returns:
        The result of ``hypergeom(len(tcpairs), len(cpairs), max_pairs)``.

    Raises:
        ValueError: if a name is not one of the known domains (raised by
            ``list.index``).

    NOTE(review): this module defines ``run`` twice with the same logic; the
    later definition is the one in effect.
    """
    prob2 = sio.loadmat('prob2.mat')
    domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']
    domains = [prob2.get(d) for d in domain_names]
    # This lookup had been commented out although `domain_clusters` is used
    # right below, which made every call fail with NameError.
    domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
    tissue_clusters = prob2.get('tissue_category')

    clusters = domain_clusters[domain_names.index(domain_name)]
    pdom = domains[domain_names.index(projection_name)]
    cdom = domains[domain_names.index(domain_name)]

    f = plt.figure(1)
    f.clear()
    random.seed(1)
    ct = array(mc.getct(218))

    # The original read shape(dom) with `dom` undefined; the clustered
    # domain is the only candidate in scope.
    # NOTE(review): inds indexes rows below but is sized from axis 1 --
    # confirm the intended axis against the layout of prob2.mat.
    inds = arange(shape(cdom)[1])
    c_inds = array(clusters).flatten() - 1      # 1-based ids -> 0-based
    tc_inds = tissue_clusters.flatten() - 1
    point_colors = ct[c_inds, :]

    ax = f.add_subplot(
        2, 1, 1,
        title='Clusters from genespace affinity. Projection to first two elements')
    ax.scatter(*cdom[inds, 0:2].T, s=100, c=point_colors)
    ax = f.add_subplot(
        2, 1, 2,
        title='Clusters from genespace affinity. Projection to MVE')
    ax.scatter(*pdom[inds, 0:2].T, s=100, c=point_colors)

    def same_label_pairs(labels):
        # 'ixj' keys for every index pair i < j carrying the same label.
        return set(
            '{0:d}x{1:d}'.format(ix, iy)
            for ix, x in enumerate(labels)
            for iy, y in enumerate(labels)
            if ix < iy and x == y
        )

    cpairs = same_label_pairs(c_inds)
    tcpairs = same_label_pairs(tc_inds)

    f.savefig('figs/cluster_projectsions.tiff', format='tiff')

    n_points = len(tc_inds)
    max_pairs = (n_points * n_points - n_points) // 2
    total_pairs = len(cpairs.union(tcpairs))
    shared_pairs = len(cpairs.intersection(tcpairs))

    print('using affinity propagation with affinites over domain {0}'.format(
        domain_name))
    print('found')
    print(' max pairs: {0}'.format(max_pairs))
    print(' total pairs: {0}'.format(total_pairs))
    print(' tissue pairs: {0}'.format(len(tcpairs)))
    print(' cluster pairs: {0}'.format(len(cpairs)))
    print(' shared pairs: {0}'.format(shared_pairs))

    hg = hypergeom(len(tcpairs), len(cpairs), max_pairs)
    return hg
def plot_clusters(inds, embeddings, plot3d = False, title = '',
                  ax_in =None, save = False, colors = None):
    """Scatter clustered points in each of several embeddings.

    Args:
        inds: for every point, the index of its cluster exemplar.  Used both
            as the cluster label and as a row index into each embedding.
        embeddings: dict mapping a name to a 2-D array with one row per
            point; the first three columns are used.
        plot3d: draw a 3-D scatter (columns 0-2) instead of 2-D (columns 0-1).
        title: text used in each panel title and in the saved file name.
        ax_in: axes to draw on.  When None the current figure is cleared and
            one stacked subplot per embedding is created.
        save: when true, save the figure as a .ps file named after ``title``.
        colors: dict mapping exemplar -> colour.  When None a colour table
            from ``mycolors.getct`` is assigned to the exemplars.
    """
    exemplars = list(set(inds))
    if colors is None:
        cluster_colors = dict(zip(exemplars, mycolors.getct(len(exemplars))))
    else:
        cluster_colors = colors
    cols = [cluster_colors[e] for e in inds]

    if ax_in is None:
        # The original guard tested an undefined name `ax`, so the resulting
        # NameError was swallowed and the figure was never cleared.  The
        # clear stays best-effort as before.
        try:
            plt.clf()
        except Exception:
            pass
        f = plt.gcf()
    else:
        # Needed for savefig below; previously `f` was unbound on this path.
        f = ax_in.figure

    n_panels = len(embeddings)
    for i, k in enumerate(embeddings.keys()):
        embedding = embeddings[k]
        emb_sig = embedding[:, 0:3]

        # Variance of each cluster, then broadcast back to the points.
        cluster_vars = [var(emb_sig[nonzero(equal(inds, j))[0]])
                        for j in exemplars]
        var_of = dict(zip(exemplars, cluster_vars))
        indexed_vars = array([var_of[j] for j in inds])
        indexed_vars[equal(indexed_vars, 0)] = 1   # avoid division by zero

        # Marker size decays with squared distance to the point's exemplar,
        # scaled by the variance of its cluster.
        sizes = 10 * (exp(-1 * (np.sum((emb_sig - emb_sig[inds, :]) ** 2, 1)
                                / indexed_vars)))

        if plot3d:
            if ax_in is None:
                ax = f.add_subplot(n_panels, 1, i + 1, projection = '3d')
            else:
                ax = ax_in
            ax.scatter(array(embedding[:, 0], float),
                       array(embedding[:, 1], float),
                       array(embedding[:, 2], float),
                       s = sizes, color = cols)
            ax.set_xticks([])
            ax.set_yticks([])
            # re-create what autofmt_xdate does, but on the 3-D axis objects
            for tl in list(it.chain(ax.w_xaxis.get_ticklabels(),
                                    ax.w_yaxis.get_ticklabels(),
                                    ax.w_zaxis.get_ticklabels())):
                tl.set_visible(False)
                tl.set_rotation(30)
        else:
            if ax_in is None:
                ax = f.add_subplot(n_panels, 1, i + 1)
            else:
                ax = ax_in
            ax.scatter(array(embedding[:, 0], float),
                       array(embedding[:, 1], float),
                       s = sizes, color = cols)
        ax.set_title('{0} for subopts in {1}'.format(k, title))

    if save:
        f.savefig(cfg.dataPath('cs874/figs/subopt_embeddings/{0}.ps').format(title))
def get_reinitz_data(**kwargs):
    """Return per-dataset coordinates and values for one block of nuclei.

    Keyword args read here:
        ofs: which consecutive block of ``nuc_count`` rows to take (default 0).
        plot_coords: save a scatter of column 1 vs column 2 per dataset.
        plot_vals: save a scatter of column 1 vs the last column per dataset.
    The keyword dict is also passed through ``mem.rc`` to ``datafiles``.

    Returns:
        (coords, values): two dicts keyed like the ``datafiles`` result;
        ``coords[k]`` holds columns 1-2 and ``values[k]`` the last column of
        the selected rows.
    """
    ofs = kwargs.get('ofs', 0)
    want_coord_plot = kwargs.get('plot_coords', False)
    want_value_plot = kwargs.get('plot_vals', False)
    idm = id_map()
    df = datafiles(**mem.rc(kwargs))

    # Each nucleus carries a run of increasing numbers in column 0; the
    # number of distinct ones in a single dataset fixes the block size.
    nums = {}
    for name, table in df.iteritems():
        nums[name] = table[:, 0]
    nuc_count = len(set(nums.values()[2]))

    first_row = nuc_count * ofs
    end_row = nuc_count * (ofs + 1)
    values = {}
    coords = {}
    for name, table in df.iteritems():
        values[name] = table[first_row:end_row, -1]
        coords[name] = table[first_row:end_row, 1:3]

    def scatter_and_save(y_of, name_template):
        # One translucent scatter per dataset, x = first coordinate.
        figure = myplots.fignum(1, (8, 8))
        axes = figure.add_subplot(111)
        palette = mycolors.getct(len(values))
        for pos, key in enumerate(values.keys()):
            axes.scatter(coords[key][:, 0][::1], y_of(key)[::1], 10,
                         edgecolor='none', alpha=.25, c=palette[pos],
                         label=key)
        figure.savefig(myplots.figpath(name_template.format(ofs)))

    # Consistency checks of the data; only the selected block is shown.
    if want_coord_plot:
        scatter_and_save(lambda key: coords[key][:, 1],
                         'reinitz_exprdata_coords_nuc_offset={0}')
    if want_value_plot:
        scatter_and_save(lambda key: values[key],
                         'reinitz_exprdata_ap_vals_nuc_offset={0}')

    return coords, values
def run( domain_name = 'X', projection_name = 'Y8' ):
    """Plot the clusters of one domain and score their overlap with tissues.

    Loads ``prob2.mat``, colours every point of ``domain_name`` by its
    cluster id, scatters the points in the first two coordinates of both
    ``domain_name`` and ``projection_name``, saves the figure and prints how
    many same-label index pairs the clustering shares with
    ``tissue_category``.

    Args:
        domain_name: one of 'X', 'XV', 'Y', 'Y8', 'Y12'.  Selects the data
            and the cluster ids (read from 'ids_<domain_name>').
        projection_name: domain whose first two columns are drawn in the
            second panel.

    Returns:
        The result of ``hypergeom(len(tcpairs), len(cpairs), max_pairs)``.

    Raises:
        ValueError: if a name is not one of the known domains (raised by
            ``list.index``).

    NOTE(review): this module defines ``run`` twice with the same logic;
    this later definition is the one in effect.
    """
    prob2 = sio.loadmat('prob2.mat')
    domain_names = ['X', 'XV', 'Y', 'Y8', 'Y12']
    domains = [prob2.get(d) for d in domain_names]
    # This lookup had been commented out although `domain_clusters` is used
    # right below, which made every call fail with NameError.
    domain_clusters = [prob2.get('ids_' + d) for d in domain_names]
    tissue_clusters = prob2.get('tissue_category')

    clusters = domain_clusters[domain_names.index(domain_name)]
    pdom = domains[domain_names.index(projection_name)]
    cdom = domains[domain_names.index(domain_name)]

    f = plt.figure(1)
    f.clear()
    random.seed(1)
    ct = array(mc.getct(218))

    # The original read shape(dom) with `dom` undefined; the clustered
    # domain is the only candidate in scope.
    # NOTE(review): inds indexes rows below but is sized from axis 1 --
    # confirm the intended axis against the layout of prob2.mat.
    inds = arange(shape(cdom)[1])
    c_inds = array(clusters).flatten() - 1      # 1-based ids -> 0-based
    tc_inds = tissue_clusters.flatten() - 1
    point_colors = ct[c_inds, :]

    ax = f.add_subplot(
        2, 1, 1,
        title='Clusters from genespace affinity. Projection to first two elements')
    ax.scatter(*cdom[inds, 0:2].T, s=100, c=point_colors)
    ax = f.add_subplot(
        2, 1, 2,
        title='Clusters from genespace affinity. Projection to MVE')
    ax.scatter(*pdom[inds, 0:2].T, s=100, c=point_colors)

    def same_label_pairs(labels):
        # 'ixj' keys for every index pair i < j carrying the same label.
        return set(
            '{0:d}x{1:d}'.format(ix, iy)
            for ix, x in enumerate(labels)
            for iy, y in enumerate(labels)
            if ix < iy and x == y
        )

    cpairs = same_label_pairs(c_inds)
    tcpairs = same_label_pairs(tc_inds)

    f.savefig('figs/cluster_projectsions.tiff', format='tiff')

    n_points = len(tc_inds)
    max_pairs = (n_points * n_points - n_points) // 2
    total_pairs = len(cpairs.union(tcpairs))
    shared_pairs = len(cpairs.intersection(tcpairs))

    print('using affinity propagation with affinites over domain {0}'.format(
        domain_name))
    print('found')
    print(' max pairs: {0}'.format(max_pairs))
    print(' total pairs: {0}'.format(total_pairs))
    print(' tissue pairs: {0}'.format(len(tcpairs)))
    print(' cluster pairs: {0}'.format(len(cpairs)))
    print(' shared pairs: {0}'.format(shared_pairs))

    hg = hypergeom(len(tcpairs), len(cpairs), max_pairs)
    return hg
def family_exemplar_structs(rfid, refseq_method = None, sp_method = None, aff_type = None):
    """Debug scratch: cluster suboptimal structures and draw them in PCA space.

    As written the function cannot complete: a bare ``raise Exception()``
    follows the first ``rna_draw`` call, so everything after it is
    unreachable.  ``refseq_method`` and ``sp_method`` are never read.

    NOTE(review): ``spairs`` and ``ungapped_ref`` are not assigned anywhere
    in this function, and ``sp`` is only assigned inside the loop further
    down, after its first use here -- confirm where they were meant to come
    from (``suboptimals`` is fetched but never used).
    """
    suboptimals = rutils.family_suboptimals(rfid)
    c2 = rutils.cluster_2(spairs, ungapped_ref)
    arr = rutils.rna_draw(ungapped_ref.seq, rutils.pairs_stk(sp,len(ungapped_ref)), 'name' )
    # Hard stop left in by the author; nothing below this line runs.
    raise Exception()

    # Two affinity matrices: one of the requested type, one 'easy' (shape).
    affinities, ss = rutils.affinity_matrix(spairs, aff_type = aff_type)
    aff_shape, ss_shape = rutils.affinity_matrix(spairs, aff_type = 'easy', ss_multiplier = .5)
    pca_vecs = mlab.PCA(affinities).project(affinities)
    pca_vecs_shape = mlab.PCA(aff_shape).project(aff_shape)

    # Clusters come from the shape affinities; inds[i] is the exemplar of i.
    inds = compute_clusters(aff_shape, ss_shape)
    exemplars = list(set(inds))

    import compbio.utils.colors as mycolors
    ct = mycolors.getct(len(exemplars))

    import matplotlib.pyplot as plt
    f = plt.gcf()
    plt.clf()

    # One stacked panel per embedding.
    for idx0, embeddings in enumerate([pca_vecs, pca_vecs_shape]):
        ax = f.add_subplot('21{0}'.format(idx0 +1))
        lims =[ [min(embeddings[:,0]),max(embeddings[:,0])], [min(embeddings[:,1]),max(embeddings[:,1])] ]
        lims += [-.5,.5] *squeeze(diff(lims,1))[:,newaxis]
        ax.set_xlim(lims[0])
        ax.set_ylim(lims[1])
        print sum(embeddings)
        for idx, embedding in enumerate(embeddings):
            # mod(idx, 1) is always 0, so no point is skipped here.
            if mod(idx,1) != 0: continue
            sp = spairs[idx]
            arr = rutils.rna_draw(ungapped_ref.seq, rutils.pairs_stk(sp,len(ungapped_ref)), 'name' )
            struct_emb = arr + embedding[0:2]
            #plt.plot(*struct_emb.T)
            # Exemplars (indices that appear in inds) are drawn thick and
            # opaque, all other structures thin and faint.
            pkw = {'color':ct[exemplars.index(inds[idx])], 'lw':8 if idx in inds else 1, 'alpha': 1 if idx in inds else .2}
            lc = rplots.show_rna(embedding, arr, pkw = pkw)

    #exemplar_structs = [spairs[e] for e in set(inds)]
    raise Exception()
    # Unreachable; `exemplar_structs` is only defined in the comment above.
    return pca_vecs, exemplar_structs
def cluster_2_show(clusters, polys):
    """Draw ``polys`` on a grid, ordered and coloured by cluster label.

    ``clusters`` and ``polys`` are both indexed with the ``argsort`` of
    ``clusters``, so they must support array-style fancy indexing.
    """
    order = argsort(clusters)
    labels = set(clusters)
    palette = mycolors.getct(len(labels))

    # One colour per distinct label, in set iteration order.
    color_of = {}
    for slot, label in enumerate(labels):
        color_of[label] = palette[slot]

    myplots.fignum(8, (10, 10))
    ordered_colors = [color_of[label] for label in clusters[order]]
    rplots.grid_rnas(polys[order],
                     colors=ordered_colors,
                     size=(5, 5),
                     dims=[180, 50])
def p_m_correlation():
    """Scatter protein vs mRNA levels for every gene present in both sets.

    One panel per gene, laid out on a square grid with shared axes.  Each
    panel shows one translucent scatter per column of the 'vals' arrays,
    is annotated with the gene symbol and the correlation coefficient of the
    flattened protein and mRNA values, and the figure is saved as a TIFF.
    """
    prots = nio.getBDTNP(protein = True)
    mrnas = nio.getBDTNP()
    matched = set(mrnas.keys()).intersection(set(prots.keys()))
    pairs = [(prots[k], mrnas[k], k) for k in matched]

    f = plt.figure(0)
    f.clear()
    f.suptitle('mRNA and Protein Levels from BDTNP at six times in ~6000 cells',
               fontsize = 22)

    # Square grid.  The (rows, cols, index) form is used because a
    # concatenated digit string such as '4410' is not a valid subplot spec
    # once there are ten or more panels.
    n_side = int(ceil(sqrt(len(pairs))))
    shp = shape(list(mrnas.values())[0]['vals'])
    colors = mycolors.getct(shp[1])

    shr = None
    for i, (prot, mrna, fbid) in enumerate(pairs):
        ax = f.add_subplot(n_side, n_side, i + 1, sharex = shr, sharey = shr)
        if shr is None:
            # Every later panel shares its axes with the first one.
            shr = ax

        ax.grid(True, alpha = .2)
        ax.annotate(nu.gene_symbol(fbid), xy = [.02, .98],
                    xycoords = 'axes fraction', size = 25, va = 'top')

        mu = corrcoef(prot['vals'][::, :].flatten(),
                      mrna['vals'][::, :].flatten())
        ax.annotate(r'$\mu = {0:.2g}$'.format(mu[0, 1]), xy = [.98, .98],
                    xycoords = 'axes fraction', size = 25,
                    ha = 'right', va = 'top')

        # Only the first column keeps y tick labels ...
        if i % n_side > 0:
            plt.setp(ax.get_yticklabels(), visible = False)
        else:
            ax.set_ylabel('mrna expression level')
        # ... and only the last grid row keeps x tick labels.
        if i // n_side < n_side - 1:
            plt.setp(ax.get_xticklabels(), visible = False)
        else:
            ax.set_xlabel('protein expression level')

        for j in range(shp[1]):
            ax.scatter(prot['vals'][::, j], mrna['vals'][::, j],
                       s = 20, alpha = .2, color = colors[j])

    f.savefig(cfg.dataPath('figs/network/mrna_protein_levels.tiff'),
              format = 'tiff')
def expr_getonoff(expr_in):
    """Split expression values into three k-means groups and plot them.

    The values are clustered with ``mlpy.Kmeans(3)`` on (value, 0) points.
    Figure 1 then shows a scatter coloured by group plus, at x = 0, one
    vertical bar per group spanning mean +/- one standard deviation.
    Nothing is returned.
    """
    values = array(expr_in)
    spread = std(values)  # only the disabled log transform used this
    #values = log(values+spread)

    n_groups = 3
    kmeans = mlpy.Kmeans(n_groups)
    count = len(values)

    # Kmeans gets 2-D points, so pad every value with a zero coordinate.
    points = array([array([values[pos], 0]) for pos in range(count)])
    assignment = kmeans.compute(points)

    # Order of the groups by the first coordinate of their fitted centres.
    first_coords = [centre[0] for centre in kmeans.means]
    group_order = arange(n_groups)[argsort(first_coords)]

    x_positions = argsort(values)

    group_means = zeros(n_groups)
    group_stds = zeros(n_groups)
    for label in range(n_groups):
        members = array(values)[nonzero(equal(assignment, label))[0]]
        group_means[label] = mean(members)
        group_stds[label] = std(members)

    figure = plt.figure(1)
    figure.clear()
    axes = figure.add_axes([0, 0, 1, 1])
    palette = mycolors.getct(n_groups)

    point_colors = [palette[group_order[assignment[pos]]]
                    for pos in range(count)]
    point_sizes = [100] * count
    axes.scatter(x_positions, values, point_sizes, color = point_colors)

    for label in range(n_groups):
        low = group_means[label] - group_stds[label]
        high = group_means[label] + group_stds[label]
        axes.plot([0, 0], [low, high],
                  linewidth = 5, color = palette[group_order[label]])
def show_binary(idx = 0):
    """Scatter TF time series against a target's series, coloured by on/off.

    Walks the keys of the binarised time-series dict starting at position
    ``idx``.  For a key that has regulators in the network, each regulator
    with a stored series is scattered against the target series: red where
    its binary call equals 0, green where it equals 1, with the second
    element of the regulator entry used as alpha.

    NOTE(review): the source indentation was lost.  The trailing ``break``
    is placed at the level of the outer loop here (stop after the first key
    that has regulators); confirm against the original layout.
    """
    tsb = nu.expr_TS_binary(reset = 0)
    tsvals = nu.load_TS()
    net = nu2.get_net()
    tgs = net[1]
    tfs = net[0]

    f = plt.figure(0)
    f.clear()
    ax = f.add_subplot(111)

    for k in tsb.keys()[idx:]:
        my_tfs = tgs.get(k,[])
        ct = mycolors.getct(len(my_tfs))
        tgseries = tsvals[k]
        # Keys without regulators are skipped (and do not trigger the break).
        if not my_tfs: continue
        for i in range(len(my_tfs)):
            tf = my_tfs[i][0]
            series = tsvals.get(tf)
            if not series: continue
            binary = tsb.get(tf)
            #if not binary:
            #    print 'no ts for ' + tg
            #    continue
            npts = len(binary)
            xax = tgseries
            # One RGB row per time point: red for 0, green for 1.
            cmap = equal(binary,0)[:,newaxis]*[1,0,0] + equal(binary,1)[:,newaxis]*[0,1,0]
            print my_tfs[i][1]
            ax.scatter(xax, series, 500, color = cmap, alpha = my_tfs[i][1], edgecolor = '0')
        break
    return
def tree_similarity(dist1, dist2, run_id, criterion = 'knn', k = 6):
    """Compare two distance matrices through their k nearest neighbours.

    Only ``criterion == 'knn'`` does anything; any other value returns None
    without plotting.  For every row the k nearest neighbours under each
    matrix are pooled, the k pooled neighbours with the smallest distance
    under either matrix are kept, and figure 4 shows the absolute distance
    differences (left) and a scatter of dist1 vs dist2 neighbour distances
    (right), annotated with R-squared and the mean Jaccard index of the two
    neighbour sets.  The figure is saved under a name built from ``run_id``
    and ``k``.
    """
    if criterion != 'knn':
        return

    nq = len(dist1)
    # Column 0 of each argsort is skipped (the point itself at distance 0,
    # presumably -- confirm the diagonal is zero).
    nb1 = argsort(dist1, 1)[:, 1:k + 1]
    nb2 = argsort(dist2, 1)[:, 1:k + 1]

    all_nbs = []
    nb_intersection = []
    for n1, n2 in zip(nb1, nb2):
        first, second = set(n1), set(n2)
        all_nbs.append(first.union(second))
        nb_intersection.append(first.intersection(second))

    # For each row: (dist1, dist2) to every pooled neighbour.
    nb_dists = []
    for i, nbs in enumerate(all_nbs):
        nb_dists.append(array([[dist1[i, n], dist2[i, n]] for n in nbs]))

    #take the first k distances.
    n_disagreements = [len(nbd) - k for nbd in nb_dists]
    nb_dists = array([sorted(nbd, key = min)[:k] for nbd in nb_dists])

    frac_diffs = [abs(diff(elt, 1).flatten()) / mean(elt, 1)
                  for elt in nb_dists]
    abs_diffs = [abs(diff(elt, 1).flatten()) for elt in nb_dists]

    ct = mycolors.getct(nq)
    f = myplots.fignum(4, (10, 8))
    ax = f.add_axes([.05, .08, .25, .87])
    seismic.seismic(abs_diffs, ax = ax, colors = ct)

    overlap = [float(len(nb_intersection[i])) / float(len(all_nbs[i]))
               for i in range(nq)]
    jaccard = mean(overlap)

    ax2 = f.add_axes([.34, .08, .6, .87])
    for i, d in enumerate(nb_dists):
        ax2.scatter(d[:, 0], d[:, 1], 20, alpha = .5, color = ct[i])

    lin = linregress(nb_dists[:, :, 0].flatten(), nb_dists[:, :, 1].flatten())
    rsquared = lin[2] ** 2

    ax2.annotate('NN dists for multi/struct-aligned trees.\nK = {0}'.format(k),
                 [0, 1], xycoords = 'axes fraction', va = 'top',
                 xytext = [10, -10], textcoords = 'offset pixels')
    ax2.annotate('R-Squared: {0:3.3}\nJaccard Index: {1:3.3}'.format(rsquared, mean(jaccard)),
                 [1, 0], xycoords = 'axes fraction', ha = 'right',
                 xytext = [-10, 10], textcoords = 'offset pixels')
    ax2.set_xlabel('Muscle aligned tree distances')
    ax2.set_ylabel('Struct algined tree distances')

    datafile = cfg.dataPath('figs/gpm2/pt2_mus_cm_tree_dists_{0}_k{1}.tiff'.format(run_id, k))
    f.savefig(datafile)
def draw2d(self):
    """Scatter the data points (coloured by label) and the means (black).

    Uses coordinates 0 and 1 of every data point and mean, on a cleared
    figure ``self.fig``.  Data points get size 25, means size 100.
    """
    figure = plt.figure(self.fig)
    figure.clear()
    axes = figure.add_axes([.05, .05, .9, .9])
    col_x, col_y = 0, 1
    palette = mycolors.getct(self.nm)

    point_ids = range(self.nd)
    mean_ids = range(self.nm)

    # Data points first, then the cluster means appended to the same lists.
    xs = [self.data[i][col_x] for i in point_ids]
    ys = [self.data[i][col_y] for i in point_ids]
    rs = [25 for _ in point_ids]
    cs = [palette[self.labels[i]] for i in point_ids]

    xs += [self.means[i][col_x] for i in mean_ids]
    ys += [self.means[i][col_y] for i in mean_ids]
    rs += [100 for _ in mean_ids]
    cs += [[0, 0, 0] for _ in mean_ids]

    axes.scatter(xs, ys, rs, cs)
def show_multi(timepoint = -1):
    """Show the ten most variable mRNAs together on one 3-D axes.

    Variability is the standard deviation of each mRNA's 'vals' column at
    ``timepoint``.  Each selected mRNA is drawn with ``show_3d`` at the
    column where its summed expression peaks, with a random offset and its
    own colour.
    """
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)
    names = list(mrnas.keys())
    records = list(mrnas.values())
    shp = shape(records[0]['vals'])

    #choose to look only at one timepoint
    spreads = [std(rec['vals'][:, timepoint]) for rec in records]

    figure = plt.figure(0)
    try:
        figure.clear()
    except Exception as e:
        print('hi')
    axes = figure.add_subplot(111, projection = '3d')

    by_spread = argsort(spreads)[::-1]   # most variable first
    n_shown = 10
    palette = mycolors.getct(n_shown)
    for rank in arange(n_shown):
        which = by_spread[rank]
        peak_step = argmax(np.sum(records[which]['vals'], 0))
        show_3d(names[which],
                step = peak_step,
                skip = 20,
                ax = axes,
                ofs = 10 * random.rand(3),
                color = palette[rank])
def draw_xy(xset, yset):
    """Scatter every feature of ``xset`` (scaled by its maximum) against ``yset``.

    ``xset`` is indexed as xset[feature][t]; ``yset`` as yset[t].  Each
    feature gets its own colour.  Draws on a cleared figure 1 and shows it.
    """
    n_features = shape(xset)[0]
    n_times = shape(xset)[1]
    palette = colors.getct(n_features)

    figure = plt.figure(1)
    figure.clear()
    axes = figure.add_axes([0, 0, 1, 1])

    xs, ys, rs, cs = [], [], [], []
    time_ids = range(n_times)
    for row in range(n_features):
        feature = xset[row]
        peak = max(feature)
        xs.extend(feature[t] / peak for t in time_ids)
        ys.extend(yset[t] for t in time_ids)
        rs.extend([20] * n_times)
        cs.extend([palette[row]] * n_times)

    axes.scatter(xs, ys, rs, cs)
    figure.show()
def makePlots(self, name="No Name"):
    """Plot test predictions on figure 1 and training predictions on figure 2.

    Each (actual, predicted) pair is drawn with ``lplots.plotPredictions``
    on the upper subplot of its figure; colours come from a table sized by
    the number of training targets.  ``name`` is used as the title of both
    panels.
    """
    xtrain, ytrain = self.xyTrain()
    xtest, ytest = self.xyTest()
    ytrain_predicted = self.predictTraining()
    ytest_predicted = self.predictTest()
    n_train = len(ytrain)

    test_fig = plt.figure(1)
    test_fig.clear()
    test_ax = test_fig.add_subplot("211")

    train_fig = plt.figure(2)
    train_fig.clear()
    train_ax = train_fig.add_subplot("211")

    palette = mycolors.getct(n_train)

    def render(actual, predicted, axes, subtitle):
        # One call per series, then the panel title.
        for row in range(len(actual)):
            lplots.plotPredictions(actual[row], predicted[row], axes,
                                   color=palette[row])
        myplots.maketitle(axes, name, subtitle=subtitle)

    render(ytest, ytest_predicted, test_ax, "test predictions")
    render(ytrain, ytrain_predicted, train_ax, "training predictions")
def view3():
    """Debug scratch: scatter model terms of the first usable MCMC result.

    Lists the files in the 'batch/tmp' data directory whose name contains
    'mcmc', loads the matching input records, keeps those whose
    'out_iter_num' is greater than 2, and for the first one builds a scatter
    of model factors ordered by a hierarchical clustering of the models.

    The function always ends by raising a bare ``Exception``: once at the
    end of the first loop iteration, or after the loop if nothing was
    processed.

    NOTE(review): the source indentation was lost; the nesting below is a
    reconstruction -- confirm against the original layout.
    """
    files = [l for l in os.listdir(cfg.dataPath("batch/tmp")) if "mcmc" in l]
    fpaths = [os.path.join(cfg.dataPath("batch/tmp"), f) for f in files]
    # The first ten characters of the file name are used as the record id.
    ids = [l[0:10] for l in files]
    inps = [butils.load_data(i, "input") for i in ids]
    idxs_good = nonzero(greater([elt.get("out_iter_num") for elt in inps], 2))[0]
    inps = [inps[i] for i in idxs_good]
    fpaths = [fpaths[i] for i in idxs_good]

    fig = myplots.fignum(3, (35, 15))
    ax = fig.add_axes([0, 0, 1, 1])

    for f, inp in zip(fpaths, inps):
        # Cannot trigger: records with out_iter_num <= 2 were filtered above.
        if inp["out_iter_num"] == 2:
            continue
        print inp["filename"]
        data = sio.loadmat(f)

        import compbio.utils.colors as mycolors

        ct = mycolors.getct(len(data["gene_names"]))
        # Flatten one nesting level per model, then one more per term.
        term_list = [list(it.chain(*mod)) for mod in data["model"]]
        fac_list = [list(it.chain(*t)) for t in term_list]

        xvals, yvals, colors, rads = [], [], [], []
        for i, terms in enumerate(term_list):
            for j, term in enumerate(terms):
                for k, fact in enumerate(term):
                    xvals.extend([i] * len(term))
                    yvals.extend([fact] * len(term))
                    colors.extend([ct[c] for c in sorted(term)])
                    # Decreasing radii, largest marker first.
                    rads.extend(((arange(1, len(term) + 1) ** 2) * 50)[::-1])

        # Binary model-by-factor membership matrix.
        vecs = zeros((len(fac_list), len(fac_list)))
        for i, fl in enumerate(fac_list):
            # NOTE: this rebinds the outer loop variable `f`.
            for f in fl:
                vecs[i, f] = 1

        # plt.imshow(vecs)
        # ax1 = fig.add_subplot(121)
        # ax2 = fig.add_subplot(122)

        import hcluster

        clusters = hcluster.fclusterdata(vecs, 1.1, criterion="inconsistent", method="complete")

        # ax1.imshow(vecs)
        # ax2.imshow(vecs[argsort(clusters)])
        # raise Exception()

        # Rank of each model when the models are ordered by cluster id.
        csrt = argsort(argsort(clusters))
        xvals2 = [csrt[x] for x in xvals]
        # raise Exception()
        plt.scatter(xvals2, yvals, rads, color=colors)
        raise Exception()
    raise Exception()
def viewmany(all_means, all_clusters, fig = 12):
    """Project cluster means of several clusterings onto TFs and plot them.

    For each clustering the absolute projection of every cluster mean onto
    each TF is stored, the ``ntf_each`` TFs with the largest normalised
    summed weight are marked as "of interest", and one curve per clustering
    is drawn over the union of the marked TFs.

    Args:
        all_means: sequence of 2-D arrays, one per clustering, with one row
            per cluster mean.  Every clustering must have the same number of
            rows.  The column count selects the projection: more than 8000
            columns is treated as gene space and mapped through the
            transposed square affinity matrix; more than 500 as TF space,
            read off directly at the TF indices.
        all_clusters: not used.
        fig: matplotlib figure number to draw on.

    Returns:
        Array of shape (n_clusterings, k, n_tfs) with the projections.

    Raises:
        ValueError: if a clustering has 500 or fewer columns; no projection
            is defined for that case.
    """
    n = len(all_means)
    f = plt.figure(fig)
    f.clear()
    print('Running viewmany.py For now, viewmany assumes that k is equal '
          'across clustering instances this is not really important but has '
          'to do with how TF projections are stored. ')

    #1 k.
    k = len(all_means[0])

    ax1 = f.add_axes([.05, .05, .95, .4])
    ax2 = f.add_axes([.05, .55, .95, .4])

    sqa = nu.net_square_affinity()[0]
    aff = nu.net_affinity()[0]

    #tf_sqidxs should have length = ntf
    #with each element giving the coordinate of the
    #i'th tf in sqa space.
    sqidxs = nu.net_sq_keyidxs()
    n_tfidxs = nu.net_tf_keyidxs()
    trgs, tfs = nu.parse_net()
    tf_sqidxs = [sqidxs[key] for key in tfs.keys()]
    tfidxs = n_tfidxs.values()
    ntf = len(tfidxs)

    tfweights = zeros(ntf, int)
    #number of tfs each clustering may mark as interesting
    ntf_each = 20

    print('...Computing representative TFs for each clustering. In the '
          'current formulation, we project each mean on to associated tf '
          'and then normalize each projection so that each mean has equal '
          'weight in TF selection. Not that we have handled the case where '
          'we have clusted in TF space explicitly (e.g, dim = 541) and '
          'where we are in gene space explicitly, (e.g., dim = 8321, GG '
          'matrix or svdU). svdV is emphatically not handled. Neither would '
          'svdU of TF-TF which is actually the the exact same thing.')

    TFprojs = zeros((n, k, ntf))
    for i in range(n):
        m = all_means[i]
        dim = shape(m)[1]

        # The larger threshold is tested first: with the original order
        # (`dim > 500` before `dim > 8000`) the gene-space branch could
        # never be reached.
        if dim > 8000:
            #gene space (GG or SVD_U): undo the mapping with sqa.T.
            #remember, ROWS of the matrix correspond to the target space.
            a = sqa.T[tf_sqidxs, :]
            this_tf_sum = np.abs(np.sum(a[newaxis, :, :] * m[:, newaxis, :], 2))
        elif dim > 500:
            #TF space (dim = 541): just read off the tf columns.
            this_tf_sum = np.abs(m[:, tfidxs])
        else:
            raise ValueError(
                'viewmany cannot project means with {0} columns'.format(dim))

        TFprojs[i, :, :] = this_tf_sum
        #normalize so that each mean has the same weight
        this_tf_sum = this_tf_sum / np.sum(this_tf_sum, 1)[:, newaxis]
        #sum over cluster means to find the most important tfs
        this_tf_sum = np.sum(this_tf_sum, 0)

        best = argsort(this_tf_sum)[::-1]
        tfweights[best[0:ntf_each]] = 1

    print('Finished computing representative TFs ')

    tfs_of_interest = nonzero(tfweights)[0]
    n_selected = len(tfs_of_interest)
    avg_shared = 1. - float(n_selected) / (n * ntf_each)
    # The original message had the `str(ntf_each)` concatenation inside the
    # string literal, so the code itself was printed instead of the number.
    print('Allowing for each cluster to choose {0} tfs, we got {1} tfs of '
          'interest. or a mean sharing ratio of {2}.'.format(
              ntf_each, n_selected, round(avg_shared, 3)))

    #get a color table for clusterings.
    ct = mycolors.getct(n)
    for i in range(n):
        xax = linspace(0, 1, n_selected)
        # Indexing with (int, slice, index array) puts the selected-TF axis
        # first, so axis 1 runs over the cluster means.
        selected = TFprojs[i, :, tfs_of_interest]
        ax1.plot(xax, np.sum(selected, 1) / np.max(selected, 1), color = ct[i])

    return TFprojs
def draw_cm_muscle_congruencies(seqs, profiles, run_id, reset = True):
    """Compare trees built from MUSCLE and structural alignments.

    Alignments of both kinds are fetched (or computed) through
    ``mem.getOrSet``.  For each pair of alignments a BioNJ tree is built
    from each, pairwise leaf distances are collected, ``tree_similarity`` is
    run twice (with every other leaf as neighbour, and with k = 6), and both
    trees are drawn as graph embeddings and saved as a .ps file.

    Args:
        seqs, profiles: passed through to ``setAlignments``.
        run_id: identifier used in cache registers and output file names.
        reset: passed to ``mem.getOrSet``.
    """
    print('computing alignments...')
    print(' ...using muscle')
    malis, mrefs, mpairs = \
        mem.getOrSet(setAlignments,
                     **mem.rc({}, seqs = seqs, profiles = profiles,
                              run_id = run_id, ali_type = 'muscle',
                              reset = reset, on_fail = 'compute',
                              register = 'tuali_musc_{0}'.format(run_id)))
    print(' ...using cmalign.')
    salis, srefs, spairs = \
        mem.getOrSet(setAlignments,
                     **mem.rc({}, seqs = seqs, profiles = profiles,
                              run_id = run_id, ali_type = 'struct',
                              reset = reset, on_fail = 'compute',
                              register = 'tuali__struct_{0}'.format(run_id)))

    print(' ...making trees.')
    for idx, alis in enumerate(zip(malis, salis)):
        m, s = alis
        mtree = phyml.tree(m, run_id, bionj = True)
        stree = phyml.tree(s, run_id, bionj = True)

        # Row/column of each sequence id in the distance matrices.
        maps = dict([(elt.id, i) for i, elt in enumerate(m)])
        mdists = zeros((len(maps), len(maps)))
        sdists = zeros((len(maps), len(maps)))
        for n1 in mtree.get_terminals():
            for n2 in mtree.get_terminals():
                mdists[maps[n1.name], maps[n2.name]] = \
                    mtree.distance(n1, n2)
        for n1 in stree.get_terminals():
            for n2 in stree.get_terminals():
                sdists[maps[n1.name], maps[n2.name]] = \
                    stree.distance(n1, n2)

        # Every other leaf as neighbour.  The original wrote
        # len(sdists - 1), which subtracts 1 from the matrix and therefore
        # evaluates to len(sdists).
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k = len(sdists) - 1)
        tree_similarity(sdists, mdists,
                        '{0}_struct_{1}'.format(run_id, idx),
                        k = 6)

        f = myplots.fignum(4, (8, 10))
        ct = mycolors.getct(len(mtree.get_terminals()))

        import networkx
        for t, sp, ttype in zip([mtree, stree], [211, 212],
                                ['sequence', 'structural']):
            a = f.add_subplot(sp)
            layout = 'neato'
            G = phylo.to_networkx(t)
            Gi = networkx.convert_node_labels_to_integers(
                G, discard_old_labels=False)
            posi = networkx.pygraphviz_layout(Gi, layout, args = '')
            posn = dict((n, posi[Gi.node_labels[n]]) for n in G)

            # Leaves are drawn as coloured dots; internal nodes get size 0.
            networkx.draw(G, posn,
                          labels = dict([(n, '') for n in G.nodes()]),
                          node_size = [100 if n.name in maps else 0
                                       for n in G.nodes()],
                          width = 1, edge_color = 'black',
                          ax = a,
                          node_color = [ct[maps.get(n.name, -1)]
                                        for n in G.nodes()])

            a.annotate('Embedded tree for {0} alignment.'.format(ttype),
                       [0, 1], xycoords = 'axes fraction', va = 'top',
                       xytext = [10, 0], textcoords = 'offset pixels')
            a.annotate('Total branch length is {0}'.format(t.total_branch_length()),
                       [1, 0], xycoords = 'axes fraction', ha = 'right',
                       xytext = [-10, 10], textcoords = 'offset pixels')

        datafile = cfg.dataPath(
            'figs/gpm2/pt2_mus_cm_tree_embeddings_{0}_struct_{1}.ps'.format(run_id, idx))
        f.savefig(datafile, dpi = 200, format = 'ps')
def view4():
    """Export nonlinear regression weights and collect plot data per run.

    Lists the files in the 'batch/tmp' data directory whose name contains
    'mcmc', loads the matching input records and, for every record whose
    'out_iter_num' is not 2, writes two .sif files (all term factors, and
    single-factor terms only) while accumulating per-factor lists.

    Returns:
        (cnames, xvals, gvals, yvals, colors, rads, l_info, tfs, coefs).
        ``coefs`` is the coefficient table of the last processed run.

    Raises:
        Exception: a bare exception left in by the author for a hard-coded
            set of gene indices whose summed terms are positive.
        NameError: at the return if no run was processed (``coefs`` unbound).

    NOTE(review): ``all_coefs`` collects every run's table but is not
    returned -- confirm whether the last ``coefs`` is really what callers
    want.  The source indentation was lost; the nesting is a reconstruction.
    """
    tmp_dir = cfg.dataPath("batch/tmp")
    files = [name for name in os.listdir(tmp_dir) if "mcmc" in name]
    fpaths = [os.path.join(tmp_dir, name) for name in files]
    # The first ten characters of the file name are used as the record id.
    ids = [name[0:10] for name in files]
    inps = [butils.load_data(i, "input") for i in ids]
    idxs_good = nonzero(greater([elt.get("out_iter_num") for elt in inps], -1))[0]
    inps = [inps[i] for i in idxs_good]
    fpaths = [fpaths[i] for i in idxs_good]

    cnames, xvals, gvals, yvals = [], [], [], []
    colors, rads, tfs, all_coefs = [], [], [], []
    l_info = {}

    sif_all = "network/network_predmodel/regressionwts/nonlinear_all/nw_{0}.sif"
    sif_sing = "network/network_predmodel/regressionwts/nonlinear_sing/nw_{0}.sif"

    for l, (f, inp) in enumerate(zip(fpaths, inps)):
        if inp["out_iter_num"] == 2:
            continue
        print(inp["filename"])

        clustname = re.search(r"_([^_]+)\.mat", inp["filename"]).group(1)
        cnames.append(clustname)

        data = sio.loadmat(f)
        l_info[l] = {
            "cname": clustname,
            "filename": inp["filename"],
            "stay_same": data["stay_same"],
            "improve_ratio": data["improve_ratio"],
            "error_test": data["error_test"],
        }

        import compbio.utils.colors as mycolors

        ct = mycolors.getct(len(data["gene_names"]))
        term_list = [list(it.chain(*mod)) for mod in data["model"]]

        coefs = data["coefs_dic_nonlinear"]
        all_coefs.append(coefs)
        tfnames = data["tf_names"]
        tgnames = data["gene_names"]

        # `with` closes both files even when the debug raise below (or any
        # other error) fires; the original leaked the handles in that case.
        with open(cfg.dataPath(sif_all.format(l)), "w") as nlcof_all, \
                open(cfg.dataPath(sif_sing.format(l)), "w") as nlcof_sing:
            for i, terms in enumerate(term_list):
                if i in (5, 49, 53, 30, 17, 8, 38):
                    if sum(terms) > 0:
                        raise Exception()
                # Stored indices are 1-based; shift to 0-based.
                terms = [t - 1 for t in terms]
                for j, term in enumerate(terms):
                    if len(term) == 1:
                        wt = coefs[i][0][0][j]
                        nlcof_sing.write("{0}\t{1}\t{2}\n".format(
                            tfnames[term][0][0], tgnames[i][0], wt))
                    for k, fact in enumerate(list(set(term))):
                        wt = coefs[i][0][0][j]
                        nlcof_all.write("{0}\t{1}\t{2}\n".format(
                            tfnames[fact][0][0], tgnames[i][0][0], wt))
                        width = len(term) + 1
                        gvals.append([i] * width)
                        yvals.append([fact] * width)
                        colors.append([ct[c] for c in sorted(term)] + [1, 1, 1])
                        tfs.append([c for c in sorted(term)])
                        # Decreasing radii, largest marker first.
                        rads.append(((arange(1, len(term) + 2) ** 2) * 50)[::-1])
                        xvals.append([l] * width)

    return cnames, xvals, gvals, yvals, colors, rads, l_info, tfs, coefs
def show_output(outputs, show = 'conservation', save = True):
    """Plot either structure embeddings or conservation profiles of a run.

    Args:
        outputs: dict with the keys 'all_vecs' (itself holding 'all_time',
            'all_mut' and 'fiftyfifty'), 'run_id', 'exemplar_structs',
            'reference_seq', 'thermo_pairs', 'thermo_ex_inds' and 'title'.
            The title must contain a match for the pattern ``RF\\d*``.
        show: 'embeddings' draws the clustered structures with
            ``rplots.plot_clusters``; 'conservation' draws three stacked
            seismic plots.
        save: save the result (passed on for 'embeddings'; a .ps file named
            after the run title for 'conservation').

    Returns:
        None.  In 'conservation' mode the function returns early, after
        printing a message, when no vector has a non-empty second axis.

    Raises:
        Exception: if ``show`` is neither 'embeddings' nor 'conservation'.
    """
    mvecs = outputs['all_vecs']['all_time']
    tvecs = outputs['all_vecs']['all_mut']
    fvecs = outputs['all_vecs']['fiftyfifty']
    run_id = outputs['run_id']
    structs = outputs['exemplar_structs']
    ref = outputs['reference_seq']
    thermo_pairs = outputs['thermo_pairs']
    thermo_inds = outputs['thermo_ex_inds']
    run_title = outputs['title']
    fam_name = re.compile(r'RF\d*').search(run_title).group()

    # The original clears the current figure twice, tolerating a failure
    # each time (labelled a 3-D plotting bug); that is kept as is.
    for _ in range(2):
        fig = plt.gcf()
        try:
            fig.clear()
        except Exception as e:
            print('wonky 3d bug')
    fig.canvas.draw()

    exemplar_inds = sorted(list(set(thermo_inds)))
    struct_colors = dict(zip(exemplar_inds,
                             mycolors.getct(len(exemplar_inds))))

    if show == 'embeddings':
        pair_embedding = compute_embedding(thermo_pairs,
                                           aff_type = 'pairs',
                                           do_mve = False,
                                           ss_multiplier = None)
        shape_embedding = compute_embedding(thermo_pairs,
                                            aff_type = 'easy',
                                            do_mve = False,
                                            ss_multiplier = None)
        show_3d = True
        # Element 0 of each embedding result is the PCA projection.
        rplots.plot_clusters(thermo_inds,
                             {'shape': shape_embedding[0],
                              'pairs': pair_embedding[0]},
                             plot3d = show_3d,
                             title = 'projection ({0}) '.format(run_id),
                             save = save,
                             colors = struct_colors)

    elif show == 'conservation':
        ax0 = fig.add_subplot(311)
        lstructs = [project_lstruct(p, len(ref)) for p in structs]
        seismic.seismic([abs(l) for l in lstructs],
                        colors = struct_colors.values(),
                        ax = ax0)
        myplots.maketitle(ax0, 'Predicted conservation patterns for {0}'.format(fam_name))

        # Keep only the entries whose second axis is non-empty.
        shapes = array([shape(m) for m in mvecs])
        igood = nonzero(greater(shapes[:, 1], 0))[0]
        clade_colors = mycolors.getct(len(igood))
        mvg, tvg, fvg = [[vecs[i] for i in igood]
                         for vecs in [mvecs, tvecs, fvecs]]
        # NOTE(review): tvg appears twice and fvg is never used -- confirm
        # whether the third entry was meant to be fvg.
        cons_types = array([mvg, tvg, tvg])

        if shape(cons_types)[1] == 0:
            print('No good vectors!')
            return

        for c in cons_types:
            nrm = sum(c.flatten())
            if nrm == 0:
                nrm = 1
            # Divide by the guarded value: the original computed `nrm` but
            # then divided by the raw sum, defeating the zero check.
            c /= nrm

        mtype_sums = np.sum(np.sum(cons_types, 3), 0)
        stype_sums = np.sum(np.sum(cons_types, 3), 0).T

        ax1 = fig.add_subplot(312)
        seismic.seismic(stype_sums, colors = struct_colors.values(), ax = ax1)
        myplots.maketitle(
            ax1,
            'Observed conservation (struct v. clade) patterns for {0}'
            .format(fam_name))

        ax2 = fig.add_subplot(313)
        seismic.seismic(mtype_sums, ax = ax2, colors = clade_colors,
                        stacked = True, label_y = False)
        ax2.annotate('Observed conservation (clade v. struct) patterns for {0}'
                     .format(run_title),
                     [.5, 0], xycoords = 'axes fraction',
                     ha = 'center', va = 'top', size = 'x-large')

        if save:
            fig.savefig(cfg.dataPath('cs874/figs/cons_profiles/{0}.ps'.format(run_title)))

    else:
        raise Exception('show type not implemented: {0}'.format(show))
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
                   draw_distances = draw_all_easy,
                   draw_clusters = draw_all_easy,
                   draw_single_cluster = draw_all_hard):
    '''Cluster the sequences of an Rfam family and return them by cluster.

    Pairwise distances are fetched (or computed) through ``mem.getOrSet``,
    cut into at most five clusters with complete linkage, and the aligned
    sequences are returned grouped by cluster.

    kwds:
      rfid            Rfam family id.
      reset           passed to ``mem.getOrSet``.
      draw_distances  plot Levenshtein vs. tree distances and save the figure.
      draw_clusters   plot the clusters in the first two principal
                      components of the distance matrix and save the figure.

    NOTE(review): the ``tree`` argument is overwritten by the tree returned
    from ``rfam.get_fam`` before it is read, so its value has no effect;
    ``draw_single_cluster`` is not read either.

    Returns a list with one list of alignment entries per cluster.
    '''
    rutils = utils
    ali, fam_tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        dists_t = seq_dists(ali, rfid, tree = True)
        dists_l = seq_dists(ali, rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()

        lin = linregress(dtf, dlf)
        rsquared = lin[2] ** 2

        f = myplots.fignum(5, (7, 7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths',
                    [0, 1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10, -10], textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1, 0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10, 10], textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')
        ax.scatter(dtf, dlf, 100)

        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)

    dists = mem.getOrSet(setDistances, ali = ali, tree = fam_tree,
                         run_id = rfid, register = rfid,
                         on_fail = 'compute', reset = reset)
    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    clusters -= 1      # shift cluster ids down by one, in place

    if draw_clusters:
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists)

        f = myplots.fignum(5, (8, 8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0, 1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10, -10], textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1, 0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10, 10], textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')
        ax.scatter(pca_vecs[:, 0], pca_vecs[:, 1], 20, color = colors)

        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)

    #now take the largest cluster and do the analysis.
    def label_of(pair):
        # pair is (sequence index, cluster id)
        return pair[1]

    labelled = sorted(list(enumerate(clusters)), key = label_of)
    cgrps = {}
    for label, members in it.groupby(labelled, key = label_of):
        cgrps[label] = list(members)
    groups = list(cgrps.values())

    cbig = argmax([len(group) for group in groups])
    cluster_seqs = [pair[0] for pair in groups[cbig]]
    csize = len(cluster_seqs)
    seqs = [ali[c] for c in cluster_seqs]

    # Disabled diagnostic plot, kept as in the original.
    if 0:
        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists)
        colors = [ct[1] if elt in cluster_seqs else ct[0]
                  for elt in range(len(pca_vecs))]

        f = myplots.fignum(5, (8, 8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. PC0 component for chosen cluster.',
                    [0, 1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10, -10], textcoords = 'offset pixels')
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n - csize),
                    [1, 0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10, 10], textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')
        for s in cluster_seqs:
            ax.scatter(pca_vecs[:, 0], dists[s, :],
                       200 * exp(-(dists[s, :] / .5) ** 2),
                       color = colors, alpha = .2)

        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)

    clusters_final = [[pair[0] for pair in group] for group in groups]
    seqs_final = [[ali[idx] for idx in clust] for clust in clusters_final]
    return seqs_final
def cluster_tissues(nx = 20,ny = 500, timepoint = -1, step = 4,
                    sim = 'neg_dist',
                    imshow_sims = False,
                    scatter_sims = False,
                    hist_sims = False,
                    do_cluster= True,
                    do_show = True,
                    cstep = -1):
    '''Cluster nuclei by the values of the nx mRNAs with highest variance.

    The nx mRNAs whose values at `timepoint` have the largest standard
    deviation are used as features.  A similarity matrix over all nuclei
    is built with similarity(); an evenly spaced subsample of ny nuclei
    is passed to cluster() with a self similarity taken from
    ss.scoreatpercentile.  Exemplar indices are then read from the
    'bdtnp/clustering/nuclei/idxs' data file and every nucleus is
    assigned to the exemplar it is most similar to.

    Original note: uses the medoids method with the number of clusters
    set by exemplar self similarity as outlined in 6.874 and implemented
    at http://www.psi.toronto.edu/affinitypropagation

    imaging:
      imshow_sims:  show the training similarity matrix as an image
      scatter_sims: scatter plot of the training similarities
      hist_sims:    histogram of log10 of the shifted similarities
      do_show:      scatter nuclei coloured by assignment, save a tiff

    numerics:
      nx:        number of genes to cluster upon
      ny:        number of nuclei in the clustering subsample
      timepoint: which time to use for cluster computation
      step:      stride over nuclei when showing results
      sim:       method name handed to similarity()

    do_cluster and cstep are accepted but not used by this function.
    Returns None.
    '''
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)
    mrna_vals = list(mrnas.values())

    #choose to look only at one timepoint
    stds = [std(m['vals'][:,timepoint]) for m in mrna_vals]
    xinds = argsort(stds)[::-1][:nx]

    #Choose the most variable factors and use them as the
    #underlying variables from which to construct a similarity
    nuclei = array([mrna_vals[idx]['vals'][:,timepoint] for idx in xinds]).T
    t = [mean(nuclei, 0), std(nuclei, 0)]
    # NOTE(review): this writes 0 where the std is already 0, i.e. it is
    # a no-op.  If similarity() divides by t[1] the intent was probably
    # 1 -- confirm before changing.
    t[1][equal(t[1],0)] = 0
    sims = similarity(nuclei, transform = t, method = sim)

    cluster_inds = array(floor(linspace(0,len(nuclei)-1, ny)), int)
    cluster_training = sims[cluster_inds,:][:,cluster_inds]
    # The diagnostic plots used to read an undefined name `cluster_sims`
    # (NameError); the ny x ny training matrix is the only candidate.
    cluster_sims = cluster_training

    f = plt.figure(0) #, projection = '3d')
    if scatter_sims:
        ax = f.add_subplot(111)
        scatterx = [cluster_sims[i] for i in range(ny) for j in range(ny)]
        scattery = [cluster_sims[j] for i in range(ny) for j in range(ny)]
        ax.scatter(scatterx, scattery, s =3, alpha = .1)
    if imshow_sims:
        ax = f.add_subplot(111)
        cmap = mycolors.blackbody()
        ax.imshow(cluster_sims, cmap = cmap, interpolation = 'nearest')
    if hist_sims:
        ax = f.add_subplot(111)
        csf = cluster_sims.flatten()   # flatten() copies; matrix untouched
        csf -= max(csf)
        csf *= -1
        h = histogram(log10(1+csf), bins = 100)
        ax.plot(h[1][:-1],h[0])

    cluster(cluster_training,
            ss.scoreatpercentile(cluster_training,.2) )

    # presumably this file is produced by cluster() above -- verify.
    with open(cfg.dataPath('bdtnp/clustering/nuclei/idxs')) as fopen:
        c = [int(l.strip()) for l in fopen]
    c_training_exemplars = set(c)
    exemplar_inds = [cluster_inds[i] for i in c_training_exemplars]

    #I am being a bit lazy with subscripting here because I just assume
    #that the similarity is symmetric... I suppose I could let it be
    #asymmetric if I liked
    exemplars = nuclei[exemplar_inds,:]
    all_sims = similarity(nuclei, exemplars,
                          transform = t,
                          transform_exemplars = True,
                          method = sim)
    assignments = np.argmax(all_sims,1)
    colors = array(mycolors.getct(len(c)))

    if do_show:
        # Only the last timepoint is drawn (note the [-1:] slice).
        for tp in range(shape(mrna_vals[0]['vals'])[1])[-1:]:
            try:
                f.clear()
            except Exception:
                print('Weird 3d plotting error. Alas')
            nuclei = array([mrna_vals[idx]['vals'][:,tp] for idx in xinds]).T
            all_sims = similarity(nuclei, exemplars,
                                  transform = t,
                                  transform_exemplars = True,
                                  method = sim)
            assignments = np.argmax(all_sims,1)
            ax = f.add_subplot(111)
            xs = misc['x']['vals'][::step,0]
            zs = misc['z']['vals'][::step,0]
            ax.scatter(xs, zs, s= 50, color = colors[assignments[::step]])
            f.savefig(cfg.dataPath('figs/bdtnp/cluster_movie{0:02d}.tiff'.format(tp)),
                      format = 'tiff')
def cluster_exprs(all_members, ct_data, do_plot = False,
                  cluster_type = '4d', cluster_id = 4):
    '''Dump per-tissue, per-time expression values for one clustering.

    all_members[cluster_id] gives a cluster label for each row of
    ct_data.  Rows are grouped by label into "tissues"; column 1 of
    ct_data is used as the time index.  For every (tissue, time) pair
    the values of every mRNA at the selected rows are saved to a .mat
    file, and X-Z / Y-Z scatter plots are saved next to it, both under
    the 'soheil/' data directory.

    do_plot additionally draws the per-tissue row counts for times
    0..5 and a histogram of the tissue sizes on figure 1.

    NOTE(review): the function still ends with an unconditional
    `raise Exception()`, which looks like a debugging stop left in
    place.  It is kept because the intended return value is unknown.
    '''
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)
    members = all_members[cluster_id]

    tissues = dict([('t_{0}'.format(i), dict(cts = ct_data[equal(members, elt)]))
                    for i, elt in enumerate(set(list(members)))])
    nt = 6
    counts = array([[sum(equal(v['cts'][:,1], t)) for t in range(nt)]
                    for v in tissues.values()])
    if do_plot:
        f = plt.figure(1)
        f.clear()
        ax1 = f.add_subplot(121)
        ax2 = f.add_subplot(122)
        seismic.seismic(counts, ax = ax1, stacked = True,
                        colors = mycolors.getct(len(counts)))
        ax2.hist(np.sum(counts,1))

    all_exprs = {}
    for t, v in tissues.items():
        ct_all = v['cts']
        for time in set([row[1] for row in ct_all]):
            ct = [row for row in ct_all if row[1] == time]
            # One tuple per axis, so numpy treats it as a
            # multi-dimensional fancy index.
            idx = tuple(zip(*ct))
            exprs = dict([(k, elt['vals'][idx]) for k, elt in mrnas.items()])
            ys = misc['y']['vals'][idx]
            zs = misc['z']['vals'][idx]
            xs = misc['x']['vals'][idx]

            f = plt.figure(1)
            f.clear()
            ax1 = f.add_subplot(121, title = 'X-Z axis view for tissue {0}'.\
                                    format(t))
            ax2 = f.add_subplot(122, title = 'Y-Z axis view for tissue {0}'.\
                                    format(t))
            ax1.scatter(xs, zs)
            ax2.scatter(ys, zs)

            v['exprs'] = exprs
            all_exprs['tiss_{0}_time_{1}'.format(t,time)] = exprs

            # Close the handles deterministically; both payloads are binary.
            mat_path = cfg.dataPath('soheil/expression_c{0}_n{1}_tissue{2}_time{3}.mat'.\
                                        format(cluster_type,cluster_id,t,time))
            with open(mat_path, 'wb') as mat_file:
                sio.savemat(mat_file, exprs)
            fig_path = cfg.dataPath('soheil/expression_c{0}_n{1}_tissue{2}_time{3}.tiff'.\
                                        format(cluster_type,cluster_id,t,time))
            with open(fig_path, 'wb') as fig_file:
                f.savefig(fig_file)

    # NOTE(review): each `sub` below is one mRNA's value array, so
    # indexing it with the string key `k` looks wrong (mean(sub) may have
    # been intended).  Left unchanged pending confirmation.
    exprs_out = dict([(k, [mean(sub[k]) for sub in all_exprs[k].values()])
                      for k in all_exprs.keys()])
    out_path = cfg.dataPath('soheil/expression_c{0}_n{1}_intercluster.mat'.\
                                format(cluster_type,cluster_id))
    with open(out_path, 'wb') as out_file:
        sio.savemat(out_file, exprs_out)
    raise Exception()
def c2( launcher = None, ncluster =2000, host = 'tin', reset = 0,
        step = 10, exemp_time = 'all', doplot = False ,**kwargs):
    '''Two-phase clustering of (cell, time) expression vectors.

    Genes are chosen as the intersection of the 20 genes with the largest
    minimum variance and the 20 with the largest median variance.  Every
    (cell, time) pair becomes one row of `cell_data`; `ncluster` rows are
    drawn with a fixed random seed and a similarity matrix is computed
    over them.

    Phase 1 (launcher is None): build a bcl launcher with one job per
    self-similarity percentile and return whatever mem.getOrSet yields
    for it.

    Phase 2 (launcher given): read the launcher output, and for every
    job assign each row of `cell_data` to its most similar exemplar.

    launcher:   launcher returned by a previous phase-1 call, or None
    ncluster:   number of rows sampled for the similarity matrix
    host:       host name passed to bcl.launcher
    reset:      passed through to mem.getOrSet
    step:       stride over points when plotting
    exemp_time: 'all', or a time index restricting the sampled rows
    doplot:     draw a grid of X-Z scatter plots on figure 0
    kwargs:     accepted and ignored

    Returns the phase-1 output, or (all_members, ct_data) in phase 2,
    where all_members has one assignment array per job and ct_data holds
    the (cell, time) index pair of each row.
    '''
    mrnas = nio.getBDTNP()
    misc = nio.getBDTNP(misc = True)

    vals = array([v['vals'] for v in mrnas.values()])
    # assumes vals is (gene, cell, time) -- TODO confirm
    gvars = var(vals, 1)
    gminvars = np.min(gvars,1)
    gmedvars = median(gvars,1)
    min20 = argsort(gminvars)[::-1][:20]
    med20 = argsort(gmedvars)[::-1][:20]
    xgenes = array(list(set(min20).intersection(set(med20))))

    cell_data = vals[xgenes].transpose(1,2,0)
    scd = shape(cell_data)
    # xycoords[i, j] == [i, j]: the two leading indices of every row
    xycoords = (arange(scd[0])[:,newaxis,newaxis]*[1,0] +
                arange(scd[1])[newaxis,:,newaxis]*[0,1])
    cell_data = reshape(cell_data, (prod(scd[0:2]), scd[2]))
    xy_data = reshape(xycoords, (prod(scd[0:2]), 2))

    if exemp_time == 'all':
        inds = arange(len(cell_data))
    else:
        inds = arange(len(cell_data))[nonzero(equal(xy_data[:,1],exemp_time))[0]]

    # Fixed seed so the same subsample is drawn in both phases.
    np.random.seed(1)
    np.random.shuffle(inds)
    rand_thousand = inds[0:ncluster]

    sim_data = cell_data[rand_thousand]
    t = [mean(sim_data, 0), std(sim_data, 0)]
    # NOTE(review): assigning 0 where the std is already 0 is a no-op;
    # confirm whether 1 was intended.
    t[1][equal(t[1],0)] = 0

    metric = 'neg_dist'
    sims = similarity(sim_data, transform = t, method = metric)
    name = 'll_{0}_{1}_{2}'.format(metric,ncluster,exemp_time)

    def setLauncher(**kwargs):
        sims = kwargs.get('sims')
        metric = kwargs.get('metric')
        name = kwargs.get('name')
        d_in = [dict(similarities = sims,
                     self_similarity = ss.scoreatpercentile(sims, p),
                     metric = metric)
                for p in logspace(.1,1.5,8)]
        return bcl.launcher(d_in, host = host, name = name)

    if launcher is None:
        output = mem.getOrSet(setLauncher,
                              **mem.rc(dict(sims = sims,
                                            metric = metric,
                                            name = name,
                                            hardcopy = True,
                                            reset = reset,
                                            hard_reset = False,)))
        return output

    def setC2(launcher = launcher, **kwargs):
        if launcher is None:
            raise Exception()
        return launcher.output()

    #It appears that the bsub process failed for the first output.
    #No big deal. Debug later.
    # The keyword used to be misspelled 'harcopy' here; it now matches
    # the 'hardcopy' spelling of the call above.
    output = mem.getOrSet(setC2,
                          **mem.rc(dict(hardcopy = True,
                                        launcher = launcher,
                                        reset = reset,
                                        on_fail = 'compute',
                                        hard_reset = False,
                                        name = 'c2'+ name)))
    all_inds = array([squeeze(o['inds']) for o in output[:]])

    # Tuple of per-axis index arrays: explicit multi-dimensional indexing.
    xy_index = tuple(xy_data.T)
    xs = misc['x']['vals'][xy_index]
    ys = misc['y']['vals'][xy_index]
    zs = misc['z']['vals'][xy_index]

    colors = array(mycolors.getct(shape(all_inds)[1]))
    f = plt.figure(0)
    f.clear()

    all_tps = range(scd[1])
    nc = len(all_inds)
    nt = len(all_tps)

    all_members = []
    for i, inds in enumerate(all_inds):
        exemplars = sim_data[list(set(list(inds)))]
        sim = similarity(cell_data, exemplars, transform = t, method = metric)
        closest = argmax(sim, 1)
        all_members.append(closest)

        if doplot:
            # One small axes per (job, timepoint) cell of the grid.
            for j in range(nt):
                ax = f.add_axes([float(j)/nt, float(i)/nc, 1./nt, 1./nc])
                ax.set_yticks([])
                ax.set_xticks([])
                i_sub = nonzero(equal(xy_data[:,1], j) * greater(ys,0))[0]
                cs = colors[closest[i_sub]]
                x = xs[i_sub]
                z = zs[i_sub]
                ax.scatter(x[::step], z[::step], 40, alpha = .75,
                           c = cs[::step], edgecolor = 'none')

    ct_data = xy_data
    return all_members, ct_data
def one(all_means, all_mems, tfp, axis = 'tf', idxs = (0,1), fig = 5,
        choice_ax = 'x', nrml = 'axis', sorting = 'axis'):
    '''Overlay two clusterings as one RGB image and mark their maxima.

    The clustering selected by idxs[0] is drawn in the blue channel and
    the one selected by idxs[1] in the red channel of figure `fig`.

    all_means: per-clustering cluster means, indexed by idxs
    all_mems:  per-clustering membership vectors, indexed by idxs
    tfp:       array indexed as tfp[clustering, :, :]; its absolute
               value is the image when axis == 'tf'
    axis:      'tf' to show abs(tfp), 'gene' to show cluster membership
    idxs:      the two clustering indices to compare
    fig:       matplotlib figure number
    choice_ax: 'x' marks the largest entry of every column, 'y' the
               largest entry of every row
    nrml, sorting: NOTE(review): both are overwritten below ('axis' and
               'other'), so the values passed by the caller are ignored.
               Kept as is because honouring them would change the
               default output.

    Raises Exception for an unsupported `axis` or `choice_ax`.
    '''
    m = all_means[idxs[0]]
    c = all_mems[idxs[0]]
    proj = abs(tfp[idxs[0],:,:])

    m2 = all_means[idxs[1]]
    c2 = all_mems[idxs[1]]
    proj2 = abs(tfp[idxs[1],:,:])

    sqidxs = nu.net_sq_keyidxs()
    trgs, tfs = nu.parse_net()
    gene_sqidxs = [sqidxs[key] for key in trgs.keys()]

    k = len(m)
    ng = len(gene_sqidxs)

    print('''Getting ready to plot clusters mapped on to tf components. --note-- In its current incarnation, netutils orders tfs by their out degree and genes by their in degree. Thus viewmany() orders projects by TF out degree. Left unsorted, this is the order of the TF x axis.''')

    #how to normalize the image?
    #axis: equal sum for each tf over all clusters.
    #other: equal sums for each cluster in img
    nrml = 'axis'
    nrml_type = lambda x,y: np.max(x,y)
    sorting = 'other'
    print(axis)

    d0 = shape(m)[1]
    d2 = shape(m2)[1]
    show_membership = True

    if axis == 'tf':
        if sorting == 'axis':
            # order rows by the position of their largest entry
            img = proj[np.argsort(argmax(proj,1)), :]
            img2 = proj2[np.argsort(argmax(proj2,1)), :]
        else:
            # order columns by the position of their largest entry
            img = proj[:, np.argsort(argmax(proj,0))]
            img2 = proj2[:, np.argsort(argmax(proj2,0))]
    elif axis == 'gene':
        maxgene = 200
        gsort = argsort(c)
        if d0 == 8321 and not show_membership:
            img = m[:,gsort][:,:maxgene]
        else:
            img = zeros((k,ng))
            for i in range(ng):
                img[c[i],i] = 1
        if d2 == 8321 and not show_membership:
            img2 = m2[:,gsort][:,:maxgene]
        else:
            img2 = zeros((k,ng))
            for i in range(ng):
                img2[c2[i],i] = 1
    else:
        # Previously this fell through to a NameError on `img`.
        raise Exception('bad axis')

    #normalize to generate an image
    if nrml == 'axis':
        img2 = img2/nrml_type(img2,0)[newaxis,:]
        img = img/nrml_type(img,0)[newaxis,:]
    else:
        img2 = img2/nrml_type(img2,1)[:,newaxis]
        img = img/nrml_type(img,1)[:,newaxis]

    img /= np.max(img)
    # Each image is scaled by its own maximum; the second one used to be
    # divided by the (already normalised) maximum of the first.
    img2 /= np.max(img2)

    # blue channel <- img, red channel <- img2
    img_show = img[:,:,newaxis]*[0,0,1] + img2[:,:,newaxis]*[1,0,0]

    f = plt.figure(fig)
    f.clear()
    ax = f.add_axes([.05,.05,.9,.9])
    ax.imshow(img_show[:,:,:], aspect = 'auto')

    nchoice = 1
    if choice_ax == 'y':
        dim = shape(img)[0]
        maxes = [argsort(img,1)[::-1][:,:nchoice],
                 argsort(img2,1)[::-1][:,:nchoice]]
    elif choice_ax == 'x':
        dim = shape(img)[1]
        maxes = [argsort(img,0)[::-1][:nchoice,:],
                 argsort(img2,0)[::-1][:nchoice,:]]
    else:
        raise Exception('bad axis')

    ct = mycolors.getct(len(maxes))
    xs, ys, rs, cs = [], [], [], []
    for j, chosen in enumerate(maxes):
        for i in range(dim):
            for n in range(nchoice):
                if choice_ax == 'x':
                    ys.append(chosen[n][i])
                    xs.append(i)
                else:
                    xs.append(chosen[i][n])
                    ys.append(i)
                # first image gets the larger marker
                rs.append(20 + 30*(1-j))
                cs.append(ct[j])

    xs, ys, rs, cs = np.array(xs), np.array(ys), np.array(rs), np.array(cs)
    # white backing disc first, then the coloured marker on top
    ax.scatter(xs,ys,200,'1',edgecolor = 'none')
    ax.scatter(xs,ys,rs,cs,alpha = .8, edgecolor = 'none')
def sig_grid(num = 1 , method = 'tree', reset = False, plot_kcs = True, bp_means = False, bp_zeros = True, zero_ofs = 1e-6, bp_logs = True, show_kos = False, filter_rows_and_cols = False):
    '''Draw significance heatmaps and boxplots per experiment class.

    Figure 1: one heatmap per entry of `exps`, with the per-experiment
    mean overlaid as a line; saved as '<method>_net<num>_heatmaps.tiff'.
    Figure 3: boxplot of the values per class, saved as a tiff, then
    cleared and reused for a seismic plot of per-class histograms.
    Figure 4: percentage and mean of the above-zero values per class.

    plot_kcs:  circle the knockout cells on the heatmaps
    bp_means:  box-plot per-experiment means instead of single cells
    bp_zeros:  keep zero values in the box-plot data
    zero_ofs:  offset added before taking logs
    bp_logs:   box-plot log(value + zero_ofs)

    NOTE(review): `grid`, `exps`, `knockout_cells`, `knockout_vals` and
    `knockout_tfs` are not defined in this function; presumably they are
    module-level names -- verify.  `reset` and `show_kos` are not used.
    NOTE(review): the source had lost its indentation; the nesting
    below is reconstructed -- verify against the original layout.
    '''
    #Make and annotate the heatmap figure
    f = plt.figure(1, facecolor = 'w')
    f.clear()
    axdims= .9
    ax_box = array([.05,.05,axdims,axdims])
    sg_big_hm_annotations(f, ax_box)

    #Set up the sizes of each group axis in the heatmap figure
    kwts = float(sum([len(v) for v in exps.values()]))
    mwidth = .015
    msize = mwidth*kwts
    kw_total = kwts + ( msize * (len(exps)-1))
    ofs = 0

    # Knockout cells are zeroed in place, so `grid` is modified.
    allow_tf_kn = False
    if not allow_tf_kn:
        grid[zip(*knockout_cells)] = 0

    #Some more heatmap configuration.
    # colour range: 10th and 90th percentile of the positive entries
    saturation = [np.percentile(grid[nonzero(greater(grid,0))],10), np.percentile(grid[nonzero(greater(grid,0))],90)]
    tf_srt = argsort(np.mean(grid,1))

    all_bps = []
    expsums = [np.mean( grid.T[v,:], 1) for v in exps.values()]
    max_sum = np.max((list(it.chain(*expsums))))

    #For each experiment class, plot a heatmap and overlay per exp sums
    for k , v in exps.iteritems():
        #Axes positioning
        wid = len(v)
        ax_ofs = array([ofs/kw_total, 0, (wid) / kw_total,1.])
        ax_box = array([.05,.05,0.,0.])
        ax_ofs = (ax_ofs * axdims) + ax_box

        #Make heatmap axes.
        ax = f.add_axes(ax_ofs, frameon = False)
        sums = np.mean(grid.T[v,:],1)
        exp_srt = argsort(sums)[::-1]
        hm.heatMap( grid.T[v[exp_srt],:][:,tf_srt], axes = ax, vmin = saturation[0], vmax = saturation[1])

        #Make overlay axes.
        ax2 = f.add_axes(array(ax_ofs) + array([0,0,0,0]), frameon = True, axisbg = 'none', xticks = [], yticks = [])

        #Make the axes look the way I like em
        for a in ax2.spines.values():
            a.set_linewidth(2)
            a.set_alpha(.5)

        these_knockouts = nonzero([c [1]in v for c in knockout_cells])
        kc = knockout_cells[these_knockouts]
        kv = knockout_vals[these_knockouts]

        #If plot kcs is selected, plot the cells corresponding to TF deletion/OE
        if plot_kcs:
            if len(kc) > 0:
                ax.scatter(*zip(*[( list(v).index(x[1]),x[0]) for x in kc]), s =50, color = 'none', edgecolor = 'black', linewidth = 3)

        color = 'blue'
        ax2.plot(sums[exp_srt], linewidth = 4, color = color)

        # data for this class in the boxplot
        if bp_means:
            bpelts = sums
        else:
            bpelts = grid.T[v,:].flatten()
        if not( bp_zeros ):
            bpelts = bpelts[nonzero(bpelts)]
        all_bps.append(bpelts)

        ax2.set_xlim([0,wid])
        ax2.set_ylim([0,max_sum])
        ax.set_xlim([0,wid])
        ax.set_ylim([0,shape(grid)[0]])
        ax2.set_xticks([])

        #Annotate each axis
        tbb = matplotlib.transforms.Bbox(ax2.bbox).translated(0,-20)
        t = ax2.text(-2,0, k, va = 'bottom', ha = 'right', rotation = 90, color = 'black', size = 'x-large', family = 'serif')
        ofs += wid + msize

    #Make the boxplot figure
    f2 = plt.figure(3)
    plt.clf()
    if bp_means:
        bp_kos = array([ mean(grid.T[g[0],:],0) for g in it.groupby(sorted(\
            [ko[1] for ko in knockout_cells])) ])
    else:
        bp_kos = array(knockout_vals)
    if not bp_zeros:
        bp_kos = bp_kos[nonzero(bp_kos)]
    # the knockout values form the last box
    all_bps = all_bps + [bp_kos]

    ax3 = f2.add_subplot('111')
    if bp_logs:
        all_bps = [log(b + zero_ofs) for b in all_bps]
    # log of the offset is the value a zero entry maps to when bp_logs
    # is on; it is used below as the "above zero" threshold.
    # NOTE(review): the threshold is applied even when bp_logs is False.
    bp_lzero = log(zero_ofs)

    boxplots = ax3.boxplot([bp for bp in all_bps], widths= .5)
    for p in boxplots.values():
        for e in p:
            e.set_linewidth(4)

    #Annotate the boxplot figure
    # NOTE(review): 8 is hard-coded; presumably 7 experiment classes
    # plus the knockout class -- verify against exps.
    ann_str = ''
    for i in range(8):
        ann_str += '{0}: {1}\n'.format(i+1, (exps.keys() + ['TF Knockout/OE'])[i])
    ax3.annotate(ann_str, [0,1],xycoords = 'axes fraction', xytext = [10,-10], textcoords = 'offset pixels', va = 'top', ha = 'left')
    ax3.set_title('''Boxplot of significances per experiment type for {3} learning method, Net {4} 
Filtered out were {0} cells corresponding to {1} TFs Knocked out or OverExpressed. {2} of these cells have nonzero importance and are plotted at x=9, Showing Means: {5}, Showing zeros: {6}, Plotting logs {7}'''.\
        format(len(knockout_cells), len(knockout_tfs), len(nonzero(knockout_vals)[0]), method, num, bp_means, bp_zeros, bp_logs))
    ax3.set_ylabel('significance')
    ax3.set_xlabel('experiment class')

    f.savefig(cfg.dataPath('daniel/figs/{0}_net{1}_heatmaps.tiff'.format(method, num)), format = 'tiff')

    # Output subdirectory picked from the flag combination.
    # NOTE(review): combinations not listed (e.g. the defaults
    # bp_zeros=True, bp_logs=True) evaluate to False, which is then
    # formatted into the file name -- confirm this is intended.
    plam = lambda: filter_rows_and_cols and 'nonzero_exps_and_tfs_cells_log/'\
        or bp_zeros and not bp_logs and bp_means and 'zeros_means_nolog/'\
        or not bp_zeros and bp_means and not bp_logs and 'nozeros_means_nolog/'\
        or not bp_zeros and bp_means and bp_logs and 'nozeros_means_log/'\
        or bp_zeros and not bp_means and not bp_logs and 'zeros_cells_nolog/'\
        or not bp_zeros and not bp_means and not bp_logs and 'nozeros_cells_nolog/'\
        or not bp_zeros and not bp_means and bp_logs and 'nozeros_cells_log/'
    dataDir = cfg.dataPath('daniel/figs/{2}{0}_net{1}_boxplots.tiff'.\
        format(method, num,plam()))
    print 'saving {0}'.format(dataDir)
    if not os.path.isdir(os.path.dirname(dataDir)):
        os.mkdir(os.path.dirname(dataDir))
    if os.path.isfile(dataDir):
        os.remove(dataDir)
    f2.savefig(dataDir, format = 'tiff')

    # mean of the above-threshold values of every class
    mean_xvals = [ mean(all_bps[i][nonzero(greater(all_bps[i],bp_lzero))]) for i in range(len(all_bps))]
    pdfs, xvals = zip(*[histogram(x, bins=50, range=[-15,8], normed=False) for x in all_bps])

    import compbio.utils.colors as colors
    c = colors.getct(len(pdfs))

    # Figure 3 was saved above; it is cleared and reused here.
    f3 = plt.figure(3)
    f3.clear()
    sax = f3.add_subplot('111')
    seismic.seismic([array(x,float)/ sum(x) for x in pdfs], xax = xvals[0][:-1],stacked = False, colors = c, xmarkpts = mean_xvals, ax = sax)

    f4 = plt.figure(4)
    f4.clear()
    ax = f4.add_subplot('121')
    ax.set_title('(log base 10) of Percentage Nonzero for Experiment Classes')
    percs = log10(array([100*float(len(nonzero(greater(x,bp_lzero))[0])) / len(x) for x in all_bps]))
    ax.plot(percs,linewidth = 6)
    ax.set_yticks(percs)
    names = exps.keys() + ['TF Knockout/OE']
    ax.set_yticklabels(['{1}\n{0}'.format('%2.2f' % (10**p), names[idx]) for idx,p in enumerate(percs)])

    ax2 = f4.add_subplot('122')
    ax2.set_title('Mean of Nonzero Experiments for Experiment Classes')
    means = array([mean(bp[nonzero(greater(bp,bp_lzero))]) for bp in all_bps])
    # NOTE(review): arange(1,9) assumes exactly 8 classes -- verify.
    ax2.plot(arange(1,9), means,linewidth = 6)
    ax2.boxplot( [bp[nonzero(greater(bp,bp_lzero))] for bp in all_bps], widths = .5)
    ax2.set_yticks(means)
    names = exps.keys() + ['TF Knockout/OE']
    ax2.set_yticklabels(['{1}\n{0}'.format('%2.2f' % (p), names[idx]) for idx,p in enumerate(means)])
def expr_gmm_onoff(expr_in, log_expr = False, fig = 1, draw = False):
    '''Fit a two-state 1-D Gaussian mixture to expression values.

    expr_in:  sequence of expression values
    log_expr: fit log(expr + std(expr)) instead of the raw values
    fig:      matplotlib figure number used when draw is set
    draw:     scatter every value once per mixture component and once
              more coloured by its best component, or black when the
              log-ratio of the two component values is at most
              log(1.5 * e)

    Returns the per-sample array produced by GMM.eval with its columns
    reordered by increasing component mean.
    '''
    values = array(expr_in)
    spread = std(values)
    if log_expr:
        values = log(values + spread)

    # the mixture model wants an (n, 1) column
    n = len(values)
    column = zeros((n,1))
    for row, val in enumerate(values):
        column[row] = val
    expr = column

    from scikits.learn import gmm

    #demand separation of max from alternate hypotheses by e/2
    cmin_diff = log(e*1.5)
    k = 2

    model = gmm.GMM(n_states = k, n_dim = 1)
    model.fit(expr)
    probs, clusters = model.decode(expr)
    probs, mixtures = model.eval(expr)

    # put the columns in order of increasing component mean
    mean_as = argsort(model.means,0)
    mixtures[:, :] = mixtures[:, squeeze(mean_as)]

    if not draw:
        return mixtures

    n = len(expr)
    xax = arange(n)[argsort(expr,0)]

    figure = plt.figure(fig)
    figure.clear()
    axes = figure.add_axes([0,0,1,1])
    palette = mycolors.getct(k)

    # one open circle per (sample, component)
    cs, rs = [], []
    x2s, y2s, c2s, r2s = [], [], [], []
    for i in range(n):
        cs.append(palette[mean_as[clusters[i]]])
        rs.append(100)
        for j in range(k):
            x2s.append(i)
            y2s.append(expr[i])
            c2s.append(palette[j])
            r2s.append(pow(exp(mixtures[i,j]),2)*100)

    # one filled circle per sample, black when the call is not reliable
    x3s, y3s, c3s, r3s = [], [], [], []
    for i in range(n):
        row = mixtures[i,:]
        ranked = argsort(row)[::-1]
        gap = log(row[ranked[0]]) - log(row[ranked[1]])
        x3s.append(i)
        y3s.append(expr[i])
        r3s.append(200)
        if gap > cmin_diff:
            c3s.append(palette[ranked[0]])
        else:
            c3s.append([0,0,0])

    axes.scatter(x2s,y2s,r2s,edgecolor=c2s,facecolor = 'none')
    axes.scatter(x3s,y3s,r3s,c3s)
    return mixtures
def dsi_boxplot(num = 1 , method = 'tree', reset = False,
                plot_kcs = True,
                bp_means = False,
                bp_zeros = True,
                zero_ofs = 1e-6,
                bp_logs = True,
                show_kos = True,
                log_scale = True,
                filter_rows_and_cols = True,
                boxplot = True,
                plot_type = 'dsi_final'):
    '''Summarise TF significances per experiment class and plot them.

    The grid returned by parseNet is indexed as grid[tf, experiment].
    Experiments are split into classes with the predicates from
    sg_choosers(); cells whose TF is listed in the experiment's
    'DeletedGenes' or 'OverexpressedGenes' description form an extra
    'ko' class.  A tab separated summary is written to
    'daniel/txt/net<num>_<method>' and a figure is saved.

    num, method, reset:   passed to parseNet (reset also to mem.getOrSet)
    show_kos:             include the 'ko' class in the 'twoplots' figure
    log_scale:            log y axis for the 'dsi_final' figure
    filter_rows_and_cols: drop all-zero TFs and experiments and relabel
                          the gene ids in the descriptions accordingly
    boxplot:              box plot instead of bars of medians
    plot_type:            'dsi_final' (default, the previous hard-coded
                          behaviour) or 'twoplots'

    plot_kcs, bp_means, bp_zeros, zero_ofs and bp_logs are accepted but
    not used here.  Returns None.  Raises ValueError for an unknown
    plot_type.
    '''
    grid, descriptions = parseNet(num= num, method = method, reset = reset)
    grid = array(grid)
    descriptions = dict(descriptions)

    if filter_rows_and_cols:
        #Filter out bad rows and columns
        good_exps = nonzero(np.max(grid,0))[0]
        tf_new_idxs = list(argsort(np.max(grid,1))[::-1])
        new_grid = grid[tf_new_idxs]
        good_tfs = nonzero(np.max(new_grid,1))[0]

        #Relabel the descriptions to take filtration into account
        # old TF index -> position after the reordering
        new_idx_of = dict((int(old), new) for new, old in enumerate(tf_new_idxs))
        digits = re.compile(r'(\d+)')

        def relabel(match):
            old = int(match.group())
            if old in new_idx_of:
                return str(new_idx_of[old])
            return match.group()

        new_descriptions = {}
        for k, value in descriptions.items():
            if 'Genes' in k:
                value = [digits.sub(relabel, g) for g in value]
            new_descriptions[k] = list(array(value)[good_exps])

        grid = new_grid[good_tfs, :][:, good_exps]
        descriptions = new_descriptions

    #Split experiments into categories
    col_choosers = sg_choosers()
    # one {description name: value} record per experiment
    records = [dict(zip(descriptions.keys(), elt))
               for elt in zip(*descriptions.values())]
    exps = {}
    for k, chooser in col_choosers.items():
        exps[k] = nonzero([chooser(rec) for rec in records])[0]
    #Remove 'general' as the values wind up being all zeros.
    exps.pop('general')

    #Mark experiments that knock out TFS
    def matching_exps(tag, key):
        return nonzero([tag in x + ',' for x in descriptions[key]])[0]

    tf_kn_matches = []
    for t in range(shape(grid)[0]):
        tag = 'G{0},'.format(t)
        tf_kn_matches.append(sorted(list(it.chain(
            matching_exps(tag, 'DeletedGenes'),
            matching_exps(tag, 'OverexpressedGenes')))))
    knockout_cells = array(list(it.chain(*[
        [(i, exp) for exp in matches]
        for i, matches in enumerate(tf_kn_matches)])))

    kn_exps = {'ko': []}

    def getBPS(**kwargs):
        xlabels = []
        nz_frac_std = []
        nz_frac_mean = []
        nz_val_std = []
        nz_val_mean = []
        nz_colvals = []
        for k, ecols in exps.items():
            # Knockout cells of this class: unique, ordered by
            # experiment and then TF.
            ecol_set = set(int(j) for j in ecols)
            ko_here = set((int(cell[0]), int(cell[1]))
                          for cell in knockout_cells
                          if int(cell[1]) in ecol_set)
            kn_exps['ko'] += sorted(ko_here,
                                    key = lambda cell: (cell[1], cell[0]))

            # one grid column per experiment of the class
            cexp = [grid[:, col] for col in ecols]
            if not cexp:
                for arr in [nz_frac_std, nz_frac_mean,
                            nz_val_std, nz_val_mean]:
                    arr.append(0.)
                nz_colvals.append([])
                xlabels.append(k)
                continue

            colwise_fracs = [mean(1.*greater(col,0)) for col in cexp]
            colwise_exprs = [mean(col[nonzero(greater(col,0))]) for col in cexp]
            # a column without positive entries has mean nan; count it as 0
            colwise_exprs = [v if not isnan(v) else 0 for v in colwise_exprs]
            nz_colvals.append(colwise_exprs)
            nz_frac_std.append(std(colwise_fracs)/sqrt(len(colwise_fracs)))
            nz_frac_mean.append(mean(colwise_fracs))
            nz_val_std.append(std(colwise_exprs)/sqrt(len(colwise_exprs)))
            nz_val_mean.append(mean(colwise_exprs))
            if isnan(nz_val_mean[-1]):
                raise Exception()
            xlabels.append(k)

        for k, ecells in kn_exps.items():
            ecells = array(ecells)
            nz_frac_std.append(0)
            nz_val_std.append(0)
            if len(ecells) == 0:
                for arr in [nz_frac_mean, nz_val_mean]:
                    arr.append(0.)
                nz_colvals.append([])
            else:
                cell_vals = grid[tuple(ecells.T)]
                positive = cell_vals[greater(cell_vals,0)]
                nz_frac_mean.append(mean(greater(cell_vals,0)))
                nz_val_mean.append(mean(positive))
                nz_colvals.append(positive)
            xlabels.append(k)
        return (xlabels, array(nz_frac_std), array(nz_val_std),
                array(nz_frac_mean), array(nz_val_mean),
                [array(cv) for cv in nz_colvals])

    xlabels, nz_frac_std, nz_val_std, nz_frac_mean, nz_val_mean, nz_colvals = \
        mem.getOrSet(getBPS, on_fail = 'compute', reset = reset)

    # fixed display order; classes that are absent are skipped
    args = [xlabels.index(x) for x in ['general_ts',
                                       'drug',
                                       'drug_ts',
                                       'genetic',
                                       'genetic_ts',
                                       'drug_genetic',
                                       'drug_genetic_ts',
                                       'ko'] if x in xlabels]
    # nz_val_std used to be assigned to a misspelled name here and so
    # kept its unsorted order.
    xlabels, nz_frac_std, nz_val_std, nz_frac_mean, nz_val_mean = \
        array(xlabels)[args], nz_frac_std[args], nz_val_std[args], \
        nz_frac_mean[args], nz_val_mean[args]
    nz_colvals = [nz_colvals[a] for a in args]

    f = plt.figure(0)
    f.clear()

    with open(cfg.dataPath('daniel/txt/net{0}_{1}'.format(num,method )),'w') as topen:
        topen.write('\t'.join(['exp_class','mean_influence','std_influence','stderr_influence'])+'\n')
        for idx, exp_class in enumerate(xlabels):
            colvals = nz_colvals[idx]
            # standard error is std / sqrt(n), as in getBPS above
            topen.write('{0}\t{1}\t{2}\t{3}\n'.format(
                exp_class, mean(colvals), std(colvals),
                std(colvals)/sqrt(len(colvals))))

    if plot_type == 'dsi_final':
        margin = .05
        wid0 = .75
        cs = mycolors.getct(len(nz_colvals))
        ax0 = f.add_axes([margin,margin, wid0 , 1. - 2* margin],
                         title = 'Experminent mean significances: blue (red) lines denote quartiles (media).')
        if log_scale:
            ax0.set_yscale('log')
        if boxplot:
            # every class but the last ('ko') gets a box; the mean of
            # the last class is drawn as a dotted line
            ax0.boxplot(nz_colvals[0:-1], widths = [.5] * (len(nz_colvals )-1))
            ax0.hlines([mean(nz_colvals[-1])],-100, 100,color = 'red',linestyle = ':',linewidth = 1)
        else:
            ax0.bar(.2 + arange(len(nz_colvals[0:-1])),
                    [median(c) for c in nz_colvals[0:-1]],
                    color = cs[:-1])
        ax0.set_xticklabels(xlabels[:-1])
        f.savefig(cfg.dataPath('daniel/figs/final_bp_net{0}_{1}_{2}.ps'.\
                                   format(num, method, 'log' if log_scale else 'lin')),
                  dpi = 10)
        return
    elif plot_type == 'twoplots':
        nkeys = len(xlabels)
        if show_kos:
            xi = arange(nkeys)
        else:
            xi = arange(nkeys -1)
        y1 = nz_val_mean[xi]
        s1 = nz_val_std[xi]
        y2 = nz_frac_mean[xi]
        s2 = nz_frac_std[xi]
        a1 = f.add_subplot(211, ylim =[0, max(y1)+max(s1)],
                           title = 'mean value of nonzero influences\n standard error across experiments')
        a2 = f.add_subplot(212, ylim =[0,max(y2)+ max(s2)],
                           title = 'mean values of fraction nonzero influences\n standard error across experiments' )
        colors = mycolors.getct(nkeys)
        wofs = .15
        a1.bar(xi+wofs,y1,1.-wofs*2, linewidth = 3,color = colors, ecolor = 'black')
        a2.bar(xi+wofs,y2,1.-wofs*2, linewidth = 3,color = colors, ecolor = 'black' )
        p1,c1,b1 = a1.errorbar(xi+.5, y1, yerr = s1,capsize = 15, elinewidth = 4,
                               color = 'black',linewidth = 0, ecolor = 'black')
        p2,c2,b2 = a2.errorbar(xi+.5, y2, yerr = s2,capsize = 15, elinewidth = 4,
                               color = 'black',linewidth =0, ecolor = 'black')
        for c in c1:
            c.set_alpha(1.)
        for c in c2:
            c.set_color('black')
        for c in a2.get_children() + a1.get_children():
            # best effort: not every artist has a line width
            try:
                if not c in [p1,p2]:
                    c.set_linewidth(4)
            except Exception:
                pass
        a2.set_xticklabels([])
        for i in xi:
            a2.text( float(i) + .5,0,xlabels[i] ,
                     rotation = '-15',size = '16', ha = 'left',va='top')
        f.savefig(cfg.dataPath('daniel/figs/latest/{1:03d}_{0}_{2}.tiff'.\
                                   format('no_kos' if not show_kos else 'kos',
                                          num ,
                                          'log' if log_scale else 'lin')),
                  format = 'tiff')
        return
    else:
        raise ValueError('unknown plot_type: {0!r}'.format(plot_type))