def make_dr_plots(R, choice='samples'):
    """Run PCA, t-SNE and KCA and pass each result to its own 2x2 dimplot.

    With ``choice == 'samples'`` the reduction uses ``R.data.matrix('log')``
    directly and the plotted members are ``R.data.samples``; for any other
    value the matrix is transposed with ``getT()`` and the members are
    ``R.data.features``.

    Returns the tuple ``(pca_run, tsne_run, kca_run)``.
    """
    by_sample = (choice == 'samples')
    members = R.data.samples if by_sample else R.data.features
    log_matrix = R.data.matrix('log')
    matrix = log_matrix if by_sample else log_matrix.getT()
    stem = R.args.prefix + ('_samples_' if by_sample else '_features_')

    reducer = rage_DR.DR(R.args, R.progress)

    def plot(run, settings):
        # Each reduction gets a fresh dimplot and its own 'out' name.
        dplot.dimplot(2, 2, R.args, R.progress).add_data(members, run, settings)

    pca_run = reducer.run_pca(matrix)
    plot(pca_run, {'title': 'PCA', 'out': stem + 'pca.pdf'})

    # NOTE(review): run_tsne() receives no matrix here, unlike run_pca and
    # run_kca -- presumably it reuses state left by run_pca; confirm.
    tsne_run = reducer.run_tsne()
    plot(tsne_run, {'title': 'TSNE', 'out': stem + 'tsne.pdf'})

    kca_run = reducer.run_kca(matrix)
    plot(kca_run, {'title': 'KCA', 'out': stem + 'kca.pdf', 'zoom': True})

    return pca_run, tsne_run, kca_run
def evaluate_model2(self): self.progress.start_minor('Running Model Regressions', len(self.D.features), False) for dist in self.options.dist: print 'yo' M = rt.RegModel(self.X, dist, self.options, self.progress, True).run(self.Y, self.feature_names).aggregate(True) print 'yo' M_resids, C_resids = M.get_resids(COVAR=True) print 'yo' M_out = rage_regression_outputs.eval_output(self.options).write( M, self.feature_names) #[f.name for f in self.D.features]) Mc = rt.RegModel(self.Xc, dist, self.options).run( self.Y, self.feature_names).aggregate(True) #Mp = rt.RegModel(self.Xp,dist,self.options).run(self.Y,self.feature_names).aggregate(True) print 'yo' sims = dd(list) self.progress.start_minor('Running Model PCA', len(self.D.features), False) dim = rage_DR.DR( self.options, self.progress) #.set_fit_matrix(self.D.matrix('log')) pca_init = dim.set_y_matrix( self.Y, LOG_TRANSFORM=dist[-3::] != 'LOG').pca(req='brief') # pca_c_resid = dim.set_y_matrix(C_resids,LOG_TRANSFORM = dist[-3::] != 'LOG').pca(req='brief') pca_resid = dim.set_y_matrix( M_resids, LOG_TRANSFORM=dist[-3::] != 'LOG').pca(req='brief')
def run_tsne(self,R):
    """Run t-SNE on the log sample matrix and finish a 2x2 dimplot for it.

    The plot name is ``R.args.prefix + '_transformsamples_TSNE' + 'tsne.pdf'``
    and is used both as the 'out' option and as the ``finish`` argument.
    """
    reducer = rage_DR.DR(R.args, R.progress)
    pdf_path = R.args.prefix + '_transformsamples_TSNE' + 'tsne.pdf'
    R.progress.start_minor('TSNE')

    samples = R.data.samples
    embedding = reducer.run_tsne(R.data.matrix('log'))

    figure = dplot.dimplot(2, 2, R.args, R.progress)
    figure = figure.add_data(samples, [embedding],
                             {'title': 'TSNE', 'out': pdf_path})
    figure.finish(pdf_path)
    R.progress.end()
def run_condense_pca(self, R):
    """Plot condensed and raw samples together on the condensed-data PCA.

    Features are restricted to those whose names occur in both
    ``R.condensed_data.features`` and ``R.data.features``.  A full PCA is run
    on the condensed counts, its coefficients are written with
    ``self.write_coeffs``, and the raw counts are then run through the same
    ``DR`` object with ``SET_TRANSFORM=True``.  Both point sets are added to
    one 1x1 dimplot.

    This method terminates the process with ``sys.exit()`` once the plot is
    finished, so it never returns to the caller.
    """
    raw = R.data
    condensed = R.condensed_data

    # Set lookups replace the original per-feature list rebuild, which made
    # the shared-feature detection quadratic in the number of features.
    raw_names = set(f.name for f in raw.features)
    condensed_names = set(f.name for f in condensed.features)
    # NOTE(review): cF follows the condensed feature order and rF the raw
    # feature order; whether the rows of Yc and Y need to line up for the
    # shared projection cannot be confirmed from this method.
    cF = [f for f in condensed.features if f.name in raw_names]
    rF = [f for f in raw.features if f.name in condensed_names]

    Yc = [[s.cnts[f.idx] for s in condensed.samples] for f in cF]
    Y = [[s.cnts[f.idx] for s in raw.samples] for f in rF]

    dr = rage_DR.DR(R.args, R.progress)
    # The original first assigned a '_condensepca_' prefix and overwrote it
    # before any use; only this effective value is kept.
    out_name = R.args.prefix + '_transformsamples_'
    R.progress.start_minor('PCA')

    condense_run = dr.set_y_matrix(Yc, LOG_TRANSFORM=True).pca(req='FULL')
    self.write_coeffs(condense_run['coefs'], cF, out_name + 'coefs.out')
    pca_run = dr.set_y_matrix(Y, LOG_TRANSFORM=True,
                              SET_TRANSFORM=True).pca(req='FULL')

    dimplot = dplot.dimplot(1, 1, R.args, R.progress)
    # assumes both 'pts' values are lists so that '+' concatenates them
    # -- TODO confirm against rage_DR.
    shared_run = {
        'pts': condense_run['pts'] + pca_run['pts'],
        'axes': condense_run['axes']
    }
    shared_samples = list(condensed.samples) + list(raw.samples)
    dimplot.add_data(shared_samples, [shared_run], {
        'title': 'SHARED_PCA',
        'out': out_name + 'pca.pdf'
    })
    dimplot.finish(out_name + 'pca')
    R.progress.end()
    sys.exit()
def make_pca_and_tsne_plots(self):
    """Draw PCA, KCA, ICA and t-SNE panels on a 2x2 subplot and save it.

    Point sizes come from ``scale_vals`` applied to the number of count
    entries per sample (bounds 20 and 55); colours come from
    ``self.color_labels``.  The figure is saved as
    ``self.args.prefix + '_dimred.png'``.
    """
    seaborn.set(rc={
        'axes.facecolor': 'black',
        'figure.facecolor': 'cornflowerblue'
    })
    entry_counts = [len(s.cnts.keys()) for s in self.input.samples]
    my_sizes = scale_vals(entry_counts, 20, 55)

    self.progress.start_subtopic('Calculating PCA/TSNE', '', 0)
    reducer = rage_DR.DR(self.args, self.progress)
    reducer = reducer.set_matrix(self.input.data_matrix('log'))
    reducer.run_pca().run_kca().run_tsne().run_ica()

    figure = rage_subplots.subplot(2, 2, self.args)
    figure.add_legend(self.color_key.keys(), self.color_key.values())

    # One (points, options) entry per panel, in drawing order.
    panels = [
        (reducer.pca_pts, {'vars': reducer.pca_vars, 'title': 'PCA',
                           'colors': self.color_labels, 'sizes': my_sizes}),
        (reducer.kca_pts, {'type': 'kca', 'title': 'KCA',
                           'colors': self.color_labels, 'zoom': True,
                           'sizes': my_sizes}),
        (reducer.ica_pts, {'type': 'ica', 'title': 'ICA',
                           'colors': self.color_labels, 'sizes': my_sizes}),
        (reducer.tsne_pts, {'type': 'tsne', 'colors': self.color_labels,
                            'sizes': my_sizes}),
    ]
    for points, panel_opts in panels:
        figure.add_pca_data(points, panel_opts).update({'clear_axes': True})

    figure.save(self.args.prefix + '_dimred.png', {})
    self.progress.finish_subtopic()
def iterative_run(self,Y,S_NAMES,iteration=1,GROUPS=2,SELECTION_FRACTION=0.24,MAX_SIZE=4,DIMS=3,TAIL=False,LOWER=False,LOWEST=False):
    """Run one round of PCA-driven pairing of sample columns.

    Parameters, as used by the code below:
      Y                  -- nested list indexed Y[i][j]; i runs over
                            self.R.data.features, j over S_NAMES.
      S_NAMES            -- names of the columns of Y; merged columns carry
                            '@'-joined names.
      iteration          -- not read in this method.
      GROUPS             -- the body only runs when this equals 2.
      SELECTION_FRACTION -- pairing stops once more than
                            len(s_order) * SELECTION_FRACTION pairs exist.
      MAX_SIZE           -- upper bound on the number of '@'-separated parts
                            in a merged name.
      DIMS               -- overwritten with 2 before it is used.
      TAIL               -- sort candidates ascending instead of descending.
      LOWER / LOWEST     -- skip candidates whose observation count exceeds
                            the median / the 25th percentile.

    Returns (pca_run, NEW_Y, NEW_SAMPLE_NAMES).

    NOTE(review): the original indentation was lost.  The whole body is
    placed under ``if GROUPS == 2:`` and ``sys.exit()`` directly after the
    pairwise debug table; with that layout the pairing code after
    ``sys.exit()`` is never reached.  Confirm against the original file.
    """
    # LT / SC / STD are only referenced by the commented-out call below.
    LT = ('log' in self.options.notes)
    SC = ('scale' in self.options.notes)
    STD = ('std' in self.options.notes)
    DIMS=2
    if GROUPS == 2:
        # PCA on Y as given.
        dr = rage_DR.DR(self.R.args,self.R.progress)
        #dr.set_y_matrix(Y, LOG_TRANSFORM=LT,SCALE=SC,STD_SCALE=STD)
        dr.set_y_matrix(Y, CENTER=False,SCALE=True)
        pca_run = dr.pca(req='FULL')
        pca_pts = pca_run['pts']
        # feature name -> list of (position j within its component list, vl)
        coeff_key = dd(list)
        for i,C in enumerate(pca_run['coefs']):
            for j,(vs,vl,vi) in enumerate(C):
                coeff_key[self.R.data.features[vi].name].append((j,vl))
        # Per column: the positive values with their feature names, largest first.
        S_CNTS = [sorted([(Y[i][j],self.R.data.features[i].name) for i in range(len(Y)) if Y[i][j] > 0],reverse=True) for j in range(len(S_NAMES))]
        S_OBS = {S_NAMES[j]: len(S_CNTS[j]) for j in range(len(S_NAMES))}
        S_MED = np.median(S_OBS.values())
        S_25 = np.percentile(S_OBS.values(),25)
        # At least 100, or the smallest per-column count if that is larger.
        P_LEN = max(min([len(sc) for sc in S_CNTS]),100)
        S_PROJECTIONS = dd(lambda: dd(list))
        for i,C in enumerate(S_CNTS):
            for v,f in C[0:P_LEN]:
                for j in range(len(coeff_key[f])):
                    S_PROJECTIONS[S_NAMES[i]][j].append(v*coeff_key[f][j][1])
        # Collapse each per-dimension list to its mean (first DIMS dimensions).
        S_PROJECTIONS = {s_name: [np.mean(S_PROJECTIONS[s_name][j]) for j in range(DIMS)] for s_name in S_PROJECTIONS.keys()}
        # Second PCA, this time with TRANSPOSE=True.
        dr = rage_DR.DR(self.R.args,self.R.progress)
        #dr.set_y_matrix(Y, TRANSPOSE = True, SCALE=True)
        dr.set_y_matrix(Y, TRANSPOSE=True,CENTER=False,SCALE=True)
        pca_tran = dr.pca(req='FULL')
        # column name -> list of (position j within its component list, vl)
        s_key = dd(list)
        for i,C in enumerate(pca_tran['coefs']):
            for j,(vs,vl,vi) in enumerate(C):
                s_key[S_NAMES[vi]].append((j,vl))
        # Debug table: for every pair of columns, print three coordinate sets
        # and the Euclidean distance between the pair in each of them.
        print DIMS
        # print '--- name2 idx1 idx2 obs1 obs2 | aCoef bCoef'
        print '--- idx1 obs1 | aCoef aPCA aProj',
        print '| idx2 obs2 | bCoef bPCA bProj'
        for i in range(len(S_NAMES)):
            aName = S_NAMES[i]
            aCoefs,aPCA, aProj,aObs = [x[1] for x in s_key[aName]][0:DIMS], pca_pts[i][0:DIMS], S_PROJECTIONS[aName], S_OBS[aName]
            for j in range(i+1,len(S_NAMES)):
                print aName,i,aObs,'|',
                print ",".join([str(round(x,3)) for x in aCoefs]),
                print ",".join([str(round(x,3)) for x in aPCA]),
                print ",".join([str(round(x,3)) for x in aProj]),'|',
                bName = S_NAMES[j]
                bCoefs,bPCA, bProj,bObs = [x[1] for x in s_key[bName]][0:DIMS], pca_pts[j][0:DIMS], S_PROJECTIONS[bName], S_OBS[bName]
                print bName,j,bObs,'|',
                print ",".join([str(round(x,3)) for x in bCoefs]),
                print ",".join([str(round(x,3)) for x in bPCA]),
                print ",".join([str(round(x,3)) for x in bProj]),
                coefD = np.linalg.norm(np.array(aCoefs) - np.array(bCoefs))
                coefP = np.linalg.norm(np.array(aPCA) - np.array(bPCA))
                coefPr = np.linalg.norm(np.array(aProj) - np.array(bProj))
                print '|', coefD, coefP,coefPr
        # Terminates the process; see the NOTE(review) in the docstring.
        sys.exit()
        # Order candidates by the value of their first s_key entry.
        if TAIL:
            s_order = sorted([(s_key[s][0][1],s) for s in s_key.keys()])
        else:
            s_order = sorted([(s_key[s][0][1],s) for s in s_key.keys()],reverse=True)
        # SCAN caps how many merge partners are collected per candidate.
        PAIRS, S_STOP, FOUND,SCAN, ADDED = [], len(s_order) * SELECTION_FRACTION, dd(bool) , 10, 0
        for i in range(len(s_order)):
            i_val, i_name = s_order[i]
            if LOWER and S_OBS[i_name] > S_MED: continue
            elif LOWEST and S_OBS[i_name] > S_25: continue
            i_size, i_loc,i_dists, j = len(i_name.split('@')), np.array(S_PROJECTIONS[i_name] ),[], i + 1
            # FOUND is keyed both by position in s_order and by name.
            if FOUND[i] or i_size >= MAX_SIZE: continue
            while j < len(s_order):
                if not FOUND[j] and (i_size+len(s_order[j][1].split('@'))) <= MAX_SIZE:
                    j_val, j_name = s_order[j]
                    i_dists.append((np.linalg.norm(i_loc - np.array(S_PROJECTIONS[j_name])),j_name,j))
                    if len(i_dists) >= SCAN: break
                j+=1
            if len(i_dists) > 0:
                # Pair with the closest collected partner.
                dist,j_name,j_idx = sorted(i_dists)[0]
                PAIRS.append((i_name,j_name))
                FOUND[j_idx], FOUND[i_name], FOUND[j_name] = True, True, True
                ADDED +=1
                if ADDED > S_STOP: break
        # New column name -> indices of the old columns it is built from.
        NEW_Y, NEW_IDX, IDX_KEY = [], {}, {s: i for i,s in enumerate(S_NAMES)}
        for a,b in PAIRS: NEW_IDX[a+"@"+b] = [IDX_KEY[a],IDX_KEY[b]]
        # Names never stored in FOUND were not paired and carry over unchanged.
        for s,i in [(s,i) for (s,i) in IDX_KEY.items() if s not in FOUND]: NEW_IDX[s] = [i]
        NEW_SAMPLE_NAMES = NEW_IDX.keys()
        #for y in Y: NEW_Y.append([max([y[j] for j in NEW_IDX[ns]]) for ns in NEW_SAMPLE_NAMES])
        #for y in Y: NEW_Y.append([sum([y[j] for j in NEW_IDX[ns]]) for ns in NEW_SAMPLE_NAMES])
        # Each new column is the mean of the old columns it is built from.
        for y in Y: NEW_Y.append([np.mean([y[j] for j in NEW_IDX[ns]]) for ns in NEW_SAMPLE_NAMES])
        return pca_run, NEW_Y, NEW_SAMPLE_NAMES
def run_iterative_pca(self,R,GROUPS=2):
    """Run sample-wise and feature-wise PCA, write their tables and plot them.

    Stores ``self.R``, ``self.D``, ``self.coeff_key`` and ``self.s_key``.
    The ``GROUPS`` argument is overwritten with 2 below, so the iterative
    merging section guarded by ``GROUPS == 200`` does not run.

    NOTE(review): the original indentation was lost; the guarded section is
    assumed to extend through ``plt.savefig('PCA_ITERATIVE.pdf')`` because
    every statement up to there depends on ``iterplot`` or on results of
    ``self.iterative_run``.
    """
    self.R = R
    D, F, S = R.data , R.data.features, R.data.samples
    # One row per feature, one column per sample.
    Y = [[s.cnts[f.idx] for s in D.samples] for f in D.features]
    self.D = R.data
    ### PARAMS ###
    GROUPS = 2
    SELECTION_FRACTION = 0.24
    dr = rage_DR.DR(R.args,R.progress)
    out_name = R.args.prefix+'_transformsamples_'
    R.progress.start_minor('PCA')
    # Transform switches read from the option notes.
    LT = ('log' in self.options.notes)
    SC = ('scale' in self.options.notes)
    STD = ('std' in self.options.notes)
    if GROUPS == 200:
        # Disabled: four chained iterative_run rounds, each fed the merged
        # matrix and names of the previous round and added to one iterplot.
        iterplot = dplot.iterplot(self.R.data.samples,self.R.data.features,R.args,R.progress,6)
        #sc = MinMaxScaler()
        #Y = [[x[0] for x in sc.fit_transform(np.array(y).reshape(-1,1))] for y in Y]
        # dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(R.args.prefix+'_transformsamples_sample_pca')
        PCA_1, Y_1, S_1 = self.iterative_run(Y,[s.name for s in S],iteration=1,SELECTION_FRACTION=0.02,MAX_SIZE=2,DIMS=1) #LOWEST=True)
        # Print the merged ('@'-joined) names produced by each round.
        for s in [s for s in S_1 if len(s.split('@')) > 1]: print 1,s
        iterplot.add_data(PCA_1, [s.name for s in self.R.data.samples])
        PCA_2, Y_2, S_2 = self.iterative_run(Y_1,S_1,iteration=2,SELECTION_FRACTION=0.05,MAX_SIZE=2,DIMS=1,LOWER=True)
        for s in [s for s in S_2 if len(s.split('@')) > 1]: print 2,s
        iterplot.add_data(PCA_2, S_1)
        PCA_3, Y_3, S_3 = self.iterative_run(Y_2,S_2,iteration=2,SELECTION_FRACTION=0.025,MAX_SIZE=3,DIMS=1,LOWEST=True)
        for s in [s for s in S_3 if len(s.split('@')) > 1]: print 3,s
        iterplot.add_data(PCA_3, S_2)
        PCA_4, Y_4, S_4 = self.iterative_run(Y_3,S_3,iteration=2,SELECTION_FRACTION=0.50,MAX_SIZE=2,DIMS=1,TAIL=False)
        for s in [s for s in S_4 if len(s.split('@')) > 1]: print 4,s
        iterplot.add_data(PCA_4, S_3)
        # PCA_5, Y_5, S_5 = self.iterative_run(Y_4,S_4,iteration=2,SELECTION_FRACTION=0.12,MAX_SIZE=3,DIMS=1,TAIL=True,LOWER=True)
        # for s in [s for s in S_5 if len(s.split('@')) > 1]: print 5,s
        # iterplot.add_data(PCA_5, S_4)
        # PCA_6, Y_6, S_6 = self.iterative_run(Y_5,S_5,iteration=2,SELECTION_FRACTION=0.40,MAX_SIZE=4,DIMS=2,TAIL=False)
        # for s in [s for s in S_6 if len(s.split('@')) > 1]: print 6,s
        # iterplot.add_data(PCA_6, S_5)
        plt.savefig('PCA_ITERATIVE.pdf')
    #print cn, pn, np.linalg.norm(c_pts-p_pts)
    #dimplot = dplot.dimplot(1,1,R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(2,3)],NAMES=True).finish(out_name+'named.pca')
    #dimplot = dplot.dimplot(1,1,R.args,R.progress).add_data(R.data.samples,[pca_run],dim_comps=[(0,1),(1,2),(3,4),(5,6)],NAMES=True).finish(out_name+'named.pca')
    #dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(out_name+'Cpca')
    # --- PCA on Y with the transforms selected in the option notes ---
    dr.set_y_matrix(Y, LOG_TRANSFORM=LT,SCALE=SC,STD_SCALE=STD)
    pca_run = dr.pca(req='FULL')
    self.coeff_key, sample_data = self.write_pca_data(pca_run, S, F, R.args.prefix+'_sampletransform_')
    dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples, [pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)],NAMES=False).finish(R.args.prefix+'_transformsamples_sample_pca')
    # Replace the run's points with the per-feature values from coeff_key so
    # the same run dict can be plotted with features as the members.
    s_pts = [[p[1] for p in self.coeff_key[f.name]] for f in R.data.features]
    pca_run['pts'] = s_pts
    dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.features,[pca_run],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=True).finish(R.args.prefix+'_transformsamples_feature_pca')
    # --- PCA on Y with TRANSPOSE=True ---
    dr.set_y_matrix(Y, TRANSPOSE = True, SCALE=True)
    pca_tran = dr.pca(req='FULL')
    self.s_key, feature_data = self.write_pca_data(pca_tran, F,S, R.args.prefix+'_featuretransform_')
    dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.features,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=True).finish(R.args.prefix+'_transformfeatures_feature_pca')
    # Same point swap as above, now with the per-sample values from s_key.
    s_pts = [[p[1] for p in self.s_key[s.name]] for s in R.data.samples]
    pca_tran['pts'] = s_pts
    dimplot = dplot.dimplot(R.args,R.progress).add_data(R.data.samples,[pca_tran],dim_comps=[(0,1),(2,3),(4,5),(6,7)], NAMES=False).finish(R.args.prefix+'_transformfeatures_samples_pca')
def run_pca(self,R):
    """Run PCA and t-SNE on the sample counts; write tables and plots.

    Sets ``self.LT``, ``self.SC``, ``self.STD`` from ``self.options.notes``
    and derives ``self.out_name`` / ``self.plt_name`` from the prefix, the
    colour/marker options and the selected transforms.  Also stores
    ``self.D``, ``self.S`` and ``self.Y`` (one row per feature, one column
    per sample).

    If ``self.options.coeffs`` is truthy the work is delegated to
    ``self.precomp_pca`` and nothing else happens here.  Otherwise three
    tables are written, ``<out_name>pca_coefs.out``, ``<out_name>pca_pts.out``
    and ``<out_name>tsne_pts.out``, and three dimplots are finished.

    The points table reads ``p[0]``..``p[4]`` of every PCA point, so the run
    must provide at least five components.
    """
    self.LT, self.SC, self.STD = False, False, False
    ttype, cstr = '_', '_'
    if len(self.options.color) > 0:
        cstr += '-'.join(self.options.color)
    if self.options.marker is not None:
        cstr += '-' + self.options.marker
    if 'log' in self.options.notes:
        self.LT = True
        ttype += 'LOG_'
    # 'scale' takes precedence over 'std' when both notes are present.
    if 'scale' in self.options.notes:
        self.SC = True
        ttype += 'SCALE_'
    elif 'std' in self.options.notes:
        self.STD = True
        ttype += 'STD_'
    if len(ttype) < 3:
        # No transform note matched.
        ttype = '_RAW'
    self.out_name = R.args.prefix + '_' + cstr + '_transformsamples' + ttype
    self.plt_name = R.args.prefix + '_' + cstr + '_transformsamples' + ttype
    self.D = R.data
    self.S = self.D.samples
    self.Y = [[s.cnts[f.idx] for s in self.D.samples]
              for f in self.D.features]

    if self.options.coeffs:
        self.precomp_pca(R, self.options.coeffs)
        return

    dr = rage_DR.DR(R.args, R.progress)
    R.progress.start_minor('PCA')
    dr.set_y_matrix(self.Y, LOG_TRANSFORM=self.LT, SCALE=self.SC,
                    STD_SCALE=self.STD)
    pca_run = dr.pca(req='FULL')

    # feature name -> list of (position within its component list, vl)
    F_key = dd(list)
    for C in pca_run['coefs']:
        for j, (vs, vl, vi) in enumerate(C):
            F_key[self.D.features[vi].name].append((j, vl))

    # 'with' closes each table even if a write fails part-way through.
    with open(self.out_name + 'pca_coefs.out', 'w') as w:
        w.write("%-50s %5s %10s %5s %10s %5s %10s\n" %
                ('---', 'R1', 'V1', 'R2', 'V2', 'R3', 'V3'))
        for k, C in F_key.items():
            w.write("%-50s" % (k,))
            for rank, value in C:
                w.write(" %5d %10f" % (rank, value))
            w.write('\n')

    with open(self.out_name + 'pca_pts.out', 'w') as w:
        # Header previously used '%10ss', which printed the column as 'PC3s'.
        w.write("%-50s %10s %10s %10s %10s %10s \n" %
                ('---', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5'))
        for p, s in zip(pca_run['pts'], R.data.samples):
            w.write("%-50s %10f %10f %10f %10f %10f\n" %
                    (s.name, p[0], p[1], p[2], p[3], p[4]))

    dplot.dimplot(R.args, R.progress).add_data(
        R.data.samples, [pca_run],
        dim_comps=[(0, 1), (2, 3), (4, 5), (6, 7)],
        NAMES=False).finish(self.plt_name + '_sample_pca')
    dplot.dimplot(R.args, R.progress).add_data(
        R.data.samples, [pca_run],
        dim_comps=[(8, 9), (10, 11), (12, 13), (14, 15)],
        NAMES=False).finish(self.plt_name + '_sample_hipca')
    R.progress.end()

    tsne_run = dr.tsne()
    with open(self.out_name + 'tsne_pts.out', 'w') as w:
        w.write("%-50s %10s %10s \n" % ('---', 'TS1', 'TS2'))
        for p, s in zip(tsne_run['pts'], R.data.samples):
            w.write("%-50s %10f %10f \n" % (s.name, p[0], p[1]))

    dplot.dimplot(R.args, R.progress).add_data(
        R.data.samples, [tsne_run], dim_comps=[(0, 1)],
        NAMES=False).finish(self.plt_name + '_sample_tsne')
    R.progress.end()
def precomp_pca(self,R,coeffs,PLEN=1500,MAX_COEFS=8):
    """Project the samples onto precomputed PCA coefficients and plot them.

    ``coeffs`` is iterated line by line.  Each line is whitespace-split;
    lines whose first field is '---' are skipped, the first field is a
    feature name, and every second field from index 2 onwards is parsed as
    that feature's coefficient for components 0, 1, 2, ...  Features of
    ``self.D`` missing from the file get coefficient 0.0 everywhere.

    For every sample in ``self.S`` four point sets are built (keys 'LOGDOT',
    'RAWDOT', 'LOGPRJ', 'RAWPRJ'), written to
    ``<plt_name>_<key>_pca_proj.pts``, plotted, embedded with t-SNE, written
    to ``<plt_name>_<key>_tsne_proj.pts`` and plotted again.

    ``self.plt_name`` is extended in place with '_projected_<prj_len>'.
    ``MAX_COEFS`` is accepted but not read.  The method ends the process
    with ``sys.exit()``.
    """
    coeff_key = dd(lambda: {})
    projection_key = dd(lambda: {})

    for line in coeffs:
        fields = line.split()
        # Skip blank lines (previously an IndexError) and the header row.
        if not fields or fields[0] == '---':
            continue
        for i in range(2, len(fields), 2):
            # '//' keeps the component index an integer on every Python version.
            coeff_key[(i // 2) - 1][fields[0]] = float(fields[i])
            if i >= 40:
                break

    for f in self.D.features:
        if f.name not in coeff_key[0]:
            for i in coeff_key.keys():
                coeff_key[i][f.name] = 0.0

    n_comps = len(coeff_key)
    for s in self.S:
        # [count, feature name, coefficient per component], largest count first.
        projection_key[s] = sorted(
            [[c, self.D.features[i].name,
              [coeff_key[n][self.D.features[i].name] for n in range(n_comps)]]
             for i, c in s.cnts.items()],
            reverse=True)

    # At least PLEN, or the smallest per-sample entry count if that is larger.
    prj_len = max(min([len(X) for X in projection_key.values()]), PLEN)
    self.plt_name += '_projected_' + str(prj_len)

    pca_key = dd(list)
    for s in self.S:
        LK_DATA = [[pk[2][n] * log(pk[0], 2) for pk in projection_key[s]]
                   for n in range(n_comps)]
        # NOTE(review): the 'raw' data is computed with exactly the same
        # log2 weighting as the 'log' data, so RAW* and LOG* outputs are
        # identical.  Looks like a copy-paste slip; left unchanged until the
        # intended raw weighting is confirmed.
        RK_DATA = [[pk[2][n] * log(pk[0], 2) for pk in projection_key[s]]
                   for n in range(n_comps)]
        RK_DOT = [sum(rk) for rk in RK_DATA]
        LK_DOT = [sum(lk) for lk in LK_DATA]
        RK_PROJ = [sum(rk[0:prj_len]) for rk in RK_DATA]
        LK_PROJ = [sum(lk[0:prj_len]) for lk in LK_DATA]
        pca_key['LOGDOT'].append(LK_DOT)
        pca_key['RAWDOT'].append(RK_DOT)
        pca_key['LOGPRJ'].append(LK_PROJ)
        pca_key['RAWPRJ'].append(RK_PROJ)

    rawdot = {'pts': pca_key['RAWDOT'],
              'axes': ['PC' + str(x + 1) + '-RAWDOT' for x in range(n_comps)]}
    logdot = {'pts': pca_key['LOGDOT'],
              'axes': ['PC' + str(x + 1) + '-LOGDOT' for x in range(n_comps)]}
    rawprj = {'pts': pca_key['RAWPRJ'],
              'axes': ['PC' + str(x + 1) + '-RAWPRJ' for x in range(n_comps)]}
    logprj = {'pts': pca_key['LOGPRJ'],
              'axes': ['PC' + str(x + 1) + '-LOGPRJ' for x in range(n_comps)]}

    for kp, kpts in pca_key.items():
        w_name = self.plt_name + '_' + kp + '_pca_proj.pts'
        # 'with' closes the table even if a write fails part-way through.
        with open(w_name, 'w') as w:
            # Header previously used '%10ss', which printed 'PC3s'.
            w.write("%-50s %10s %10s %10s %10s %10s \n" %
                    ('---', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5'))
            # kpts holds one point per sample, appended in self.S order.
            for s, p in zip(self.S, kpts):
                w.write("%-50s %10f %10f %10f %10f %10f\n" %
                        (s.name, p[0], p[1], p[2], p[3], p[4]))

    pca_runs = [rawdot, logdot, rawprj, logprj]
    for dc in [(0, 1), (2, 3)]:
        p_name = self.plt_name + '_' + '-'.join([str(ss) for ss in dc]) + '_'
        dplot.dimplot(R.args, R.progress).add_data(
            self.S, pca_runs, dim_comps=[dc, dc, dc, dc],
            NAMES=False).finish(p_name + 'pca')
        dplot.dimplot(R.args, R.progress).add_data(
            self.S, pca_runs, dim_comps=[dc, dc, dc, dc],
            NAMEOUTLIERS=True).finish(p_name + 'exnamed_pca')

    dr = rage_DR.DR(R.args, R.progress)
    tsne_key = {k: dr.tsne(pca_pts=vals, axes_prefix=k)
                for k, vals in pca_key.items()}
    t_runs = [tsne_key['RAWDOT'], tsne_key['LOGDOT'],
              tsne_key['RAWPRJ'], tsne_key['LOGPRJ']]

    for kp, kpts in tsne_key.items():
        w_name = self.plt_name + '_' + kp + '_tsne_proj.pts'
        with open(w_name, 'w') as w:
            w.write("%-50s %10s %10s\n" % ('---', 'TSNE1', 'TSNE2'))
            for si, p in enumerate(kpts['pts']):
                s = self.S[si]
                w.write("%-50s %10f %10f\n" % (s.name, p[0], p[1]))

    tsne_comps = [(0, 1), (0, 1), (0, 1), (0, 1)]
    dplot.dimplot(R.args, R.progress).add_data(
        self.S, t_runs, dim_comps=tsne_comps,
        NAMES=False).finish(self.plt_name + '_tsne')
    dplot.dimplot(R.args, R.progress).add_data(
        self.S, t_runs, dim_comps=tsne_comps,
        NAMEOUTLIERS=True).finish(self.plt_name + '_exnamed_tsne')
    R.progress.end()
    sys.exit()
def eval_predictors(self):
    """Evaluate each predictor in turn with a regression plus permutations.

    For every predictor in ``self.rage.args.predictors`` this deep-copies the
    data, filters and normalizes it, fits a regression, runs a 'brief' PCA on
    the data matrix and on the transposed residuals, then refits
    ``self.options.simulations`` times with that predictor permuted.  The
    results go to a ``predictor_output`` and to one row of a shared
    ``predictor_plot``.  The method saves the plot and then ends the process
    with ``sys.exit()``.

    Overwrites ``self.D``, ``self.V``, ``self.Y``, ``self.X`` and
    ``self.options.color`` on every iteration.

    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from which names each statement uses.
    """
    dim = rage_DR.DR(self.options, self.progress)  #.set_fit_matrix(self.D.matrix('log'))
    predictor_plot = rage_regression_plots.predictor_plot(
        self.options, len(self.rage.args.predictors))
    # reg_out = rage_outputs.regression_output(self.options,M_full)
    # pred_out = rage_outputs.predictor_output(self.options)
    for p in self.rage.args.predictors:
        # Work on a private copy because the filtering below is applied in place.
        self.D = copy.deepcopy(self.rage.data)
        self.D.rage.progress.reset()
        #self.D.filter_samples_by_attributes([p],[]).normalize()
        #self.V = self.D.set_sample_variables([p])
        self.D.filter_samples_by_attributes(
            [p], self.options.covariates).normalize()
        self.V = self.D.set_sample_variables([p], self.options.covariates)
        # One row per feature, one column per sample.
        self.Y = [[s.cnts[f.idx] for s in self.D.samples]
                  for f in self.D.features]
        self.X = self.V.select_variables(self.V.variables)
        self.options.color = [p]
        self.D.samples.create_plot_labels(self.options)
        self.progress.start_minor('Running Predictor Regression: ' + p,
                                  len(self.D.features), False)
        self.progress.mark()
        M = rrm.RegModel(self.options, self.X,
                         True).run(self.Y).aggregate(True)
        pca_init = dim.pca(self.D.matrix(), req='brief')  #['total_var']
        pca_resid = dim.pca(np.matrix(M.out['resids']).getT(),
                            req='brief')  #@['total_var']
        sims = dd(list)
        SUMMARIZE = True
        # NOTE: preds is not read again in this method.
        preds = [sp for sp in M.pv_dict.keys() if sp != 'intercept']
        # gt_key[i][stat]: how many simulations beat the real model for
        # feature i.  best_key[i][stat]: the most extreme simulated value.
        gt_key = dd(lambda: dd(int))
        best_key = dd(lambda: dd(float))
        for n in range(self.options.simulations):
            self.progress.start_minor('Running Simulation ' + str(n + 1),
                                      False)
            # Refit with the predictor p permuted.
            Ms = rrm.RegModel(
                self.options,
                self.V.select_variables(self.V.variables,
                                        permute=[p])).run(
                                            self.Y).aggregate(True)
            sims['var'].append(
                dim.pca(np.matrix(Ms.out['resids']).getT(),
                        req='brief')['total_var'])
            sims['pv'].append(Ms.pv_cnt)
            sims['rs'].append(Ms.rs_cnt)
            if SUMMARIZE:
                for i, (f, Yi) in enumerate(zip(self.D.features, self.Y)):
                    rsq, rsa = Ms.out['rsq'][i], Ms.out['rsa'][i]
                    if rsq > M.out['rsq'][i]: gt_key[i]['rsq'] += 1
                    if rsa > M.out['rsa'][i]: gt_key[i]['rsa'] += 1
                    if rsa > best_key[i]['rsa']: best_key[i]['rsa'] = rsa
                    if rsq > best_key[i]['rsq']: best_key[i]['rsq'] = rsq
                    for sp in M.pv_dict.keys():
                        spV = Ms.pv_dict[sp][i]
                        if spV < M.pv_dict[sp][i]: gt_key[i][sp] += 1
                        # NOTE(review): best_key defaults to 0.0, so this
                        # only fires for values below zero -- presumably the
                        # smallest simulated p-value was meant; confirm.
                        if spV < best_key[i][sp]: best_key[i][sp] = spV
        pred_out = rage_outputs.predictor_output(self.options, p, M,
                                                 self.D.features, self.Y)
        pred_out.add_sim_keys(gt_key, best_key, self.options.simulations)
        predictor_plot.add_predictor_row(p, self.D.samples, pca_init,
                                         pca_resid, M, sims)
        self.progress.end()
    predictor_plot.save(self.options.prefix + "-predictorplot-" +
                        "_".join(self.rage.args.predictors) + '-cov-' +
                        '_'.join(self.rage.args.covariates))
    sys.exit()
def evaluate_model(self):
    """Fit, simulate and plot the regression model for every distribution.

    For each entry of ``self.dists`` this fits the full model (``self.X``)
    and the covariate-only model (``self.Xc``), runs a 'brief' PCA on the
    input and on both residual sets, refits ``self.options.simulations``
    times with the predictors permuted, then builds and saves a model plot
    and writes the simulation summary.

    NOTE(review): the original indentation was lost.  Plotting is placed
    inside the distribution loop because ``mplot.save`` takes ``dist``; the
    final ``self.progress.end()`` is placed after the loop -- confirm.
    """
    self.progress.start_minor('Running Model Regressions',
                              len(self.D.features), False)
    for dist in self.dists:
        M = rrm.RegModel(self.X, dist, self.options, self.progress,
                         True).run(self.Y,
                                   self.feature_names).aggregate(True)
        M_resids, C_resids = M.get_resids()
        # Covariate-only model, used below for the variance-explained baseline.
        Mc = rrm.RegModel(self.Xc, dist, self.options).run(
            self.Y, self.feature_names).aggregate(True)
        sims = dd(list)
        self.progress.start_minor('Running Model PCA', len(self.D.features),
                                  False)
        dim = rage_DR.DR(
            self.options,
            self.progress)  #.set_fit_matrix(self.D.matrix('log'))
        pca_init = dim.set_y_matrix(self.Y, LOG_TRANSFORM=True,
                                    SCALE=True).pca(req='brief')
        # Residuals are log-transformed unless the distribution name already
        # ends in 'LOG'.
        pca_c_resid = dim.set_y_matrix(C_resids,
                                       LOG_TRANSFORM=dist[-3::] != 'LOG',
                                       SCALE=True).pca(req='brief')
        pca_resid = dim.set_y_matrix(M_resids,
                                     LOG_TRANSFORM=dist[-3::] != 'LOG',
                                     SCALE=True).pca(req='brief')
        for n in range(self.options.simulations):
            self.progress.start_major('Running Simulation ' + str(n + 1),
                                      False)
            # Permute the predictors and copy zp over from the real design.
            Xs = self.V.select_variables(self.V.variables,
                                         permute=self.V.predictors)
            Xs.zp = self.X.zp
            Ms = rrm.RegModel(Xs, dist, self.options, self.progress).run(
                self.Y, self.feature_names).aggregate()
            S_out = rage_regression_outputs.eval_output(
                self.options).write(Ms, self.feature_names, n + 1)
            sims['pv'].append(Ms.pv_cnt)
            sims['rs'].append(Ms.rs_cnt)
            sims['v_exp'].append(np.mean(Ms.out['v_exp']))
        self.progress.start_minor('Plotting Results ', 100, False)
        mplot = rage_regression_plots.model_plot(self.D.samples, self.X,
                                                 self.options, 3, 2, {
                                                     'p_key': M.pv_key,
                                                     'r_key': M.rs_key
                                                 })
        # total_var order: covariate-only model, full model, simulation mean.
        mplot.add_model_table(M,
                              total_var=[
                                  np.mean(Mc.out['v_exp']),
                                  np.mean(M.out['v_exp']),
                                  np.mean(sims['v_exp'])
                              ]).update()
        mplot.add_predictor_table(M, self.X, self.options, {
            'sim_pvs': sims['pv']
        }).update()
        mplot.add_rs_bars(M.rs_cnt, self.options, sims['rs']).update({
            'title':
            '$' + "\ ".join(self.V.predictors) + '$ ' + '$\ R^2\ Values$'
        })
        mplot.add_pv_bars(M.pv_cnt, self.options, sims['pv']).update({
            'title':
            '$' + "\ ".join(self.V.predictors) + '$ ' + '$\ P\ \ Values$'
        })
        mplot.add_pca_pts(pca_init, {
            'colspan': 2
        }).update({
            'title': 'PCA Initial Values',
            'yadd': 2,
            'colspan': 2
        })
        mplot.add_pca_pts(pca_c_resid, {
            'colspan': 2
        }).update({
            'title': 'PCA Covariate Residuals',
            'yadd': 2,
            'colspan': 2
        })
        mplot.add_pca_pts(pca_resid, {
            'colspan': 2
        }).update({
            'title': 'PCA Model Residuals',
            'yadd': 2,
            'colspan': 2
        })
        mplot.save(dist, self.options.predictors, self.options.covariates)
        rage_regression_outputs.reg_simulate(self.options).write(
            M.pv_cnt, sims['pv'], self.options.simulations,
            self.V.predictors)
    self.progress.end()
def run(self):
    """Dispatch on ``R.args.command``: 'samples', 'features' or 'ratios'.

    'samples' and 'features' create plot labels, optionally run PCA (and
    t-SNE when ``R.args.tsne`` is set) with scatter plots and coefficient /
    point tables, and then compute summary histograms, statistics and trends.
    'ratios' builds feature ratios and calls ``predict_known_ratio_values``.
    Any other command does nothing.

    NOTE(review): the original indentation was lost; statements that read
    ``pca`` are placed inside the ``if R.args.pca or R.args.tsne:`` blocks.
    """
    R = self.rage
    # R.data.filter_samples_by_attributes().normalize()
    if R.args.command == 'samples':
        R.progress.start_major('SampleSummary')
        R.data.samples.create_plot_labels(R.args)
        if R.args.pca or R.args.tsne:
            R.progress.start_minor('Performing Dimensional Reduction',
                                   len(R.data.samples))
            dim = rage_DR.DR(R.args, R.progress).set_fit_matrix(
                R.data.matrix('log'))
            # PCA is always computed, even when only t-SNE was requested.
            pca = dim.pca()
            if R.args.tsne:
                dim_plot = rage_scatterplots.DimR(
                    R.args, R.progress, 1,
                    2).add_dim_run(pca, R.data.samples).add_dim_run(
                        dim.tsne(), R.data.samples).save()
            else:
                dim_plot = rage_scatterplots.DimR(
                    R.args,
                    R.progress).add_dim_run(pca, R.data.samples).save()
            rage_outputs.column_coefs(R.args).write(
                pca['coefs'], R.data.features, {
                    'suffix': 'PCAcoeffs.features.out',
                    'width': 15
                })
            rage_outputs.dr_pts(R.args).write(pca['pts'], R.data.samples,
                                              {'suffix': 'pca.pts.out'})
        R.progress.start_minor('Calculating Summary Stats',
                               len(R.data.samples))
        sample_stats = summary_hists(R.data.samples, R.data.features, R.args,
                                     R.progress)
        rage_outputs.column_stats(R.args).write(
            sample_stats, R.data.samples, {
                'suffix': 'samplestats',
                'width': 15
            })
        sample_trends = summary_trends(R.data.samples, R.data.features,
                                       R.args, R.progress)
    elif R.args.command == 'features':
        R.progress.start_major('FeatureSummary')
        R.data.features.create_plot_labels(R.args)
        if R.args.pca or R.args.tsne:
            R.progress.start_minor('Performing Dimensional Reduction',
                                   len(R.data.features))
            # Same reduction as the 'samples' branch, on the transposed matrix.
            dim = rage_DR.DR(R.args, R.progress).set_fit_matrix(
                R.data.matrix('log', TRANSPOSE=True))
            pca = dim.pca()
            if R.args.tsne:
                dim_plot = rage_scatterplots.DimR(
                    R.args, R.progress, 1,
                    2).add_dim_run(pca, R.data.features).add_dim_run(
                        dim.tsne(), R.data.features).save()
            else:
                dim_plot = rage_scatterplots.DimR(
                    R.args,
                    R.progress).add_dim_run(pca, R.data.features).save()
            # NOTE(review): the two writes below pass the same member lists
            # and suffixes as the 'samples' branch although the matrix is
            # transposed here.  Presumably the coefficients belong to the
            # samples and the points to the features; verify against the
            # rage_outputs writers before changing.
            rage_outputs.column_coefs(R.args).write(
                pca['coefs'], R.data.features, {
                    'suffix': 'PCAcoeffs.features.out',
                    'width': 15
                })
            rage_outputs.dr_pts(R.args).write(pca['pts'], R.data.samples,
                                              {'suffix': 'pca.pts.out'})
        # NOTE(review): the progress length uses the sample count in the
        # features branch as well -- confirm this is intended.
        R.progress.start_minor('Calculating Summary Stats',
                               len(R.data.samples))
        feature_stats = summary_hists(R.data.features, R.data.samples,
                                      R.args, R.progress)
        rage_outputs.column_stats(R.args).write(
            feature_stats, R.data.features, {
                'suffix': 'featurestats.out',
                'width': 15
            })
        feature_trends = summary_trends(R.data.features, R.data.samples,
                                        R.args, R.progress)
    elif R.args.command == 'ratios':
        feature_comps = rage_comps.features(self.rage).get_f_ratios()
        # HOUSEKEEPING and r_key are read here but not used again below.
        HOUSEKEEPING, r_key = feature_comps.HOUSEKEEPING, feature_comps.r_key
        feature_comps.predict_known_ratio_values()
def summary_dists(X, Y, options, progress, X_NAME='SAMPLES'):
    """Plot per-member value distributions, split by a k-means label.

    Draws a global density from the log of (1 + total counts) of each item
    in ``Y``, builds two scaled log-value lists for every item of ``X``, runs
    PCA and k-means on the ``simsample_items`` output of the first list, and
    then plots density curves for pairs of members taken from k-means labels
    0 and 1.  The figure is saved as
    ``options.prefix + 'fig_dists' + str(f_num) + '.png'``.

    ``X_NAME`` is not read in this function.

    NOTE(review): the original indentation was lost; the nesting below is a
    reconstruction.
    """
    seaborn.set(rc={
        'axes.facecolor': 'lightpink',
        'figure.facecolor': 'lightgray'
    })
    progress.start_major('Plotting Distribution Densities', len(X))
    kde = rage_KDE.samples(0.3)
    f_num, subplot = 1, rage_subplots.subplot(6, 2, options)
    # NOTE: LOG is not read again in this function.
    LOG = True
    dr = rage_DR.DR(options, progress)
    y_vals = scale_vals([log((1.0 + sum(y.cnts.values()))) for y in Y])
    x1, y1 = kde.run(y_vals)
    subplot.add_lines(x1, y1, None, None,
                      'black').update({'title': 'Global Distribution'})
    iter_data = []
    for x in X:
        progress.mark()
        # Observed values plus a single 0.0 ...
        non_zeros = scale_vals([log(v + 1.0) for v in x.cnts.values()] +
                               [0.0])
        # ... versus observed values padded with zeros; the pad length is
        # derived from Y.len.
        all_vals = scale_vals([0 for v in range(Y.len -
                                                (1 + len(non_zeros)))] +
                              [log(v + 1.0) for v in x.cnts.values()])
        nz = simsample_items(non_zeros)
        sz = simsample_items(all_vals)
        x.notes['iter'] = [non_zeros, all_vals]
        iter_data.append([non_zeros, all_vals, nz, sz])
    # Only the simsample_items output of the non-zero list feeds PCA / k-means.
    r_matrix = np.matrix([it[2] for it in iter_data])
    pca_run = dr.run_pca(r_matrix)
    kmean_run = dr.run_kmeans(r_matrix)
    subplot.add_pca_data(pca_run['pts'],
                         {'title': 'PCA on binned distribution values'
                          })  #.update({'clear_axes': True})
    for i in range(len(kmean_run['labels'][0])):
        X[i].notes['km'] = kmean_run['labels'][0][i]
        # Mark the members with label 0 in yellow on the PCA panel.
        if X[i].notes['km'] == 0:
            subplot.ax.scatter(pca_run['pts'][i][0],
                               pca_run['pts'][i][1],
                               color='yellow')
    subplot.update({'clear_axes': True})
    X0, X1 = [x for x in X if x.notes['km'] == 0
              ], [x for x in X if x.notes['km'] == 1]
    # NOTE: x1 is reused here as the loop variable; it no longer holds the
    # KDE x-values assigned above.
    for x1, x2 in zip(X0, X1):
        # Two hard-coded member names are skipped.
        if x2.name in ['EB321', 'EB1015']: continue
        nz1, az1 = x1.notes['iter']
        nz2, az2 = x2.notes['iter']
        a1, b1 = kde.run(az1)
        a2, b2 = kde.run(nz1)
        subplot.add_lines(a1, b1, None, None, 'black')
        subplot.add_lines(a2, b2, None, None, 'cyan')
        subplot.update({'clear_axes': True, 'title': x1.name})
        a1, b1 = kde.run(az2)
        a2, b2 = kde.run(nz2)
        subplot.add_lines(a1, b1, None, None, 'black')
        subplot.add_lines(a2, b2, None, None, 'cyan')
        subplot.update({'clear_axes': True, 'title': x2.name})
        f_num += 1
        # NOTE(review): 'subplot.update' is referenced without being called,
        # so if it is a bound method 'not subplot.update' is always False and
        # only the f_num limit can end the loop.  Presumably a check on the
        # result of an update() call was intended; confirm.
        if not subplot.update or f_num > 15: break
    plt.subplots_adjust(left=0.07,
                        bottom=0.01,
                        right=0.93,
                        top=0.95,
                        wspace=0.2,
                        hspace=0.6)
    subplot.save(options.prefix + 'fig_dists' + str(f_num) + '.png',
                 {'title': 'Dual Dists: '})
    progress.end()
def make_dr_plots2(R, choice='samples'):
    """Run PCA plus a sweep of kernel-PCA settings, one dimplot per run.

    With ``choice == 'samples'`` the reduction uses ``R.data.matrix('log')``
    and the plotted members are ``R.data.samples``; for any other value the
    matrix is transposed with ``getT()`` and the members are
    ``R.data.features``.

    The sweep covers the 'rbf' kernel at six gamma values, then the
    'linear', 'poly', 'sigmoid' and 'cosine' kernels.  The last three are
    best-effort: a ``LinAlgError`` skips that kernel's plot.

    Returns ``(pca_run, kca_run, kca_lin)`` where ``kca_run`` is the result
    of the last rbf run (gamma=10).
    """
    if choice == 'samples':
        r_members = R.data.samples
        r_matrix = R.data.matrix('log')
        out_name = R.args.prefix + '_samples_'
    else:
        r_members = R.data.features
        r_matrix = R.data.matrix('log').getT()
        out_name = R.args.prefix + '_features_'

    dr = rage_DR.DR(R.args, R.progress)

    def plot(run, title, suffix, zoom=True):
        # Each reduction gets a fresh 2x2 dimplot and its own 'out' name.
        plot_opts = {'title': title, 'out': out_name + suffix}
        if zoom:
            plot_opts['zoom'] = True
        dplot.dimplot(2, 2, R.args, R.progress).add_data(
            r_members, run, plot_opts)

    pca_run = dr.run_pca(r_matrix)
    plot(pca_run, 'PCA', 'pca.pdf', zoom=False)

    # (gamma, file tag) in the original run order.  Every run is now plotted
    # under its own tag; previously the gamma=0.005 result was discarded and
    # the gamma=0.01 result was plotted a second time as 'kca-005-rbf.pdf'.
    rbf_sweep = ((0.01, '01'), (0.005, '005'), (0.05, '05'), (0.1, 'p1'),
                 (1, 'i1'), (10, 'i10'))
    kca_run = None
    for gamma, tag in rbf_sweep:
        kca_run = dr.run_kca(r_matrix, kernel='rbf', gamma=gamma)
        plot(kca_run, 'KCA-rbf', 'kca-' + tag + '-rbf.pdf')

    kca_lin = dr.run_kca(r_matrix, kernel='linear')
    plot(kca_lin, 'KCA-linear', 'kca-line.pdf')

    optional_kernels = (('poly', 'KCA-poly', 'kca-poly.pdf'),
                        ('sigmoid', 'KCA-sig', 'kca-sig.pdf'),
                        ('cosine', 'KCA-cosine', 'kca-cos.pdf'))
    for kernel, title, suffix in optional_kernels:
        try:
            plot(dr.run_kca(r_matrix, kernel=kernel), title, suffix)
        except np.linalg.LinAlgError:
            # Deliberate best-effort: this kernel failed numerically, so its
            # plot is skipped and the sweep carries on.
            pass

    return pca_run, kca_run, kca_lin