def test_nainteger():
    """An element set to NA_Integer is reported as NA by R's is.na()."""
    vec = robjects.IntVector(range(3))
    vec[0] = robjects.NA_Integer
    na_flags = robjects.baseenv['is.na'](vec)
    assert na_flags[0] is True
def testNewFromOrdDict(self):
    """A DataFrame built from an OrdDict keeps the column data intact."""
    od = rlc.OrdDict(c=(('a', robjects.IntVector((1, 2))),
                        ('b', robjects.StrVector(('c', 'd')))))
    dataf = robjects.DataFrame(od)
    # assertEquals is a deprecated alias removed in Python 3.12;
    # use the canonical assertEqual.
    self.assertEqual(1, dataf.rx2('a')[0])
def report_performance(trueY, scoreY, n=None):
    # auc, 95%ci, tpr,tnr,fpr,fnr,ppv,npv,f1, odds ratio, OR 95%ci, P, N, tp, tn, fp, fn
    """Compute classification metrics via the R packages pROC and ROCR.

    Parameters
    ----------
    trueY : pandas Series of ground-truth labels; rows with nulls dropped.
    scoreY : pandas Series of prediction scores; rows with nulls dropped.
    n : optional int; when given, both inputs are reshaped into R matrices
        with ``ncol=n`` before scoring — presumably for multi-column
        predictions; TODO confirm against callers.

    Returns
    -------
    (res, res_array)
        ``res``: dict of scalar metrics (AUC + 95% CI, TPR/TNR/FPR/FNR,
        PPV, NPV, F-score, odds ratio + 95% CI, P/N counts, cutoff).
        ``res_array``: pandas DataFrame of the full per-cutoff curves with
        columns score/TPR/TNR/PPV/NPV/F/ODDS.
    """
    # Keep only positions where both the label and the score are present.
    loc1 = trueY.isnull()
    loc2 = scoreY.isnull()
    locs = np.logical_not(np.logical_or(loc1, loc2))
    trueY = trueY[locs]
    scoreY = scoreY[locs]
    res = {}
    proc = importr('pROC')
    if n is None:
        ground = ro.vectors.IntVector(trueY)
        score = ro.vectors.FloatVector(scoreY)
    else:
        ground = ro.r.matrix(ro.IntVector(trueY), ncol=n)
        score = ro.r.matrix(ro.FloatVector(scoreY), ncol=n)
    # pROC's ci slot is ordered (low, auc, high).
    roc1 = proc.roc(ground, score, direction='<', ci='True')
    res['auc'] = roc1.rx2('ci')[1]  # auroc
    res['auc_cilow'] = roc1.rx2('ci')[0]  # low ci
    res['auc_cihigh'] = roc1.rx2('ci')[2]  # high ci
    rocr = importr('ROCR')
    pre = rocr.prediction(score, ground)
    # Choose the cutoff that maximizes sensitivity + specificity.
    pref = rocr.performance(pre, 'sens', 'spec')
    sumsenspe = np.array(pref.slots['x.values'][0]) + np.array(
        pref.slots['y.values'][0])
    maxloc = np.argmax(sumsenspe)
    # res_array accumulates rows (score, TPR, TNR, PPV, NPV, F, ODDS)
    # and is transposed to per-cutoff rows at the end.
    res_array = np.array(pref.slots['alpha.values'][0])
    tpr = np.array(pref.slots['y.values'][0])[maxloc]
    tnr = np.array(pref.slots['x.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0])))
    res_array = np.vstack((res_array, np.array(pref.slots['x.values'][0])))
    fpr = 1 - tnr
    fnr = 1 - tpr
    res['cutoff'] = np.array(pref.slots['alpha.values'][0])[maxloc]
    res['tpr'] = tpr
    res['tnr'] = tnr
    res['fpr'] = fpr
    res['fnr'] = fnr
    # Remaining metrics are all evaluated at the same chosen cutoff index.
    pref = rocr.performance(pre, 'ppv')
    res['ppv'] = np.array(pref.slots['y.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0])))
    pref = rocr.performance(pre, 'npv')
    res['npv'] = np.array(pref.slots['y.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0])))
    pref = rocr.performance(pre, 'f')
    res['fscore'] = np.array(pref.slots['y.values'][0])[maxloc]
    res_array = np.vstack((res_array,
                           np.array(pref.slots['y.values'][0])))
    pref = rocr.performance(pre, 'odds')
    res['odds'] = np.array(pref.slots['y.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0]))).T
    res_array = pd.DataFrame(
        res_array,
        columns=['score', 'TPR', 'TNR', 'PPV', 'NPV', 'F', 'ODDS'])
    # Confusion-matrix counts implied by the chosen cutoff
    # (assumes trueY is 0/1 so its sum counts positives — TODO confirm).
    P = np.sum(trueY)
    N = len(trueY) - P
    tp = tpr * P
    tn = tnr * N
    fp = fpr * N
    fn = fnr * P
    res['P'] = P
    res['N'] = N
    # 95% CI of the odds ratio: standard error of log(OR) from the
    # reciprocal cell counts, then back-transform.
    siglog = np.sqrt(1 / tp + 1 / tn + 1 / fp + 1 / fn)
    zalph = norm.ppf(0.975)
    logOR = np.log(res['odds'])
    loglo = logOR - zalph * siglog
    loghi = logOR + zalph * siglog
    ORlo = np.exp(loglo)
    ORhi = np.exp(loghi)
    res['ORlo'] = ORlo
    res['ORhi'] = ORhi
    return res, res_array
# source: rpy2 introductory documentation examples

# simple example
import rpy2.robjects as robjects

pi = robjects.r['pi']
pi[0]

# Define an R function f() and call it; f(3) returns 2*pi*3 ~= 18.85.
# BUG FIX: the snippet previously defined `r` (shadowing its own argument),
# never computed anything, and then called the undefined f().
robjects.r('''
f <- function(r, verbose=FALSE) {
    if (verbose) {
        cat("I am calling f().\n")
    }
    2 * pi * r
}
f(3)
''')
# 18.85

# another simple example
# BUG FIX: the vector was bound to `Letters` but used as `letters`
# (NameError); `res` was printed without ever being assigned; the R code
# contained typographic quotes which are a syntax error in R.
letters = robjects.r['letters']
rcode = 'paste(%s, collapse="-")' % (letters.r_repr())
res = robjects.r(rcode)
print(res)

# a more interesting/useful example
r = robjects.r
x = robjects.IntVector(range(10))
y = r.rnorm(10)
r.X11()  # open an interactive graphics device (for WinTel?)
r.layout(r.matrix(robjects.IntVector([1, 2, 3, 2]), nrow=2, ncol=2))
r.plot(r.runif(10), y, xlab="runif", ylab="foo/bar", col="red")
def test_takeLogException(self):
    """takeLog must reject an invalid (negative) log base."""
    values = (59843, 34982, 12425, 90534, 34532, 54642, 1239, 43534)
    vector = R.IntVector(values)
    self.assertRaises(ValueError, rFunctions.takeLog, vector, -2)
def test_lda_r(cls, feats, cl_sl, boots, fract_sample, lda_th, tol_min, nlogs):
    """Run bootstrapped LDA in R and return per-feature effect sizes.

    Parameters (assumed from usage — confirm against callers):
      cls          -- dict with key 'class' listing each sample's class.
      feats        -- dict feature-name -> list of per-sample values;
                      mutated in place (a 'class' entry is added and
                      near-constant features are jittered).
      cl_sl        -- not used in this function.
      boots        -- number of bootstrap iterations.
      fract_sample -- fraction of samples drawn per bootstrap.
      lda_th       -- absolute score threshold for the filtered result.
      tol_min      -- `tol` argument forwarded to R's lda().
      nlogs        -- not used in this function.

    Returns (all_scores, passing_scores): feature -> signed log10 LDA
    effect size, and the subset whose magnitude exceeds ``lda_th``.

    NOTE(review): ``fk[0]`` subscripts ``feats.keys()`` — this requires
    Python 2 semantics (keys() returning a list); under Python 3 it would
    need ``list(feats.keys())``.
    """
    fk = feats.keys()
    means = dict([(k, []) for k in feats.keys()])
    feats['class'] = list(cls['class'])
    clss = list(set(feats['class']))
    # Jitter features that take few distinct values within a class, so
    # lda() does not choke on (near) zero within-class variance.
    for uu, k in enumerate(fk):
        if k == 'class':
            continue
        ff = [(feats['class'][i], v) for i, v in enumerate(feats[k])]
        for c in clss:
            if len(set([float(v[1]) for v in ff if v[0] == c])) > max(
                    float(feats['class'].count(c)) * 0.5, 4):
                continue
            for i, v in enumerate(feats[k]):
                if feats['class'][i] == c:
                    feats[k][i] = math.fabs(feats[k][i] + lrand.normalvariate(
                        0.0, max(feats[k][i] * 0.05, 0.01)))
    # Mirror the feature table into R as data.frame `d`.
    rdict = {}
    for a, b in feats.items():
        if a == 'class' or a == 'subclass' or a == 'subject':
            rdict[a] = robjects.StrVector(b)
        else:
            rdict[a] = robjects.FloatVector(b)
    robjects.globalenv["d"] = robjects.DataFrame(rdict)
    lfk = len(feats[fk[0]])
    rfk = int(float(len(feats[fk[0]])) * fract_sample)
    # Model formula: class against every feature.
    f = "class ~ " + fk[0]
    for k in fk[1:]:
        f += " + " + k.strip()
    ncl = len(set(cls['class']))
    # Minimum samples per class required of a bootstrap draw.
    min_cl = int(
        float(min([cls['class'].count(c) for c in set(cls['class'])])) *
        fract_sample * fract_sample * 0.5)
    min_cl = max(min_cl, 1)
    # All unordered class pairs; one LDA contrast per pair.
    pairs = [(a, b) for a in set(cls['class']) for b in set(cls['class'])
             if a > b]
    for k in fk:
        for i in range(boots):
            means[k].append([])
    for i in range(boots):
        # Redraw (up to 1000 attempts) until the subsample has enough
        # within-class contrast and per-class members.
        for rtmp in range(1000):
            rand_s = [lrand.randint(0, lfk - 1) for v in range(rfk)]
            if not contast_within_classes_or_few_per_class(
                    feats, rand_s, min_cl, ncl):
                break
        rand_s = [r + 1 for r in rand_s]  # R indexing is 1-based
        # NOTE(review): `k` here is whatever the loop above left behind
        # (the last feature name) — looks suspicious; preserved as-is.
        means[k][i] = []
        for p in pairs:
            # Fit LDA on the subsample and project all samples onto the
            # first discriminant axis.
            robjects.globalenv["rand_s"] = robjects.IntVector(rand_s)
            robjects.globalenv["sub_d"] = robjects.r('d[rand_s,]')
            z = robjects.r('z <- suppressWarnings(lda(as.formula(' + f +
                           '),data=sub_d,tol=' + str(tol_min) + '))')
            robjects.r('w <- z$scaling[,1]')
            robjects.r('w.unit <- w/sqrt(sum(w^2))')
            robjects.r('ss <- sub_d[,-match("class",colnames(sub_d))]')
            if 'subclass' in feats:
                robjects.r('ss <- ss[,-match("subclass",colnames(ss))]')
            if 'subject' in feats:
                robjects.r('ss <- ss[,-match("subject",colnames(ss))]')
            robjects.r('xy.matrix <- as.matrix(ss)')
            robjects.r('LD <- xy.matrix%*%w.unit')
            # Effect size = distance between the two class means along LD1.
            robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="' +
                       p[0] + '"]) - mean(LD[sub_d[,"class"]=="' + p[1] +
                       '"]))')
            scal = robjects.r('wfinal <- w.unit * effect.size')
            rres = robjects.r('mm <- z$means')
            rowns = list(rres.rownames)
            lenc = len(list(rres.colnames))
            coeff = [
                abs(float(v)) if not math.isnan(float(v)) else 0.0
                for v in scal
            ]
            # Class-mean rows for the pair; zeros if a class is absent
            # from the subsample.
            res = dict([
                (pp, [float(ff) for ff in rres.rx(pp, True)]
                 if pp in rowns else [0.0] * lenc)
                for pp in [p[0], p[1]]
            ])
            for j, k in enumerate(fk):
                gm = abs(res[p[0]][j] - res[p[1]][j])
                means[k][i].append((gm + coeff[j]) * 0.5)
    res = {}
    # Final score: signed log10(1 + |largest pair-averaged effect|).
    for k in fk:
        m = max([
            numpy.mean([means[k][kk][p] for kk in range(boots)])
            for p in range(len(pairs))
        ])
        res[k] = math.copysign(1.0, m) * math.log(1.0 + math.fabs(m), 10)
    return res, dict([(k, x) for k, x in res.items()
                      if math.fabs(x) > lda_th])
robjects.RVector([]), staticmethod(lambda x: isinstance(x, robjects.RVector))) def bool_vector_conv(v): return vector_conv(v, bool) RBoolVector = new_constant('RBoolVector' , staticmethod(bool_vector_conv), robjects.BoolVector([]), staticmethod(lambda x: isinstance(x, robjects.RVector)), base_class=RVector) def int_vector_conv(v): return vector_conv(v, int) RIntVector = new_constant('RIntVector' , staticmethod(int_vector_conv), robjects.IntVector([]), staticmethod(lambda x: isinstance(x, robjects.RVector)), base_class=RVector) def float_vector_conv(v): return vector_conv(v, float) RFloatVector = new_constant('RFloatVector' , staticmethod(float_vector_conv), robjects.FloatVector([]), staticmethod(lambda x: isinstance(x, robjects.RVector)), base_class=RVector) def str_vector_conv(v): return vector_conv(v, str) RStrVector = new_constant('RStrVector' , staticmethod(str_vector_conv),
def test_sample_error():
    """Sampling more items than exist, without replacement, raises in R."""
    vec = robjects.IntVector(range(100))
    with pytest.raises(ri.embedded.RRuntimeError):
        vec.sample(110)
def test_sample_replacement():
    """With replacement, the sample may exceed the vector's length."""
    source = robjects.IntVector(range(100))
    resampled = source.sample(110, replace=True)
    assert len(resampled) == 110
def test_sample_probabilities_novector():
    """A plain Python list of probabilities is accepted by sample()."""
    vec = robjects.IntVector(range(100))
    weights = [.01] * 100
    subset = vec.sample(10, probabilities=weights)
    assert len(subset) == 10
def test_sample_probabilities_error_len():
    """A probabilities vector of the wrong length raises ValueError."""
    vec = robjects.IntVector(range(100))
    bad_probs = robjects.FloatVector([.01] * 10)
    with pytest.raises(ValueError):
        vec.sample(10, probabilities=bad_probs)
def test_sample():
    """Default sampling returns exactly the requested number of items."""
    population = robjects.IntVector(range(100))
    drawn = population.sample(10)
    assert len(drawn) == 10
def test_itemsnonames():
    """items() on an unnamed vector pairs None with each value."""
    vec = robjects.IntVector(range(3))
    pairs = list(vec.items())
    assert [k for k, _ in pairs] == [None, None, None]
    assert [v for _, v in pairs] == [0, 1, 2]
def test_tabulate():
    """tabulate() counts occurrences of each positive integer value."""
    vec = robjects.IntVector((1, 2, 1, 2, 1, 2, 2))
    counts = vec.tabulate()
    assert tuple(counts) == (3, 4)
plt.plot([0, 1], [0, 1], 'k--') # random predictions curve plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.0]) ax.annotate('Avg. accuracy: %0.4f\nAvg. AUROC: %0.4f' % (avg_accuracy / float(r), avg_auc / float(r)), xy=(0.95, 0.1), xytext=(0.65, 0.1)) plt.tight_layout() plt.show() print("Average accuracy:", (avg_accuracy / float(r))) print("Average AUROC:", (avg_auc / float(r))) np.savetxt("rf_count.csv", correct_prediction_count, delimiter=",") X['Stability'] = np.divide(correct_prediction_count, float(r)) np.savetxt("rf_probabilities.csv", probabilities, delimiter=",", fmt="%2.4f") np.savetxt("rf_predictions.csv", predictions, delimiter=",", fmt="%1d") # Reattach ISS16 data to dataframe X['Class'] = y X.to_csv("UQ_rf_results.csv", index=False) # Generate clusters via Ckmeans r_x = ro.FloatVector(X['Stability']) c1_x = ck(r_x, 2) #c2_x = ck(r_x, 3) # Convert clusters to numpy arrays and append to data c2 = np.array(ro.IntVector(c1_x[0])) X['Cluster2'] = c2 # c3 = np.array(ro.IntVector(c2_x[0])) # X['Cluster3'] = c3 X.to_csv('UQ_rf_clusters_results.csv', index=False)
def plot1(moptions, significant_pos, curn):
    """Collect raw signal and p-values around one significant position and
    pass them to the R routine ``Base_Most_Significant_Plot``.

    Parameters (assumed from usage — confirm against callers):
      moptions        -- options dict; also maps dataset names to their data.
      significant_pos -- ((chr, strand, pos, ...), pvalue-structure) entry
                         taken from moptions['sign_test'].
      curn            -- 0-based rank of this position.

    Returns True when the neighborhood was incomplete (nothing plotted).
    """
    m_signal = []  # per-read signal values
    m_pos = []     # x-axis labels: position/base plus p-values
    m_ds = []      # dataset id ("1" or "2") per signal value
    curchr = significant_pos[0][0]; curstrand = significant_pos[0][1]; curpos = significant_pos[0][2];
    # Plot title carries the relevant p-value(s) for the configured test.
    if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
       mtitle = ("1=%s VS\n 2=%s:\n p-value=%.1E (ks test p=%.1E) at pos %d of %s strand in %s. Rank %d " % (moptions['ds2'][0], moptions['ds2'][1], significant_pos[1][3][1], significant_pos[1][2][1], curpos+1, curstrand, curchr, curn+1))
    else:
       mtitle = ("1=%s VS\n 2=%s:\n p-value=%.1E at pos %d of %s strand in %s. Rank %d " % (moptions['ds2'][0], moptions['ds2'][1], significant_pos[1][2][1], curpos+1, curstrand, curchr, curn+1))
    ds0 = moptions[moptions['ds2'][0]]
    ds1 = moptions[moptions['ds2'][1]]
    ds2 = [ds0, ds1]
    sk = (curchr, curstrand)
    noenough = False; pv3 = {}
    cur_ind = moptions['sign_test'].index(significant_pos)
    print significant_pos, cur_ind, curn
    nearybysize = moptions["window"]
    if moptions['RegionRankbyST']==1: nearybysize = int(nearybysize*2)
    # Walk the neighborhood of tested positions around the significant one.
    for mind in range(cur_ind-nearybysize, cur_ind+nearybysize+1):
       if pos_check(moptions['sign_test'], cur_ind, mind):
          pk = moptions['sign_test'][mind][0][2]
          pv = moptions['sign_test'][mind][1]
          pv3[(pk, ds0['base'][sk][pk])] = pv
       else:
          # Neighborhood runs off the tested region; give up on this one.
          noenough = True;
       if noenough: break;
       # Accumulate the raw signal of both datasets at this position.
       for mds_ind in range(len(ds2)):
          mna = ds2[mds_ind]['base'][sk][pk]
          for sg in ds2[mds_ind]['norm_mean'][sk][pk]:
             m_ds.append("%d" % (mds_ind+1))
             # Label format depends on which tests are available;
             # has_ut is presumably a module-level u-test flag — confirm.
             if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
                if has_ut==1:
                   m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0][1], pv[1][1],pv[2][1],pv[3][1]))
                else:
                   m_pos.append('%d/%s\n%.1E\n%.1E' % (pk+1, mna, pv[2][1],pv[3][1]))
             else:
                if has_ut==1:
                   m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0][1], pv[1][1],pv[2][1]))
                else:
                   m_pos.append('%d/%s\n%.1E' % (pk+1, mna, pv[2][1]))
             m_signal.append(round(sg,3))
    # (A long commented-out earlier variant of the loop above was removed.)
    if not noenough:
       closesize = moptions["neighborPvalues"]*2
       if moptions['RegionRankbyST']==1: closesize = moptions["window"]
       if closesize<1: closesize = 1
       # Only plot '-'-strand positions close to coordinate 3072 —
       # NOTE(review): hard-coded region of interest; confirm its meaning.
       if significant_pos[0][1]=='-' and 3072-closesize<significant_pos[0][2]<3072+closesize:
          print 'Rank', curn+1, moptions["testMethod"], moptions["FileID"], significant_pos[0][0], significant_pos[0][1], significant_pos[0][2]+1, significant_pos[0][3]
          poskeys = []; pvsp3 = [[], [], [], []]
          pv3keys = pv3.keys(); pv3keys.sort()
          for pv3k in pv3keys:
             # Echo the p-values, then keep log10-transformed copies for
             # plotting.
             if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
                print ('%d/%s' % (pv3k[0]+1, pv3k[1])), ('u=%.3E(%.3E) t=%.3E(%.3E) ks=%.3E(%.3E) pv5=%.3E(%.3E)' % (pv3[pv3k][0][1],pv3[pv3k][0][0], pv3[pv3k][1][1],pv3[pv3k][1][0], pv3[pv3k][2][1],pv3[pv3k][2][0], pv3[pv3k][3][1],pv3[pv3k][3][0]))
             else:
                print ('%d/%s' % (pv3k[0]+1, pv3k[1])), ('u=%.3E(%.3E) t=%.3E(%.3E) ks=%.3E(%.3E)' % (pv3[pv3k][0][1],pv3[pv3k][0][0], pv3[pv3k][1][1],pv3[pv3k][1][0], pv3[pv3k][2][1],pv3[pv3k][2][0]))
             poskeys.append('%d/%s' % (pv3k[0]+1, pv3k[1]))
             pvsp3[0].append(round(math.log10(pv3[pv3k][0][1]), 3))
             pvsp3[1].append(round(math.log10(pv3[pv3k][1][1]), 3))
             pvsp3[2].append(round(math.log10(pv3[pv3k][2][1]), 3))
             if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
                pvsp3[3].append(round(math.log10(pv3[pv3k][3][1]), 3))
          print ''
          # One R data.frame per test (u, t, ks, combined).
          stu = {"Position":robjects.StrVector(poskeys), "Pvalue":robjects.FloatVector(pvsp3[0])}; stru = robjects.DataFrame(stu)
          stt = {"Position":robjects.StrVector(poskeys), "Pvalue":robjects.FloatVector(pvsp3[1])}; strt = robjects.DataFrame(stt)
          stks ={"Position":robjects.StrVector(poskeys), "Pvalue":robjects.FloatVector(pvsp3[2])}; strks= robjects.DataFrame(stks)
          if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
             stcb ={"Position":robjects.StrVector(poskeys), "Pvalue":robjects.FloatVector(pvsp3[3])};
          else:
             stcb ={"Position":robjects.StrVector([]), "Pvalue":robjects.FloatVector(pvsp3[3])};
          strcb= robjects.DataFrame(stcb)
          pydf = {"Signal":robjects.FloatVector(m_signal), "Position":robjects.StrVector(m_pos), "DS":robjects.FactorVector(robjects.StrVector(m_ds))}
          plotDat = robjects.DataFrame(pydf)
          mrtitle = robjects.StrVector([mtitle])
          mhasbox = robjects.IntVector([has_boxplot])  # module-level flag
          mplotType = robjects.StrVector([moptions['plotType']])
          sys.stdout.flush()
          robjects.globalenv['Base_Most_Significant_Plot'](plotDat, stru, strt, strks, strcb, mrtitle, mhasbox, mplotType)
    return noenough
data[w][1] <= t2 or 16 - data[w][1] <= t2) ] B = [ w for w in data if (data[w][0] <= t1 or 16 - data[w][0] <= t1) and ( data[w][1] > t2 or 16 - data[w][1] > t2) ] C = [ w for w in data if (data[w][0] > t1 or 16 - data[w][0] > t1) and ( data[w][1] <= t2 or 16 - data[w][1] <= t2) ] D = [ w for w in data if (data[w][0] > t1 or 16 - data[w][0] > t1) and ( data[w][1] > t2 or 16 - data[w][1] > t2) ] d = matrix(R.IntVector([len(A), len(C), len(B), len(D)]), nrow=2) result = fisher_test(d) if test == 't': print >> stderr, "\t".join( map(str, [ p1, p2, t1, t2, len(A), len(C), len(B), len(D), len(A) + len(B) + len(C) + len(D), result[2][0], result[0][0] ])) elif test == 'w': print >> stderr, "\t".join( map(str, [
mirnas.append(final_mirnas[0][i][0]) for k, j in zip( clinical_and_files, final_mirnas ): ## These lists contain the clinical information and miRNA data in the same order. kaplan.append([k[1], k[2], k[3], k[4], k[5], j[i][1]]) data = [ ii[-1] for ii in kaplan ] ## Grabbing all the mirna values for the current mirna being analyzed ro.globalenv['expression'] = ro.FloatVector(data) res = ro.r( 'round(qnorm((rank(expression, na.last="keep")-0.5)/sum(!is.na(expression))), digit=5)' ) ## Perform inverse normal transformation inverse_norm = list(res) ## Convert robject to python list ## Prepare the variables for rpy2 ro.globalenv['mirna'] = ro.FloatVector(inverse_norm) ro.globalenv['times'] = ro.IntVector([ii[0] for ii in kaplan]) ro.globalenv['died'] = ro.IntVector([death_dic[ii[1]] for ii in kaplan]) ##ductal ductal = [] for ii in kaplan: if ii[2] == 1: ductal.append(1) else: ductal.append(0) ##metaplastic metaplastic = [] for ii in kaplan: if ii[2] == 3: metaplastic.append(1)
# NOTE(review): chunk of a larger Python 2 script; `df_both` and `np`
# come from earlier code not visible here.
print "Recombine and index"
# Recombine the two hemispheres into one unified whole!
# Also check if anyone's missing
df_new = df_both["lh"].append(df_both["rh"])
# Redo index
df_new.index = range(df_new.shape[0])
# Sort the column
print "Sort by Hemi, Cluster, and Stat"
import rpy2.robjects as robjects
r = robjects.r
cluster = robjects.IntVector(df_new.Cluster.tolist())
network = robjects.StrVector(df_new.YeoNetwork.tolist())
stat = robjects.FloatVector(df_new.Stat.tolist())
# R's order() returns 1-based ranks; shift to 0-based for pandas indexing.
o = np.array(r.order(cluster, network, stat, decreasing=True)) - 1
df2 = df_new.ix[o, :]
#####
print "Combine, Select, Mash"
# Combine the aparc, subcortical, and cerebellum
cols = [
    "Cluster", "Network", "Hemi", "Region", "BA", "x", "y", "z", "Statistic"
]
dict3 = {k: [] for k in cols}
def testNAInteger(self):
    """Assigning NA_Integer makes the element register as NA in R."""
    vec = robjects.IntVector(range(3))
    vec[0] = robjects.NA_Integer
    na_flags = robjects.baseenv['is.na'](vec)
    self.assertTrue(na_flags[0])
'glom': su[3][i], 'Driver': su[4][i], 'Gender': su[5][i], 'n': su[6][i] } ) results = json.loads(toJSON(res)[0]) #Generate a 3d html from the results plot3d = robjects.r('plot3d') writeWebGL = robjects.r('writeWebGL') #Summary comes ordered by reverse score (muscore). However, the hits are based solely on forward score #If we prefer muscore, use hit numbers ('n') of the first few entries and then assign new the hit numbers if not prefer_muscore: h = robjects.IntVector(range(hits + 1)) hit_names = [] for i in range(hits): hit_names.append([e['name'] for e in s if e['n'] == i+1][0]) else: h = robjects.IntVector([e['n'] for e in s[:hits]]) hit_names = [e['name'] for e in s[:hits]] #Reassign the 'n' (hit) value for i, n in enumerate(s): s[i]['n'] = i+1 if db == 'fc': plot3d(res, hits=h, db=fcdps, soma=True) elif db == 'gmr': plot3d(res, hits=h, db=gmrdps, soma=True)
def testRepr(self):
    """The third line of repr() lists the vector's elements."""
    vec = robjects.IntVector((1, 2, 3))
    lines = repr(vec).split('\n')
    self.assertEqual('[ 1, 2, 3]', lines[2])
def main():
    """Run a DESeq2 differential-expression analysis from CommandLine args,
    write QC plots (PCA, MA, dispersion, p-value QQ/histogram) to a PDF,
    and dump the raw and LFC-shrunken result tables as TSV files.
    """
    # Command Line Stuff...
    myCommandLine = CommandLine()
    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    print("running DESEQ2 %s" % prefix, file=sys.stderr)

    # make the quant DF (count matrix) and convert it for R
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula (sample table); add batch to the design if present
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)
    if "batch" in list(formulaDF):
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")

    # import DESeq2 and plotting helpers
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    ### RUN DESEQ2 ###
    R.assign('df', df)
    R.assign('sampleTable', sampleTable)
    R.assign('design', design)
    R('dds <- DESeqDataSetFromMatrix(countData = df, colData = sampleTable, design = design)')
    R('dds <- DESeq(dds)')
    R('name <- grep("condition", resultsNames(dds), value=TRUE)')

    # Get Results and shrinkage values
    res = R('results(dds, name=name)')
    resLFC = R('lfcShrink(dds, coef=name)')
    vsd = R('vst(dds,blind=FALSE)')
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)
    dds = R('dds')

    ### Plotting section ###
    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    # get pca data
    if "batch" in list(formulaDF):
        pcaData = plotPCA(vsd,
                          intgroup=robjects.StrVector(("condition", "batch")),
                          returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    else:
        print(vsd)
        pcaData = plotPCA(vsd, intgroup="condition",
                          returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    data_folder = os.path.join(os.getcwd(), outdir)
    qcOut = os.path.join(data_folder,
                         "%s_QCplots_%s_v_%s.pdf" % (prefix, group1, group2))
    grdevices.pdf(file=qcOut)

    # BUG FIX: '%%' was previously concatenated OUTSIDE the %-formatted
    # string, so axis labels showed a literal '%%'. The escape must be
    # inside the format string to render a single '%'.
    x = "PC1: %d%% variance" % int(percentVar[0] * 100)
    y = "PC2: %d%% variance" % int(percentVar[1] * 100)

    if "batch" in list(formulaDF):
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
    else:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
    pp.plot()

    plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCSrhinkage")
    plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
    hh = ggplot2.ggplot(resdf) + \
        ggplot2.aes_string(x="pvalue") + \
        ggplot2.geom_histogram() + \
        ggplot2.theme_classic() + \
        ggplot2.ggtitle("pvalue distribution")
    hh.plot()
    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()

    # write the result tables next to the QC PDF
    data_folder = os.path.join(os.getcwd(), outdir)
    lfcOut = os.path.join(
        data_folder,
        "%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (prefix, group1, group2))
    resOut = os.path.join(
        data_folder,
        "%s_%s_v_%s_deseq2_results.tsv" % (prefix, group1, group2))
    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
def testItemsNoNames(self):
    """items() on an unnamed vector pairs None with each value."""
    vec = robjects.IntVector(range(3))
    pairs = list(vec.items())
    self.assertEqual([None, None, None], [k for k, _ in pairs])
    self.assertEqual([0, 1, 2], [v for _, v in pairs])
# endpoint endpoint = endpoint_data[:, i] group0 = endpoint == 0 # ensure we have at least 2 samples in each class num_grp0 = sum(group0 == True) num_grp1 = sum(group0 == False) if num_grp0 > 1 and num_grp1 > 1: fl = robjects.FactorVector(endpoint) # factor for R limma robjects.globalenv["description"] = fl fmla = robjects.Formula('~ description + 0') design = rstats.model_matrix(fmla) design.colnames = robjects.StrVector(['Norm','Mut']) # print(design) # robjects.globalenv["design"] = design fit = limma.lmFit(m, design) contMat = robjects.IntVector([-1, 1]) fit2 = limma.contrasts_fit(fit, contMat) fit2 = limma.eBayes(fit2) corrGenes = limma.decideTests(fit2, adjust_method='fdr', p_value=0.01) tT = limma.topTable(fit2, adjust='fdr', sort_by="B", number=ncol, genelist=geneNames) # print(r.head(tT)) # loop through corrGenes, and find the DEGs DEGs = [] for x in xrange(0, ncol): if corrGenes[x] != 0.0: DEGs.append(x) # create expression matrix just containing DEGs DEGs_data = train_data[:, DEGs] numDEGs = DEGs_data.shape[1] # create 2 binary expression matrices; 1. Up-regulated genes; 2. Down-regulated genes # For 1; if z-score > 2, value = 1, 0 otherwise
def testNewIntVector(self):
    """IntVector preserves both the values and the length of its input."""
    values = [123, 456]
    vec = robjects.IntVector(values)
    self.assertEqual(2, len(vec))
    for expected, actual in zip(values, vec):
        self.assertEqual(expected, actual)
coeffs=[] pvalues=[] genes=[] ##This list tracks the gene names for i in range(len(final_genes[0])): kaplan=[] genes.append(final_genes[0][i][0]) for k,j in zip(clinical_and_files,final_genes): ## These lists contain the clinical information and mRNA data in the same order. kaplan.append([k[1],k[2],k[3],k[4],k[5],j[i][1]]) data=[ii[-1] for ii in kaplan] ## Grabbing all the gene values for the current gene being analyzed ro.globalenv['expression']=ro.FloatVector(data) res=ro.r('round(qnorm((rank(expression, na.last="keep")-0.5)/sum(!is.na(expression))), digit=5)') ## Perform inverse normal transformation inverse_norm=list(res) ## Convert robject to python list ## Prepare the variables for rpy2 ro.globalenv['gene']=ro.FloatVector(inverse_norm) ro.globalenv['times']=ro.IntVector([ii[0] for ii in kaplan]) ro.globalenv['died']=ro.IntVector([death_dic[ii[1]] for ii in kaplan]) ##grade1 grade1=[] for ii in kaplan: if ii[2]==1: grade1.append(1) else: grade1.append(0) ##grade2 grade2=[] for ii in kaplan: if ii[2]==2: grade2.append(1)
# Fit a baseline model with auto.arima and inspect its residuals.
model_1 = robjects.r['auto.arima'](env['freq_tweet'], trace=True)
b_1 = robjects.r['LjungBoxTest'](robjects.r['residuals'](model_1), k=1)
rpackages.importr('FitARMA')
robjects.r['checkresiduals'](model_1)


def get_dict_r(list_r):
    """Convert an R named list into a Python dict keyed by element name."""
    return dict(zip(list_r.names, list(list_r)))


# Candidate 1: ARIMA(0,1,2) with a seasonal MA(1) term, weekly period.
model_2_1 = robjects.r['Arima'](
    env['freq_tweet'],
    order=robjects.IntVector([0, 1, 2]),
    seasonal=robjects.r['list'](order=robjects.IntVector([0, 0, 1]),
                                period=7),
)
b_2_1 = robjects.r['LjungBoxTest'](robjects.r['residuals'](model_2_1), k=1)
print(b_2_1)
# BUG FIX: referenced undefined name `model_2` (NameError); the call
# inspects the model fitted just above.
print(get_dict_r(model_2_1)['call'])

# Candidate 2: same non-seasonal order with a seasonal AR(1) term.
model_2_2 = robjects.r['Arima'](
    env['freq_tweet'],
    order=robjects.IntVector([0, 1, 2]),
    seasonal=robjects.r['list'](order=robjects.IntVector([1, 0, 0]),
                                period=7),
)
b_2_2 = robjects.r['LjungBoxTest'](robjects.r['residuals'](model_2_2))
clinical_and_files.append(i) ##print average age at diagnosis age=np.mean([i[5] for i in clinical_and_files]) ##print number of males males=len([i for i in clinical_and_files if i[4]==0]) ##print number of females females=len([i for i in clinical_and_files if i[4]==1]) ##to get the median survival we need to call survfit from r ##prepare variables for R ro.globalenv['times']=ro.IntVector([i[1] for i in clinical_and_files]) ##need to create a dummy variable group ro.globalenv['group']=ro.IntVector([0 for i in clinical_and_files]) ##need a vector for deaths death_dic={} death_dic['Alive']=0 death_dic['Dead']=1 ro.globalenv['died']=ro.IntVector([death_dic[i[2]] for i in clinical_and_files]) res=ro.r('survfit(Surv(times,died) ~ as.factor(group))') #the number of events(deaths) is the fourth column of the output deaths=str(res).split('\n')[-2].strip().split()[3]
def train_with_blockcluster(
    dataset_file,
    graph,
    nb_row_clusters,
    nb_column_clusters,
    row_clusters_index,
    column_clusters_index,
):
    """Co-cluster a binary graph with the R `blockcluster` package, time the
    run, and pickle a result summary (ICL, co-clustering ARI, CPU times).

    Parameters (assumed from usage — confirm against callers):
      dataset_file          -- path used to name the output pickle.
      graph                 -- scipy sparse binary adjacency matrix.
      nb_row_clusters       -- requested number of row clusters.
      nb_column_clusters    -- requested number of column clusters.
      row_clusters_index    -- ground-truth row labels (for CARI).
      column_clusters_index -- ground-truth column labels (for CARI).

    Returns the result dict, or None when the pickle already exists.
    """
    # Skip work if the result pickle is already on disk.
    results_files_already_done = glob.glob(results_folder + "*.pkl")
    if (results_folder + dataset_file.split("/")[-1].split(".")[0] +
            "_bc.pkl" in results_files_already_done):
        print("Already Done")
        return None
    print("BlockCluster :")
    # Convert sparse matrix to R matrix.
    B = graph.todense()
    nr, nc = B.shape
    Br = ro.r.matrix(B, nrow=nr, ncol=nc)
    # initmethod Method to initialize model parameters. The valid values are "cemInitStep", "emInitStep" and "randomInit"
    # nbiterationsxem : Number of EM iterations used during xem step. Default value is 50.
    # nbinitmax : Maximal number initialization to try. Default value is 100
    # nbinititerations : Number of Global iterations used in initialization step. Default value is 10.
    # initepsilon : Tolerance value used while initialization. Default value is 1e-2.
    # nbxem : Number of xem steps. Default value is 5.
    strategy = blockcluster.coclusterStrategy(
        initmethod="randomInit",
        nbinitmax=100,
        nbinititerations=10,
        nbiterationsXEM=5000,
        nbiterationsxem=10,
        initepsilon=1e-2,
        epsilonxem=1e-4,
        epsilonXEM=1e-10,
        stopcriteria="Likelihood",
        nbtry=1,
        nbxem=100,
    )
    # Time the co-clustering call (wall clock + process CPU usage).
    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    results = blockcluster.cocluster(
        Br,
        "binary",
        nbcocluster=robjects.IntVector([nb_row_clusters, nb_column_clusters]),
        nbCore=1,
        strategy=strategy,
    )
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()
    print(end_time - start_time)
    rowclass = np.array(results.slots["rowclass"])
    colclass = np.array(results.slots["colclass"])
    icl = results.slots["ICLvalue"][0]
    # Co-clustering agreement against the ground-truth labels.
    co_ari = CARI(row_clusters_index, column_clusters_index, rowclass,
                  colclass)
    # NOTE(review): the string below is a stray, misplaced docstring — it
    # is a no-op expression statement; preserved byte-identical.
    """Return `real`, `sys` and `user` elapsed time, like UNIX's command `time`
    You can calculate the amount of used CPU-time used by summing `user`
    and `sys`. `real` is just like the wall clock.
    """
    results = {
        "lib": "blockcluster",
        "n1": graph.shape[0],
        "n2": graph.shape[1],
        "nq": nb_row_clusters,
        "nl": nb_column_clusters,
        "dataset_file": dataset_file,
        "icl": icl,
        "cari": co_ari,
        "real": end_time - start_time,
        "sys": end_resources.ru_stime - start_resources.ru_stime,
        "user": end_resources.ru_utime - start_resources.ru_utime,
    }
    print(f'BlockCluster tt time {results["user"]+results["sys"]}')
    pickle.dump(
        results,
        open(
            results_folder + dataset_file.split("/")[-1].split(".")[0] +
            "_bc.pkl",
            "wb",
        ),
    )
    return results