Esempio n. 1
0
def test_nainteger():
    """Setting an element to NA_Integer makes R's is.na() report it as NA."""
    values = robjects.IntVector(range(3))
    values[0] = robjects.NA_Integer
    na_flags = robjects.baseenv['is.na'](values)
    assert na_flags[0] is True
Esempio n. 2
0
 def testNewFromOrdDict(self):
     """A DataFrame built from an rlc.OrdDict preserves column values."""
     od = rlc.OrdDict(c=(('a', robjects.IntVector((1, 2))),
                         ('b', robjects.StrVector(('c', 'd')))))
     dataf = robjects.DataFrame(od)
     # assertEqual: `assertEquals` is a deprecated alias (removed in Py 3.12).
     self.assertEqual(1, dataf.rx2('a')[0])
Esempio n. 3
0
def report_performance(trueY, scoreY, n=None):
    """Compute binary-classification metrics through R (pROC + ROCR).

    Parameters
    ----------
    trueY : pandas Series of ground-truth labels (rows where either input is
        null are dropped before scoring).
    scoreY : pandas Series of prediction scores aligned with ``trueY``.
    n : optional int; when given, inputs are sent to R as matrices with
        ``ncol=n`` — TODO confirm the intended matrix layout against callers.

    Returns
    -------
    (res, res_array) : ``res`` is a dict of scalar metrics (AUROC with 95% CI,
        rates at the cutoff maximizing sens+spec, PPV/NPV/F/odds, and an
        odds-ratio CI); ``res_array`` is a DataFrame of the per-cutoff curves.
    """
    # auc, 95%ci, tpr,tnr,fpr,fnr,ppv,npv,f1, odds ratio, OR 95%ci, P, N, tp, tn, fp, fn
    # Keep only positions where both inputs are non-null.
    loc1 = trueY.isnull()
    loc2 = scoreY.isnull()
    locs = np.logical_not(np.logical_or(loc1, loc2))
    trueY = trueY[locs]
    scoreY = scoreY[locs]
    res = {}
    proc = importr('pROC')
    if n is None:
        ground = ro.vectors.IntVector(trueY)
        score = ro.vectors.FloatVector(scoreY)  #[:,1])
    else:
        ground = ro.r.matrix(ro.IntVector(trueY), ncol=n)
        score = ro.r.matrix(ro.FloatVector(scoreY), ncol=n)  #[:,1])
    # NOTE(review): ci='True' passes the *string* "True", not an R logical —
    # confirm pROC coerces it as intended.
    roc1 = proc.roc(ground, score, direction='<', ci='True')  #roc1.names
    res['auc'] = roc1.rx2('ci')[1]  # auroc
    res['auc_cilow'] = roc1.rx2('ci')[0]  # low ci
    res['auc_cihigh'] = roc1.rx2('ci')[2]  # high ci

    rocr = importr('ROCR')
    pre = rocr.prediction(score, ground)
    pref = rocr.performance(pre, 'sens', 'spec')  #tuple(pref.slotnames())
    #print(np.array(pref.slots['x.values'][0]))
    # Pick the cutoff index maximizing sensitivity + specificity
    # (y.values = sens, x.values = spec in this ROCR performance object).
    sumsenspe = np.array(pref.slots['x.values'][0]) + np.array(
        pref.slots['y.values'][0])
    #print(sumsenspe)
    maxloc = np.argmax(sumsenspe)
    #print(pref.slots['y.values'][0])
    res_array = np.array(pref.slots['alpha.values'][0])
    tpr = np.array(pref.slots['y.values'][0])[maxloc]  #)[0][maxloc]
    tnr = np.array(pref.slots['x.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0])))
    res_array = np.vstack((res_array, np.array(pref.slots['x.values'][0])))
    fpr = 1 - tnr
    fnr = 1 - tpr
    res['cutoff'] = np.array(pref.slots['alpha.values'][0])[maxloc]
    res['tpr'] = tpr
    res['tnr'] = tnr
    res['fpr'] = fpr
    res['fnr'] = fnr
    pref = rocr.performance(pre, 'ppv')
    res['ppv'] = np.array(pref.slots['y.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0])))
    pref = rocr.performance(pre, 'npv')
    res['npv'] = np.array(pref.slots['y.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0])))
    pref = rocr.performance(pre, 'f')
    res['fscore'] = np.array(pref.slots['y.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0])))
    pref = rocr.performance(pre, 'odds')
    res['odds'] = np.array(pref.slots['y.values'][0])[maxloc]
    res_array = np.vstack((res_array, np.array(pref.slots['y.values'][0]))).T
    res_array = pd.DataFrame(
        res_array, columns=['score', 'TPR', 'TNR', 'PPV', 'NPV', 'F', 'ODDS'])
    # Reconstruct confusion-matrix counts from the rates at the chosen cutoff.
    P = np.sum(trueY)
    N = len(trueY) - P
    tp = tpr * P
    tn = tnr * N
    fp = fpr * N
    fn = fnr * P
    res['P'] = P
    res['N'] = N
    # 95% CI for the odds ratio on the log scale: SE = sqrt(sum of 1/cell).
    siglog = np.sqrt(1 / tp + 1 / tn + 1 / fp + 1 / fn)
    zalph = norm.ppf(0.975)
    #odds = tp*tn / (fp*fn)
    #print(odds)
    logOR = np.log(res['odds'])
    loglo = logOR - zalph * siglog
    loghi = logOR + zalph * siglog
    ORlo = np.exp(loglo)
    ORhi = np.exp(loghi)
    res['ORlo'] = ORlo
    res['ORhi'] = ORhi
    return res, res_array
Esempio n. 4
0
# source:
#
# simple example
import rpy2.robjects as robjects

# Look up R's built-in constant pi.
pi = robjects.r['pi']
pi[0]

# Define an R function f(r) that returns the circumference 2*pi*r, then call
# it.  (The original snippet defined `r` but called the undefined `f`, and the
# function returned nothing, so the 18.85 result below was unreachable.)
robjects.r('''
    f <- function(r, verbose=FALSE) {
        if (verbose) {
          cat("I am calling f().\n")
        }
        2 * pi * r
      }
    f(3)
  ''')
# 18.85

# another simple example
Letters = robjects.r['letters']
# Fixed: use the `Letters` variable defined above (`letters` was a NameError),
# use straight ASCII quotes inside the R code (curly quotes break R's parser),
# and actually evaluate the expression before printing.
rcode = 'paste(%s, collapse="-")' % (Letters.r_repr())
res = robjects.r(rcode)
print(res)

# a more interesting/useful example

r = robjects.r
x = robjects.IntVector(range(10))
y = r.rnorm(10)
r.X11()  # for WinTel?
r.layout(r.matrix(robjects.IntVector([1, 2, 3, 2]), nrow=2, ncol=2))
r.plot(r.runif(10), y, xlab="runif", ylab="foo/bar", col="red")
Esempio n. 5
0
 def test_takeLogException(self):
     """takeLog(vector, -2) must raise ValueError — presumably because a
     non-positive log base/argument is invalid; confirm in rFunctions."""
     vector = R.IntVector(
         (59843, 34982, 12425, 90534, 34532, 54642, 1239, 43534))
     self.assertRaises(ValueError, rFunctions.takeLog, vector, -2)
Esempio n. 6
0
def test_lda_r(cls, feats, cl_sl, boots, fract_sample, lda_th, tol_min, nlogs):
    """Bootstrapped LDA effect sizes computed through R's lda().

    Appears to implement a LEfSe-style LDA step — TODO confirm provenance.

    Parameters (as used here):
        cls: dict with key 'class' listing per-sample class labels.
        feats: dict mapping feature name -> list of per-sample values;
            mutated in place (a 'class' key is added, low-variance features
            are jittered).
        cl_sl: unused in this body — presumably subclass labels; confirm.
        boots: number of bootstrap iterations.
        fract_sample: fraction of samples drawn per bootstrap.
        lda_th: absolute log-effect-size threshold for the filtered result.
        tol_min: tolerance passed to R's lda().
        nlogs: unused in this body — confirm against callers.

    Returns:
        (res, filtered): per-feature signed log10 effect size, and the subset
        whose absolute value exceeds lda_th.

    NOTE(review): `fk = feats.keys()` is later indexed (`fk[0]`) and sliced —
    this requires Python 2 list semantics; under Python 3 it would need
    `list(feats.keys())`.
    """
    fk = feats.keys()
    means = dict([(k, []) for k in feats.keys()])
    feats['class'] = list(cls['class'])
    clss = list(set(feats['class']))
    # Jitter features whose within-class value set is too small, so lda()
    # does not fail on zero within-class variance.
    for uu, k in enumerate(fk):
        if k == 'class': continue
        ff = [(feats['class'][i], v) for i, v in enumerate(feats[k])]
        for c in clss:
            if len(set([float(v[1]) for v in ff if v[0] == c])) > max(
                    float(feats['class'].count(c)) * 0.5, 4):
                continue
            for i, v in enumerate(feats[k]):
                if feats['class'][i] == c:
                    feats[k][i] = math.fabs(feats[k][i] + lrand.normalvariate(
                        0.0, max(feats[k][i] * 0.05, 0.01)))
    # Build the R data frame `d` (strings for metadata, floats for features).
    rdict = {}
    for a, b in feats.items():
        if a == 'class' or a == 'subclass' or a == 'subject':
            rdict[a] = robjects.StrVector(b)
        else:
            rdict[a] = robjects.FloatVector(b)
    robjects.globalenv["d"] = robjects.DataFrame(rdict)
    lfk = len(feats[fk[0]])
    rfk = int(float(len(feats[fk[0]])) * fract_sample)
    # Formula "class ~ feat1 + feat2 + ...".
    f = "class ~ " + fk[0]
    for k in fk[1:]:
        f += " + " + k.strip()
    ncl = len(set(cls['class']))
    min_cl = int(
        float(min([cls['class'].count(c) for c in set(cls['class'])])) *
        fract_sample * fract_sample * 0.5)
    min_cl = max(min_cl, 1)
    # All ordered class pairs with a > b (each unordered pair once).
    pairs = [(a, b) for a in set(cls['class']) for b in set(cls['class'])
             if a > b]

    for k in fk:
        for i in range(boots):
            means[k].append([])
    for i in range(boots):
        # Resample (up to 1000 tries) until the subsample has enough
        # contrast/members per class.
        for rtmp in range(1000):
            rand_s = [lrand.randint(0, lfk - 1) for v in range(rfk)]
            if not contast_within_classes_or_few_per_class(
                    feats, rand_s, min_cl, ncl):
                break
        rand_s = [r + 1 for r in rand_s]  # 1-based indices for R
        # NOTE(review): `k` here is left over from the preceding loop — this
        # resets only the last feature's slot; looks intentional-by-accident,
        # preserved as-is.
        means[k][i] = []
        for p in pairs:
            robjects.globalenv["rand_s"] = robjects.IntVector(rand_s)
            robjects.globalenv["sub_d"] = robjects.r('d[rand_s,]')
            z = robjects.r('z <- suppressWarnings(lda(as.formula(' + f +
                           '),data=sub_d,tol=' + str(tol_min) + '))')
            # First discriminant axis, normalized to unit length.
            robjects.r('w <- z$scaling[,1]')
            robjects.r('w.unit <- w/sqrt(sum(w^2))')
            robjects.r('ss <- sub_d[,-match("class",colnames(sub_d))]')
            if 'subclass' in feats:
                robjects.r('ss <- ss[,-match("subclass",colnames(ss))]')
            if 'subject' in feats:
                robjects.r('ss <- ss[,-match("subject",colnames(ss))]')
            robjects.r('xy.matrix <- as.matrix(ss)')
            robjects.r('LD <- xy.matrix%*%w.unit')
            # Effect size = |mean LD score difference| between the two classes.
            robjects.r('effect.size <- abs(mean(LD[sub_d[,"class"]=="' + p[0] +
                       '"]) - mean(LD[sub_d[,"class"]=="' + p[1] + '"]))')
            scal = robjects.r('wfinal <- w.unit * effect.size')
            rres = robjects.r('mm <- z$means')
            rowns = list(rres.rownames)
            lenc = len(list(rres.colnames))
            coeff = [
                abs(float(v)) if not math.isnan(float(v)) else 0.0
                for v in scal
            ]
            # Per-class feature means; zeros if a class is absent in the fit.
            res = dict([
                (pp, [float(ff)
                      for ff in rres.rx(pp, True)] if pp in rowns else [0.0] *
                 lenc) for pp in [p[0], p[1]]
            ])
            for j, k in enumerate(fk):
                gm = abs(res[p[0]][j] - res[p[1]][j])
                means[k][i].append((gm + coeff[j]) * 0.5)
    res = {}
    # Aggregate: max over class pairs of the bootstrap mean, on a signed
    # log10(1+|m|) scale.
    for k in fk:
        m = max([
            numpy.mean([means[k][kk][p] for kk in range(boots)])
            for p in range(len(pairs))
        ])
        res[k] = math.copysign(1.0, m) * math.log(1.0 + math.fabs(m), 10)
    return res, dict([(k, x) for k, x in res.items() if math.fabs(x) > lda_th])
Esempio n. 7
0
                       robjects.RVector([]),
                       staticmethod(lambda x: isinstance(x, robjects.RVector)))

def bool_vector_conv(v):
    """Convert *v* to an R vector of booleans (delegates to vector_conv)."""
    return vector_conv(v, bool)

# Typed R-vector constant: converter, prototype value, and an isinstance
# check against robjects.RVector.
RBoolVector = new_constant('RBoolVector' , staticmethod(bool_vector_conv), 
                            robjects.BoolVector([]),
                            staticmethod(lambda x: isinstance(x, robjects.RVector)),
                            base_class=RVector)

def int_vector_conv(v):
    """Convert *v* to an R vector of integers (delegates to vector_conv)."""
    return vector_conv(v, int)

RIntVector = new_constant('RIntVector' , staticmethod(int_vector_conv), 
                            robjects.IntVector([]),
                            staticmethod(lambda x: isinstance(x, robjects.RVector)),
                            base_class=RVector)

def float_vector_conv(v):
    """Convert *v* to an R vector of floats (delegates to vector_conv)."""
    return vector_conv(v, float)

RFloatVector = new_constant('RFloatVector' , staticmethod(float_vector_conv), 
                            robjects.FloatVector([]),
                            staticmethod(lambda x: isinstance(x, robjects.RVector)),
                            base_class=RVector)

def str_vector_conv(v):
    """Convert *v* to an R vector of strings (delegates to vector_conv)."""
    return vector_conv(v, str)

RStrVector = new_constant('RStrVector' , staticmethod(str_vector_conv), 
Esempio n. 8
0
def test_sample_error():
    """Drawing 110 items from 100 without replacement fails inside R."""
    values = robjects.IntVector(range(100))
    with pytest.raises(ri.embedded.RRuntimeError):
        values.sample(110)
Esempio n. 9
0
def test_sample_replacement():
    """With replace=True, more items than the vector length can be drawn."""
    source = robjects.IntVector(range(100))
    drawn = source.sample(110, replace=True)
    assert len(drawn) == 110
Esempio n. 10
0
def test_sample_probabilities_novector():
    """`probabilities` may be a plain Python list, not only an R vector."""
    source = robjects.IntVector(range(100))
    drawn = source.sample(10, probabilities=[.01] * 100)
    assert len(drawn) == 10
Esempio n. 11
0
def test_sample_probabilities_error_len():
    """A probabilities vector shorter than the data raises ValueError."""
    source = robjects.IntVector(range(100))
    with pytest.raises(ValueError):
        source.sample(10, probabilities=robjects.FloatVector([.01] * 10))
Esempio n. 12
0
def test_sample():
    """Default sampling returns exactly the requested number of items."""
    source = robjects.IntVector(range(100))
    drawn = source.sample(10)
    assert len(drawn) == 10
Esempio n. 13
0
def test_itemsnonames():
    """items() on an unnamed vector yields (None, value) pairs in order."""
    vec = robjects.IntVector(range(3))
    assert [name for name, _ in vec.items()] == [None, None, None]
    assert [value for _, value in vec.items()] == [0, 1, 2]
Esempio n. 14
0
def test_tabulate():
    """tabulate() counts occurrences of each positive integer value."""
    vec = robjects.IntVector((1, 2, 1, 2, 1, 2, 2))
    counts = vec.tabulate()
    assert tuple(counts) == (3, 4)
Esempio n. 15
0
# Finish the ROC figure: diagonal chance line, annotation with averaged
# accuracy/AUROC over the `r` runs, then dump per-sample results to CSV.
plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
ax.annotate('Avg. accuracy: %0.4f\nAvg. AUROC:   %0.4f' %
            (avg_accuracy / float(r), avg_auc / float(r)),
            xy=(0.95, 0.1),
            xytext=(0.65, 0.1))
plt.tight_layout()
plt.show()
print("Average accuracy:", (avg_accuracy / float(r)))
print("Average AUROC:", (avg_auc / float(r)))
np.savetxt("rf_count.csv", correct_prediction_count, delimiter=",")
# Stability = fraction of runs in which each sample was predicted correctly.
X['Stability'] = np.divide(correct_prediction_count, float(r))
np.savetxt("rf_probabilities.csv", probabilities, delimiter=",", fmt="%2.4f")
np.savetxt("rf_predictions.csv", predictions, delimiter=",", fmt="%1d")
# Reattach ISS16 data to dataframe
X['Class'] = y
X.to_csv("UQ_rf_results.csv", index=False)

# Generate clusters via Ckmeans
# (`ck` is presumably the Ckmeans.1d.dp R routine imported earlier — confirm.)
r_x = ro.FloatVector(X['Stability'])
c1_x = ck(r_x, 2)
#c2_x = ck(r_x, 3)

# Convert clusters to numpy arrays and append to data
c2 = np.array(ro.IntVector(c1_x[0]))
X['Cluster2'] = c2
# c3 = np.array(ro.IntVector(c2_x[0]))
# X['Cluster3'] = c3
X.to_csv('UQ_rf_clusters_results.csv', index=False)
Esempio n. 16
0
def plot1(moptions, significant_pos, curn):
   """Collect signal values and p-value labels around a significant position
   and hand them to the R plotting routine `Base_Most_Significant_Plot`.

   Python 2 code (print statements, dict ordering assumptions) — do not run
   under Python 3 without porting.

   Parameters (as used here):
       moptions: options dict holding the two datasets (keyed by
           moptions['ds2']), test settings, and the R plot function in
           robjects.globalenv.
       significant_pos: one entry of moptions['sign_test'] —
           ((chr, strand, pos, ...), pvalue-structure).
       curn: 0-based rank of this position (for the plot title).

   Returns:
       noenough (bool): True if the neighborhood lacked data and no plot
       was produced.
   """
   m_signal = [] #deque() #[]
   m_pos = [] #deque() #[]
   m_ds = [] #deque() #[]

   curchr = significant_pos[0][0];
   curstrand = significant_pos[0][1];
   curpos = significant_pos[0][2];

   # Title includes the combined p-value when neighbor p-values are in play
   # (and the test is not plain KS).
   if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
      mtitle = ("1=%s VS\n 2=%s:\n p-value=%.1E (ks test p=%.1E) at pos %d of %s strand in %s. Rank %d " % (moptions['ds2'][0], moptions['ds2'][1], significant_pos[1][3][1], significant_pos[1][2][1], curpos+1, curstrand, curchr, curn+1))
   else:
      mtitle = ("1=%s VS\n 2=%s:\n p-value=%.1E at pos %d of %s strand in %s. Rank %d  " % (moptions['ds2'][0], moptions['ds2'][1], significant_pos[1][2][1], curpos+1, curstrand, curchr, curn+1))

   ds0 = moptions[moptions['ds2'][0]]
   ds1 = moptions[moptions['ds2'][1]]

   ds2 = [ds0, ds1]

   sk = (curchr, curstrand)
   noenough = False;
   pv3 = {}
   cur_ind = moptions['sign_test'].index(significant_pos)
   print significant_pos, cur_ind, curn
   nearybysize = moptions["window"]
   if moptions['RegionRankbyST']==1: nearybysize = int(nearybysize*2)
   #for mind in range(cur_ind-moptions["window"], cur_ind+moptions["window"]+1):
   # Walk the neighborhood of cur_ind, gathering per-position p-values (pv3)
   # and per-read signal values for both datasets.
   for mind in range(cur_ind-nearybysize, cur_ind+nearybysize+1):
      if pos_check(moptions['sign_test'], cur_ind, mind):
         #print len(moptions['sign_test']), cur_ind, mind
         pk = moptions['sign_test'][mind][0][2]
         pv = moptions['sign_test'][mind][1]
         pv3[(pk, ds0['base'][sk][pk])] = pv
      else:
         noenough = True;
      if noenough: break;
      for mds_ind in range(len(ds2)):
         mna = ds2[mds_ind]['base'][sk][pk]
         for sg in ds2[mds_ind]['norm_mean'][sk][pk]:
            m_ds.append("%d" % (mds_ind+1))
            # X-axis label: position/base plus the available p-values.
            if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
               if has_ut==1:
                  m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0][1], pv[1][1],pv[2][1],pv[3][1]))
               else:
                  m_pos.append('%d/%s\n%.1E\n%.1E' % (pk+1, mna, pv[2][1],pv[3][1]))
            else:
               if has_ut==1:
                  m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0][1], pv[1][1],pv[2][1]))
               else:
                  m_pos.append('%d/%s\n%.1E' % (pk+1, mna, pv[2][1]))
            m_signal.append(round(sg,3))

   #for pk in range(curpos-moptions["window"], curpos+moptions["window"]+1):
   #   pv = None;
   #   if pk==curpos: pv = significant_pos[1]
   #   else:
   #      if ds1['norm_mean'].has_key(sk) and ds1['norm_mean'][sk].has_key(pk) and ds0['norm_mean'].has_key(sk) and ds0['norm_mean'][sk].has_key(pk):
   #         pv = getUtest(ds0['norm_mean'][sk][pk], ds1['norm_mean'][sk][pk])
   #   if pv==None:
   #      noenough = True;
   #   else:
   #      cur_comb_pv = get_fisher_comb_pvalues(moptions, significant_pos)
   #      if not cur_comb_pv==None:
   #         pv.append(cur_comb_pv)
   #      pv3[(pk, ds0['base'][sk][pk])] = pv
   #   if noenough: break;
   #
   #   for mds_ind in range(len(ds2)):
   #      mna = ds2[mds_ind]['base'][sk][pk]
   #      for sg in ds2[mds_ind]['norm_mean'][sk][pk]:
   #         m_ds.append("%d" % (mds_ind+1))
   #        if moptions["neighborPvalues"]>0:
   #            m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0], pv[1],pv[2],pv[3]))
   #         else:
   #            m_pos.append('%d/%s\n%.1E\n%.1E\n%.1E' % (pk+1, mna, pv[0], pv[1],pv[2]))
   #        m_signal.append(round(sg,3))

   if not noenough:
      closesize = moptions["neighborPvalues"]*2
      if moptions['RegionRankbyST']==1:
         closesize = moptions["window"]
         if closesize<1: closesize = 1

      #if significant_pos[0][1]=='-' and 3072-moptions["neighborPvalues"]*3<=significant_pos[0][2]<=3072+moptions["neighborPvalues"]*3:
      # NOTE(review): the 3072 constant presumably marks a known modification
      # site on the '-' strand — confirm against the pipeline's reference.
      if significant_pos[0][1]=='-' and 3072-closesize<significant_pos[0][2]<3072+closesize:
         print 'Rank', curn+1, moptions["testMethod"], moptions["FileID"], significant_pos[0][0], significant_pos[0][1], significant_pos[0][2]+1, significant_pos[0][3]

      #poskeys = deque(); pvsp3 = [deque(), deque(), deque()]
      # pvsp3 holds log10 p-values per test (u, t, ks, and optional combined).
      poskeys = []; pvsp3 = [[], [], [], []]
      #print 'pvsp3', pvsp3
      pv3keys = pv3.keys(); pv3keys.sort()
      for pv3k in pv3keys:
          if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
             print ('%d/%s' % (pv3k[0]+1, pv3k[1])), ('u=%.3E(%.3E) t=%.3E(%.3E) ks=%.3E(%.3E) pv5=%.3E(%.3E)' % (pv3[pv3k][0][1],pv3[pv3k][0][0], pv3[pv3k][1][1],pv3[pv3k][1][0],  pv3[pv3k][2][1],pv3[pv3k][2][0], pv3[pv3k][3][1],pv3[pv3k][3][0]))
          else:
             print ('%d/%s' % (pv3k[0]+1, pv3k[1])), ('u=%.3E(%.3E) t=%.3E(%.3E) ks=%.3E(%.3E)' % (pv3[pv3k][0][1],pv3[pv3k][0][0], pv3[pv3k][1][1],pv3[pv3k][1][0],  pv3[pv3k][2][1],pv3[pv3k][2][0]))
          poskeys.append('%d/%s' % (pv3k[0]+1, pv3k[1]))
          #pvsp3[0].append(pv3[pv3k][0])
          #pvsp3[1].append(pv3[pv3k][1])
          #pvsp3[2].append(pv3[pv3k][2])
          pvsp3[0].append(round(math.log10(pv3[pv3k][0][1]), 3))
          pvsp3[1].append(round(math.log10(pv3[pv3k][1][1]), 3))
          pvsp3[2].append(round(math.log10(pv3[pv3k][2][1]), 3))
          if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
             pvsp3[3].append(round(math.log10(pv3[pv3k][3][1]), 3))
      print ''

      # Build R data frames for each test's p-value track and the signal plot.
      stu = {"Position":robjects.StrVector(poskeys), "Pvalue":robjects.FloatVector(pvsp3[0])}; stru = robjects.DataFrame(stu)
      stt = {"Position":robjects.StrVector(poskeys), "Pvalue":robjects.FloatVector(pvsp3[1])}; strt = robjects.DataFrame(stt)
      stks ={"Position":robjects.StrVector(poskeys), "Pvalue":robjects.FloatVector(pvsp3[2])}; strks= robjects.DataFrame(stks)
      if moptions["neighborPvalues"]>0 and (not moptions["testMethod"]=="ks"):
         stcb ={"Position":robjects.StrVector(poskeys), "Pvalue":robjects.FloatVector(pvsp3[3])};
      else:
         stcb ={"Position":robjects.StrVector([]), "Pvalue":robjects.FloatVector(pvsp3[3])};
      strcb= robjects.DataFrame(stcb)

      pydf = {"Signal":robjects.FloatVector(m_signal), "Position":robjects.StrVector(m_pos), "DS":robjects.FactorVector(robjects.StrVector(m_ds))}
      plotDat = robjects.DataFrame(pydf)

      mrtitle = robjects.StrVector([mtitle])
      mhasbox = robjects.IntVector([has_boxplot])
      mplotType = robjects.StrVector([moptions['plotType']])

      sys.stdout.flush()
      robjects.globalenv['Base_Most_Significant_Plot'](plotDat, stru, strt, strks, strcb, mrtitle, mhasbox, mplotType)

   return noenough
Esempio n. 17
0
            data[w][1] <= t2 or 16 - data[w][1] <= t2)
    ]
    # Partition words into a 2x2 table by thresholds t1 (axis 0) and t2
    # (axis 1); the "16 - value" alternative presumably handles a circular/
    # complementary coordinate — confirm against the data's encoding.
    B = [
        w for w in data if (data[w][0] <= t1 or 16 - data[w][0] <= t1) and (
            data[w][1] > t2 or 16 - data[w][1] > t2)
    ]
    C = [
        w for w in data if (data[w][0] > t1 or 16 - data[w][0] > t1) and (
            data[w][1] <= t2 or 16 - data[w][1] <= t2)
    ]
    D = [
        w for w in data if (data[w][0] > t1 or 16 - data[w][0] > t1) and (
            data[w][1] > t2 or 16 - data[w][1] > t2)
    ]

# 2x2 contingency table (column-major: A,C then B,D) for Fisher's exact test.
# Python 2 code (print >> stderr).
d = matrix(R.IntVector([len(A), len(C), len(B), len(D)]), nrow=2)
result = fisher_test(d)

if test == 't':
    # Tab-separated row: thresholds, cell counts, total, then the test's
    # estimate and p-value pulled from the R result object.
    print >> stderr, "\t".join(
        map(str, [
            p1, p2, t1, t2,
            len(A),
            len(C),
            len(B),
            len(D),
            len(A) + len(B) + len(C) + len(D), result[2][0], result[0][0]
        ]))
elif test == 'w':
    print >> stderr, "\t".join(
        map(str, [
Esempio n. 18
0
    mirnas.append(final_mirnas[0][i][0])
    for k, j in zip(
            clinical_and_files, final_mirnas
    ):  ## These lists contain the clinical information and miRNA data in the same order.
        kaplan.append([k[1], k[2], k[3], k[4], k[5], j[i][1]])
    data = [
        ii[-1] for ii in kaplan
    ]  ## Grabbing all the mirna values for the current mirna being analyzed
    ro.globalenv['expression'] = ro.FloatVector(data)
    res = ro.r(
        'round(qnorm((rank(expression, na.last="keep")-0.5)/sum(!is.na(expression))), digit=5)'
    )  ## Perform inverse normal transformation
    inverse_norm = list(res)  ## Convert robject to python list
    ## Prepare the variables for rpy2
    ro.globalenv['mirna'] = ro.FloatVector(inverse_norm)
    ro.globalenv['times'] = ro.IntVector([ii[0] for ii in kaplan])
    ro.globalenv['died'] = ro.IntVector([death_dic[ii[1]] for ii in kaplan])

    ##ductal
    ductal = []
    for ii in kaplan:
        if ii[2] == 1:
            ductal.append(1)
        else:
            ductal.append(0)

    ##metaplastic
    metaplastic = []
    for ii in kaplan:
        if ii[2] == 3:
            metaplastic.append(1)
Esempio n. 19
0
# Python 2 script section: merge hemispheres, re-index, and sort via R.
print "Recombine and index"

# Recombine the two hemisphere's into one unified whole!
# Also check if anyone's missing
df_new = df_both["lh"].append(df_both["rh"])

# Redo index
df_new.index = range(df_new.shape[0])

# Sort the column
print "Sort by Hemi, Cluster, and Stat"

import rpy2.robjects as robjects
r = robjects.r

# Use R's order() for the multi-key descending sort; subtract 1 to convert
# R's 1-based indices to Python's 0-based.
cluster = robjects.IntVector(df_new.Cluster.tolist())
network = robjects.StrVector(df_new.YeoNetwork.tolist())
stat = robjects.FloatVector(df_new.Stat.tolist())

o = np.array(r.order(cluster, network, stat, decreasing=True)) - 1
# NOTE(review): DataFrame.ix is long-deprecated; .iloc would be the modern
# positional equivalent here.
df2 = df_new.ix[o, :]

#####

print "Combine, Select, Mash"

# Combine the aparc, subcortical, and cerebellum
cols = [
    "Cluster", "Network", "Hemi", "Region", "BA", "x", "y", "z", "Statistic"
]
dict3 = {k: [] for k in cols}
Esempio n. 20
0
 def testNAInteger(self):
     """An element set to NA_Integer is reported as NA by R's is.na()."""
     values = robjects.IntVector(range(3))
     values[0] = robjects.NA_Integer
     is_na = robjects.baseenv['is.na'](values)
     self.assertTrue(is_na[0])
Esempio n. 21
0
                    'glom': su[3][i],
                    'Driver': su[4][i],
                    'Gender': su[5][i],
                    'n': su[6][i]
                } )

    # Round-trip the R result through JSON to get plain Python structures.
    results = json.loads(toJSON(res)[0])

    #Generate a 3d html from the results
    plot3d = robjects.r('plot3d')
    writeWebGL = robjects.r('writeWebGL')

    #Summary comes ordered by reverse score (muscore). However, the hits are based solely on forward score
    #If we prefer muscore, use hit numbers ('n') of the first few entries and then assign new the hit numbers
    if not prefer_muscore:
        # Take the first `hits` forward-score hits; map hit number -> name.
        h = robjects.IntVector(range(hits + 1))
        hit_names = []
        for i in range(hits):
            hit_names.append([e['name'] for e in s if e['n'] == i+1][0])
    else:
        # Keep the muscore ordering of `s` and renumber hits sequentially.
        h = robjects.IntVector([e['n'] for e in s[:hits]])
        hit_names = [e['name'] for e in s[:hits]]
        #Reassign the 'n' (hit) value
        for i, n in enumerate(s):
            s[i]['n'] = i+1
    if db == 'fc':
        plot3d(res, hits=h, db=fcdps, soma=True)
    elif db == 'gmr':
        plot3d(res, hits=h, db=gmrdps, soma=True)
Esempio n. 22
0
 def testRepr(self):
     """repr() of an IntVector right-aligns the values on its third line."""
     vec = robjects.IntVector((1, 2, 3))
     lines = repr(vec).split('\n')
     self.assertEqual('[       1,        2,        3]', lines[2])
Esempio n. 23
0
File: runDE.py Progetto: wqhf/flair
def main():
    '''
    Run a DESeq2 differential-expression analysis between two groups:
    read the count matrix and design formula, fit DESeq2 in R, emit QC
    plots (PCA, MA, p-value histogram, dispersion) to a PDF, and write
    the (shrunken and unshrunken) results tables as TSV.
    Relies on module-level R handles (R, Formula, pandas2ri) imported
    elsewhere in the file.
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir     = myCommandLine.args['outDir']
    group1     = myCommandLine.args['group1']
    group2     = myCommandLine.args['group2']
    batch      = myCommandLine.args['batch']  
    matrix     = myCommandLine.args['matrix']
    prefix     = myCommandLine.args['prefix']
    formula    = myCommandLine.args['formula']

    print("running DESEQ2 %s" % prefix, file=sys.stderr)

    # make the quant DF
    # NOTE(review): pandas2ri.py2ri was removed in rpy2 >= 3 (py2rpy) —
    # this code targets rpy2 2.x.
    quantDF  = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF     = pd.read_csv(formula,header=0, sep="\t",index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    # Include a batch term only when the design table provides one.
    if "batch" in list(formulaDF):
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")
   

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods   = importr('methods')
    deseq     = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman     = importr('qqman')



    ### RUN DESEQ2 ###
    R.assign('df', df)
    R.assign('sampleTable', sampleTable)
    R.assign('design',design)
    R('dds <- DESeqDataSetFromMatrix(countData = df, colData = sampleTable, design = design)')
    R('dds <- DESeq(dds)')
    R('name <- grep("condition", resultsNames(dds), value=TRUE)')

    ###
    ###
    # Get Results and shrinkage values
    res    = R('results(dds, name=name)')
    resLFC = R('lfcShrink(dds, coef=name)')
    vsd    = R('vst(dds,blind=FALSE)')
    resdf  = robjects.r['as.data.frame'](res) 
    reslfc = robjects.r['as.data.frame'](resLFC)
    dds    = R('dds')

    
    ### Plotting section ###
    # plot MA and PC stats for the user
    plotMA    = robjects.r['plotMA']
    plotDisp  = robjects.r['plotDispEsts']
    plotPCA   = robjects.r['plotPCA']
    plotQQ    = robjects.r['qq']
    
    # get pca data
    if "batch" in list(formulaDF):
        pcaData    = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")), returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    else:
        print(vsd)
        pcaData    = plotPCA(vsd, intgroup="condition", returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    # arrange 


    data_folder = os.path.join(os.getcwd(), outdir)
    qcOut = os.path.join(data_folder, "%s_QCplots_%s_v_%s.pdf"  % (prefix,group1,group2))
    
    grdevices.pdf(file=qcOut)

    # NOTE(review): "%% variance" is outside the %-formatted string, so it
    # stays a literal "%%" in the axis label — likely meant a single "%".
    x = "PC1: %s" % int(percentVar[0]*100) + "%% variance"
    y = "PC2: %s" % int(percentVar[1]*100) + "%% variance"

    if "batch" in list(formulaDF):
        pp = ggplot2.ggplot(pcaData) + \
                ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()

    else:
        pp = ggplot2.ggplot(pcaData) + \
                ggplot2.aes_string(x="PC1", y="PC2", color="condition") + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()
    pp.plot()
    plotMA(res, ylim=robjects.IntVector((-3,3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3,3)), main="MA-plot LFCSrhinkage")    
    plotQQ(reslfc.rx2('pvalue'), main="LFCSrhinkage pvalue QQ")
    hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic() + \
            ggplot2.ggtitle("pvalue distribution")
    hh.plot()
    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()


    data_folder = os.path.join(os.getcwd(), outdir)
    lfcOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results_shrinkage.tsv"  % (prefix,group1,group2))
    resOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results.tsv"  % (prefix,group1,group2))
   
    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
Esempio n. 24
0
 def testItemsNoNames(self):
     """items() on an unnamed vector pairs each value with a None key."""
     vec = robjects.IntVector(range(3))
     self.assertEqual([None, None, None], [key for key, _ in vec.items()])
     self.assertEqual([0, 1, 2], [value for _, value in vec.items()])
Esempio n. 25
0
	# endpoint
	endpoint = endpoint_data[:, i]
	group0 = endpoint == 0
	# ensure we have at least 2 samples in each class
	num_grp0 = sum(group0 == True)
	num_grp1 = sum(group0 == False)
	if num_grp0 > 1 and num_grp1 > 1:
	    fl = robjects.FactorVector(endpoint) # factor for R limma
	    robjects.globalenv["description"] = fl
	    fmla = robjects.Formula('~ description + 0')
            design = rstats.model_matrix(fmla)  
            design.colnames = robjects.StrVector(['Norm','Mut']) 
            # print(design)
            # robjects.globalenv["design"] = design
            fit = limma.lmFit(m, design)
            contMat = robjects.IntVector([-1, 1])   
            fit2 = limma.contrasts_fit(fit, contMat)
            fit2 = limma.eBayes(fit2)   
            corrGenes = limma.decideTests(fit2, adjust_method='fdr', p_value=0.01)
            tT = limma.topTable(fit2, adjust='fdr', sort_by="B", number=ncol, genelist=geneNames)
            # print(r.head(tT))
            # loop through corrGenes, and find the DEGs
            DEGs = []
            for x in xrange(0, ncol):
	      	if corrGenes[x] != 0.0:
		    DEGs.append(x)
            # create expression matrix just containing DEGs
	    DEGs_data = train_data[:, DEGs]
	    numDEGs = DEGs_data.shape[1]
	    # create 2 binary expression matrices; 1. Up-regulated genes; 2. Down-regulated genes
  	    # For 1; if z-score > 2, value = 1, 0 otherwise
Esempio n. 26
0
 def testNewIntVector(self):
     """An IntVector built from a Python list keeps values and length."""
     vec = robjects.IntVector([123, 456])
     self.assertEqual(2, len(vec))
     self.assertEqual(123, vec[0])
     self.assertEqual(456, vec[1])
Esempio n. 27
0
coeffs=[]
pvalues=[]
genes=[] ##This list tracks the gene names
for i in range(len(final_genes[0])):
    kaplan=[]
    genes.append(final_genes[0][i][0])
    for k,j in zip(clinical_and_files,final_genes):  ## These lists contain the clinical information and mRNA data in the same order.
        kaplan.append([k[1],k[2],k[3],k[4],k[5],j[i][1]])
    data=[ii[-1] for ii in kaplan]  ## Grabbing all the gene values for the current gene being analyzed
    ro.globalenv['expression']=ro.FloatVector(data)
    res=ro.r('round(qnorm((rank(expression, na.last="keep")-0.5)/sum(!is.na(expression))), digit=5)')  ## Perform inverse normal transformation
    inverse_norm=list(res)  ## Convert robject to python list
    ## Prepare the variables for rpy2
    ro.globalenv['gene']=ro.FloatVector(inverse_norm)
    ro.globalenv['times']=ro.IntVector([ii[0] for ii in kaplan])
    ro.globalenv['died']=ro.IntVector([death_dic[ii[1]] for ii in kaplan])
            
    ##grade1
    grade1=[]
    for ii in kaplan:
        if ii[2]==1:
            grade1.append(1)
        else:
            grade1.append(0)
            
    ##grade2
    grade2=[]
    for ii in kaplan:
        if ii[2]==2:
            grade2.append(1)
Esempio n. 28
0
# Fit an automatic ARIMA model to the tweet-frequency series via R's forecast.
model_1 = robjects.r['auto.arima'](env['freq_tweet'], trace=True)

# Ljung-Box residual test (FitARMA); NOTE(review): LjungBoxTest is called
# before the importr('FitARMA') below — presumably the package was already
# loaded earlier, confirm.
b_1 = robjects.r['LjungBoxTest'](robjects.r['residuals'](model_1), k = 1)

rpackages.importr('FitARMA')

robjects.r['checkresiduals'](model_1)


def get_dict_r(list_r):
    """Convert a named R list/vector into a plain dict of name -> value."""
    return {name: value for name, value in zip(list_r.names, list(list_r))}


# Seasonal ARIMA(0,1,2)(0,0,1)[7] fit on the weekly-seasonal tweet series.
model_2_1 = robjects.r['Arima'](env['freq_tweet'], 
                    order = robjects.IntVector([0,1,2]), 
                    seasonal =  robjects.r['list'](order = robjects.IntVector([0,0,1]), period = 7), 
                    )

b_2_1 = robjects.r['LjungBoxTest'](robjects.r['residuals'](model_2_1), k=1)

print(b_2_1)

# Bug fix: `model_2` was never defined (NameError at runtime); the model
# fitted above is `model_2_1`.
print(get_dict_r(model_2_1)['call'])

# Alternative seasonal specification: ARIMA(0,1,2)(1,0,0)[7].
model_2_2 = robjects.r['Arima'](env['freq_tweet'], 
                    order = robjects.IntVector([0,1,2]), 
                    seasonal =  robjects.r['list'](order = robjects.IntVector([1,0,0]), period = 7), 
                    )

b_2_2 = robjects.r['LjungBoxTest'](robjects.r['residuals'](model_2_2))
Esempio n. 29
0
        clinical_and_files.append(i)

##print average age at diagnosis
age=np.mean([i[5] for i in clinical_and_files])

##print number of males
males=len([i for i in clinical_and_files if i[4]==0])

##print number of females
females=len([i for i in clinical_and_files if i[4]==1])

##to get the median survival we need to call survfit from r


##prepare variables for R
ro.globalenv['times']=ro.IntVector([i[1] for i in clinical_and_files])

##need to create a dummy variable group
ro.globalenv['group']=ro.IntVector([0 for i in clinical_and_files])

##need a vector for deaths
# Map vital status strings to the 0/1 event indicator survfit expects.
death_dic={}
death_dic['Alive']=0
death_dic['Dead']=1
ro.globalenv['died']=ro.IntVector([death_dic[i[2]] for i in clinical_and_files])

res=ro.r('survfit(Surv(times,died) ~ as.factor(group))')

#the number of events(deaths) is the fourth column of the output
# NOTE(review): parsing survfit's printed summary is brittle — the column
# position depends on the R survival package's print format.
deaths=str(res).split('\n')[-2].strip().split()[3]
Esempio n. 30
0
def train_with_blockcluster(
    dataset_file,
    graph,
    nb_row_clusters,
    nb_column_clusters,
    row_clusters_index,
    column_clusters_index,
):
    """Co-cluster a binary graph with R's blockcluster and benchmark it.

    Skips work if a pickle for this dataset already exists in
    `results_folder`; otherwise runs coclustering, scores it against the
    ground-truth row/column partitions (CARI), records wall/CPU time, and
    pickles the result dict.

    Returns the results dict, or None when the output pickle already exists.
    """
    results_files_already_done = glob.glob(results_folder + "*.pkl")
    if (results_folder + dataset_file.split("/")[-1].split(".")[0] + "_bc.pkl"
            in results_files_already_done):
        print("Already Done")
        return None

    print("BlockCluster :")
    # Convert sparse matrix to R matrix.
    B = graph.todense()
    nr, nc = B.shape
    Br = ro.r.matrix(B, nrow=nr, ncol=nc)
    # initmethod Method to initialize model parameters. The valid values are "cemInitStep", "emInitStep" and "randomInit"
    #  nbiterationsxem : Number of EM iterations used during xem step. Default value is 50.
    # nbinitmax : Maximal number initialization to try. Default value is 100
    # nbinititerations : Number of Global iterations used in initialization step. Default value is 10.
    # initepsilon : Tolerance value used while initialization. Default value is 1e-2.
    # nbxem : Number of xem steps. Default value is 5.
    strategy = blockcluster.coclusterStrategy(
        initmethod="randomInit",
        nbinitmax=100,
        nbinititerations=10,
        nbiterationsXEM=5000,
        nbiterationsxem=10,
        initepsilon=1e-2,
        epsilonxem=1e-4,
        epsilonXEM=1e-10,
        stopcriteria="Likelihood",
        nbtry=1,
        nbxem=100,
    )

    # Time only the cocluster() call itself.
    start_time, start_resources = timestamp(), resource_usage(RUSAGE_SELF)
    results = blockcluster.cocluster(
        Br,
        "binary",
        nbcocluster=robjects.IntVector([nb_row_clusters, nb_column_clusters]),
        nbCore=1,
        strategy=strategy,
    )
    end_resources, end_time = resource_usage(RUSAGE_SELF), timestamp()
    print(end_time - start_time)
    rowclass = np.array(results.slots["rowclass"])
    colclass = np.array(results.slots["colclass"])
    icl = results.slots["ICLvalue"][0]
    co_ari = CARI(row_clusters_index, column_clusters_index, rowclass,
                  colclass)
    # NOTE(review): the string below is a stray no-op expression (it is not a
    # docstring — it sits mid-function); kept as-is, but it should probably
    # be a comment.
    """Return `real`, `sys` and `user` elapsed time, like UNIX's command `time`
    You can calculate the amount of used CPU-time used by summing `user`
    and `sys`. `real` is just like the wall clock.
    """
    results = {
        "lib": "blockcluster",
        "n1": graph.shape[0],
        "n2": graph.shape[1],
        "nq": nb_row_clusters,
        "nl": nb_column_clusters,
        "dataset_file": dataset_file,
        "icl": icl,
        "cari": co_ari,
        "real": end_time - start_time,
        "sys": end_resources.ru_stime - start_resources.ru_stime,
        "user": end_resources.ru_utime - start_resources.ru_utime,
    }
    print(f'BlockCluster tt time {results["user"]+results["sys"]}')
    pickle.dump(
        results,
        open(
            results_folder + dataset_file.split("/")[-1].split(".")[0] +
            "_bc.pkl",
            "wb",
        ),
    )
    return results