Example #1
    def write(self, data):
        group = self.get_data_group(data)
        dest = robjects.globalenv
        if group == 'data':
            datavals = data['data']
            ordering = data['ordering']
            attrlist = []
            nameind = 0
            names = data['names']
            types = data['types']
            for cur_feat in ordering:
                if len(datavals[cur_feat].shape) > 1:
                    for k in range(datavals[cur_feat].shape[0]):
                        if str(types[nameind]).startswith('nominal'):
                            attrlist.append((names[nameind], robjects.FactorVector(robjects.StrVector(datavals[cur_feat][k]))))
                        else:
                            attrlist.append((names[nameind], datavals[cur_feat][k]))
                        nameind += 1
                else:
                    if str(types[nameind]).startswith('nominal'):
                        attrlist.append((names[nameind], robjects.FactorVector(robjects.StrVector(datavals[cur_feat]))))
                    else:
                        attrlist.append((names[nameind], datavals[cur_feat]))
                    nameind += 1
            dest[data['name']] = robjects.DataFrame(rlc.OrdDict(attrlist))
        elif group == 'task':
            d = data[group]
            for k in list(d.keys()):
                dest[k] = d[k]
        robjects.r.save(*list(robjects.r.ls(dest)), file=self.fname)
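The example above uses the core rpy2 idiom this page collects: wrap a character vector in robjects.FactorVector before assembling a DataFrame. A minimal, self-contained sketch of that conversion, with invented column names and values:

import rpy2.robjects as robjects
import rpy2.rlike.container as rlc

# Nominal column: build an R factor from a character vector.
species = robjects.FactorVector(robjects.StrVector(["setosa", "setosa", "virginica"]))
# Numeric column: keep it as a plain FloatVector.
petal_length = robjects.FloatVector([1.4, 1.3, 5.1])

# OrdDict preserves the column order of the resulting R data.frame.
df = robjects.DataFrame(rlc.OrdDict([("species", species), ("petal_length", petal_length)]))
print(df)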
Example #2
    def analyse_permanova(self, user_request, otu_table, headers, sample_labels, metadata_values, strata_values, sample_ids_from_metadata):

        print("Starting PERMANOVA")
        groups = robjects.FactorVector(robjects.StrVector(metadata_values))

        # Forms an OTU only table (without IDs)
        allOTUs = []
        col = 0
        while col < len(otu_table[0]):
            colVals = []
            row = 0
            while row < len(otu_table):
                sampleID = sample_labels[row]
                if sampleID in sample_ids_from_metadata:
                    colVals.append(otu_table[row][col])
                row += 1
            allOTUs.append((headers[col], robjects.FloatVector(colVals)))
            col += 1

        od = rlc.OrdDict(allOTUs)
        dataf = robjects.DataFrame(od)

        if strata_values is None:
            permanova = self.veganR.betaDiversityPERMANOVA(dataf, groups)
        else:
            strata = robjects.FactorVector(robjects.StrVector(strata_values))
            permanova = self.veganR.betaDiversityPERMANOVAWithStrata(dataf, groups, strata)
        abundancesObj = {}
        abundancesObj["permanova"] = str(permanova)

        return abundancesObj
Example #3
def kNNClass(train_idx,test_idx,n_neighbors):
	training_data=input_kmers_counts.loc[train_idx]
	testing_data=input_kmers_counts.loc[test_idx]
	clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform")
	clf.fit(training_data[kmer_colums], training_data["class"])
	#print "predicting"
	predicted_classes= clf.predict(testing_data[kmer_colums])
	# compute kappa stat 
	confusion_matrix(testing_data["class"],predicted_classes)
	# make a mapping 
	class_map=dict(zip(set(testing_data["class"]),range(0,4)))
	kapp=kappa([class_map[x] for x in testing_data["class"]],[class_map[x] for x in predicted_classes])
	cm=caret.confusionMatrix(robjects.FactorVector(predicted_classes),robjects.FactorVector(testing_data["class"]))
	return kapp,cm
Example #4
    def get_r_pre_data_frame(self, model):
        pre_data_frame = {}
        for i in self.phenotype_dataframe.columns:
            if i in model.factors:
                pre_data_frame[i] = robjects.FactorVector(self.phenotype_dataframe[i])
            else:
                # Numeric columns map to Int/Float vectors; everything else becomes a factor
                if self.phenotype_dataframe[i].dtype.type in (np.int32, np.int64):
                    pre_data_frame[i] = robjects.IntVector(self.phenotype_dataframe[i])
                elif self.phenotype_dataframe[i].dtype.type in (np.float32, np.float64):
                    pre_data_frame[i] = robjects.FloatVector(self.phenotype_dataframe[i])
                else:
                    pre_data_frame[i] = robjects.FactorVector(self.phenotype_dataframe[i])

        return robjects.DataFrame(pre_data_frame)
Example #5
    def convert_pandas_to_r_data_frame(self, model, dataframe):
        pre_data_frame = {}
        for i in dataframe.columns:
            if i in model.factors:
                pre_data_frame[i] = robjects.FactorVector(dataframe[i])
            else:
                # Numeric columns map to Int/Float vectors; everything else becomes a factor
                if dataframe[i].dtype.type in (np.int32, np.int64):
                    pre_data_frame[i] = robjects.IntVector(dataframe[i])
                elif dataframe[i].dtype.type in (np.float32, np.float64):
                    pre_data_frame[i] = robjects.FloatVector(dataframe[i])
                else:
                    pre_data_frame[i] = robjects.FactorVector(dataframe[i])

        return robjects.DataFrame(pre_data_frame)
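A standalone sketch of the same dtype-based dispatch for a single column, with invented values (assumes only numpy, pandas and rpy2):

import numpy as np
import pandas as pd
import rpy2.robjects as robjects

column = pd.Series(['case', 'control', 'case'])
if column.dtype.type in (np.int32, np.int64):
    vec = robjects.IntVector(column)
elif column.dtype.type in (np.float32, np.float64):
    vec = robjects.FloatVector(column)
else:
    # Object/string columns fall through to an R factor.
    vec = robjects.FactorVector(column)
print(vec)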
Example #6
def batch_adjust(expr_df, batch):
    # sample call: ir.batch_adjust(exprset, batch=[batch1.columns.to_list(), [1,1,1]])
    """
    TODO: some 0 or NA somewhere causing rejection
         Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
         contrasts can be applied only to factors with 2 or more levels
    Args:
        expr_df:
        batch:

    Returns:

    """
    ro.pandas2ri.activate()
    sva = importr('sva')
    base = importr('base')
    batch_vec = ro.Vector(batch)
    # batch can be a pd dataframe. has to have multiple levels
    rbatch = ro.FactorVector(obj=ro.Vector(batch),
                             labels=ro.Vector(batch[0]),
                             levels=ro.Vector(batch[1:]))
    print(rbatch.slots['levels'].r_repr())
    # print(expr_df.slots['assayData'].r_repr())
    # NAs in corrected matrix caused by genes = 0?
    # remove na: batch1 = batch1[(batch1.T != 0).any()]
    # batch needs to have multiple list dimensions (multi-level)
    # df =base.as_matrix(rpd.py2ri_pandasdataframe(expr_df))
    batch_corrected = sva.ComBat(
        dat=expr_df, batch=rbatch)  #, mod=ro.NULL)#, mean_only=True)
    ro.pandas2ri.deactivate()
    return batch_corrected
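For reference, ComBat usually expects one batch label per sample column, and the level order can be pinned explicitly when building the factor. A minimal, hypothetical sketch (batch labels invented, assumes only rpy2):

import rpy2.robjects as ro

sample_batches = ['run1', 'run1', 'run2', 'run2', 'run2']
batch = ro.FactorVector(ro.StrVector(sample_batches),
                        levels=ro.StrVector(['run1', 'run2']))
print(list(batch.levels))    # ['run1', 'run2']
print(ro.r['table'](batch))  # per-level sample counts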
Example #7
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        y = np.searchsorted(self.classes_, y) + 1
        X = numpy2ri(X)
        y = ro.FactorVector(numpy2ri(y))
        self.model_ = rf.randomForest(X, y, **self.params)
        return self
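A short sketch of the label encoding used in that fit method, with invented labels; R factors are 1-based, hence the +1:

import numpy as np
import rpy2.robjects as ro

y = np.array(['cat', 'dog', 'cat', 'bird'])
classes_ = np.unique(y)                              # ['bird', 'cat', 'dog']
codes = (np.searchsorted(classes_, y) + 1).tolist()  # [2, 3, 2, 1]
print(ro.FactorVector(ro.IntVector(codes)))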
Example #8
    def _preprocessing(self, data_column="", label_column="", save_file=""):
        data = self.r_df.rx2(data_column)
        label_data = ro.FactorVector(self.r_df.rx2(label_column))
        lm_name = f"{data_column} ~ label_data"
        self.save_file = save_file
        self._savemode()
        return label_data, data, lm_name
Example #9
def _edger_func_exacttest(the_data, the_groups, fdr=0.01, lfc=1, pair=None, return_full=False):
    """
    Run edgeR DE analysis without fitting a GLM. Instead, we just compare two groups. Only a single factor is supported.
    :param the_data:
    :param the_groups:
    :param fdr:
    :param lfc:
    :param pair: An iterable of two group names. If None, compare the first two groups.
    :return:
    """
    if pair is None:
        lvl, fct = pd.factorize(the_groups)
        pair = fct[:2]
    rpair = robjects.StrVector(pair)
    rdata = pandas2ri.py2ri(the_data)
    rgroups = robjects.FactorVector(the_groups)
    y = r("DGEList")(rdata, group=rgroups)
    y = r("calcNormFactors")(y)
    y = r("estimateDisp")(y)
    et = r('exactTest')(y, rpair)
    if return_full:
        toptags = r('topTags')(et, n=r('Inf'), **{'p.value': 1.})
    else:
        toptags = r('topTags')(et, n=r('Inf'), **{'p.value': fdr})
    if len(toptags) == 0:
        return pd.DataFrame(columns=toptags_cols)
    else:
        tt = pandas2ri.ri2py_dataframe(toptags[toptags.names.index('table')])
        if lfc is not None:
            tt = tt.loc[tt.loc[:, 'logFC'].abs() >= lfc]
        return tt
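A hypothetical call with invented gene and sample names; it assumes edgeR is loaded in the embedded R session and that the module-level r, pandas2ri and toptags_cols objects used above are in scope:

import pandas as pd

the_data = pd.DataFrame(
    {'ctrl_1': [10, 0, 5], 'ctrl_2': [12, 1, 4],
     'treat_1': [30, 0, 2], 'treat_2': [28, 2, 3]},
    index=['geneA', 'geneB', 'geneC'])
the_groups = ['control', 'control', 'treated', 'treated']
de_table = _edger_func_exacttest(the_data, the_groups, fdr=0.05, lfc=1)
print(de_table.head())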
Example #10
def _edger_func_fit_glm(the_data, the_method, the_formula, common_disp=False, **vars):
    if the_method not in {'GLM', 'QLGLM'}:
        raise NotImplementedError("Only GLM and QLGLM methods are supported at present")
    fit = None
    rdata = pandas2ri.py2ri(the_data)

    formula = robjects.Formula(the_formula)
    for k, v in vars.items():
        formula.environment[k] = robjects.FactorVector(v)

    y = r("DGEList")(rdata)
    y = r("calcNormFactors")(y)
    design = r("model.matrix")(formula)

    if common_disp:
        # use a common estimate of the dispersion rather than using experimental structure
        # this is helpful where we have no replicates
        y = r("estimateGLMCommonDisp")(y, method='deviance', robust=True, subset=robjects.NULL)
    else:
        y = r("estimateDisp")(y, design)
    if the_method == 'GLM':
        fit = r('glmFit')(y, design)
    elif the_method == 'QLGLM':
        fit = r('glmQLFit')(y, design)
    return fit, design
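A hypothetical call of the QL-GLM variant with invented counts; the treatment labels passed as a keyword become the FactorVector placed in the formula environment (again assumes edgeR is available in the R session):

import pandas as pd

counts = pd.DataFrame(
    {'s1': [5, 0, 9], 's2': [7, 1, 8], 's3': [40, 2, 1], 's4': [38, 0, 2]},
    index=['geneA', 'geneB', 'geneC'])
fit, design = _edger_func_fit_glm(counts, 'QLGLM', '~ treatment',
                                  treatment=['ctrl', 'ctrl', 'drug', 'drug'])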
Example #11
    def fit(self, X, y):
        target_column_name = 'target__'
        if type(X) is pd.DataFrame:
            X.columns = sanitize_column_names(list(X))
            X[target_column_name] = y
        else:
            y = y.reshape((-1, 1))
            X = np.concatenate((y, X), axis=1)
            target_column_name = 'X0'
        X_r, X = fix_types(X)
        formula = robj.Formula(
            additive([target_column_name], None,
                     list(set(list(X)) - set([target_column_name]))))

        if self._categorical_target():
            X_r[X_r.colnames.index(target_column_name)] = robj.FactorVector(
                X_r.rx2(target_column_name))

        self.model = STM(self._utils.cv_glmnet,
                         init_prm_translate={
                             'use_model_frame': 'use.model.frame',
                             'nan_action': 'na.action'
                         })(formula,
                            data=X_r,
                            alpha=self.alpha,
                            family=self.family,
                            nan_action=NAN_ACTIONS_TO_R[self.nan_action],
                            intercept=self.fit_intercept,
                            thresh=self.epsilon,
                            maxit=self.max_iter)
        self.model.rclass = robj.StrVector(('cv.glmnet.formula', 'cv.glmnet'))

        return self
Example #12
def get_tf_factor(var, from_to, value_col="IMPUTED"):
    r_var = r['as.character'](robjects.FactorVector(var))
    r_from_to = robjects.IntVector(from_to)
    data = r['tf_factor_tbl'](r['as.character'](r_var), r_from_to, value_col)
    data = pandas2ri.ri2py_dataframe(data)
    print(var[0])
    gc.collect()
    return data
Example #13
    def _gen_LD(self, ld):
        self.origin = 'ld'
        ld = self.input
        dd = {}

        if not self.cols:
            self.cols = self.getCols(self.allcols)
            if type(self.allcols) == type(2):
                self.trimCols(self.allcols)

        for i in range(len(self.cols)):
            k = self.cols[i]
            dd[k] = []

        rownames = []
        for x in ld:
            if self.rownamecol: rownames.append(x[self.rownamecol])

            for k in self.cols:
                try:
                    value = x[k]
                except KeyError:
                    try:
                        value = x[".".join(k.split(".")[1:])]
                    except:
                        value = zero
                dd[k].append(value)

        for k, v in dd.items():
            if type(v[0]) == type(''):
                dd[k] = ro.StrVector(v)
                if self.factor:
                    dd[k] = ro.FactorVector(dd[k])
            else:
                if self.z:
                    v = zfy(v)

                dd[k] = ro.FloatVector(v)

        df = ro.DataFrame(dd)
        if self.rownamecol:
            rownames = ro.FactorVector(rownames)
            #print df.rownames
            df.rownames = rownames

        self.df = df
Example #14
def run_scott_knott(treatments):
    # Each entry of `treatments` is a (name, measurements) pair.
    SKResult = namedtuple('SKResult', 'regex mean rank')
    index_to_name = dict()
    for idx, treatment in enumerate(treatments):
        index_to_name[idx + 1] = treatment[0]

    values = []
    labels = []
    for treatment in treatments:
        for value in treatment[1]:
            values.append(value)
            labels.append(treatment[0])

    robjects.globalenv['y'] = robjects.FloatVector(values)
    robjects.globalenv['xf'] = robjects.FactorVector(labels)

    robjects.r('library(ScottKnott)')
    robjects.r('dfm <- data.frame(y,xf)')
    sk_summary = None
    ranks = []
    try:
        sk_summary = robjects.r(
            'summary(SK(dfm, y=y, model="y ~ xf", which="xf"))')
    except Exception:
        # Fall back to the pure-Python Scott-Knott implementation.
        sk_summary = skpy.scottknott([
            skpy.Num(treatment[0], treatment[1]) for treatment in treatments
        ])
        for num in sk_summary:
            ranks.append(SKResult(num.name, mean(num.all), num.rank + 1))
    else:
        for idx, level in enumerate(sk_summary[0]):
            ranks.append(
                SKResult(index_to_name[level], sk_summary[1][idx],
                         sk_summary[2][idx]))

    return ranks
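A hypothetical call of the helper above, with invented treatment names and measurements; it assumes the ScottKnott R package (or the skpy fallback) is installed:

ranks = run_scott_knott([
    ('heuristicA', [0.81, 0.79, 0.83]),
    ('heuristicB', [0.61, 0.64, 0.60]),
])
for entry in ranks:
    print(entry.regex, entry.mean, entry.rank)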
Example #15
    def analyse(self, user_request, otuTable, headers, metaVals, taxonomy_map):
        otu_to_genus = {}
        if int(user_request.level) == -1:
            # We want to display a short hint for the OTU using the genus (column 5)
            for header in headers:
                if header in taxonomy_map and len(taxonomy_map[header]) > 5:
                    otu_to_genus[header] = taxonomy_map[header][5]
                else:
                    otu_to_genus[header] = ""

        groups = robjects.FactorVector(robjects.StrVector(metaVals))
        # Forms an OTU only table (without IDs)
        allOTUs = []
        col = 0
        while col < len(otuTable[0]):
            allOTUs.append((headers[col], otuTable[:, col]))
            col += 1

        od = rlc.OrdDict(allOTUs)
        dataf = robjects.DataFrame(od)

        catVar1 = user_request.get_custom_attr("pwVar1")
        catVar2 = user_request.get_custom_attr("pwVar2")
        minthreshold = user_request.get_custom_attr("minthreshold")

        fisherResults = self.rStats.fisher_exact(dataf, groups, catVar1,
                                                 catVar2, int(minthreshold))

        hints = {}
        results = []
        i = 1
        while i <= fisherResults.nrow:
            newRow = []
            j = 1
            while j <= fisherResults.ncol:
                if j > 1:
                    newRow.append(round(float(fisherResults.rx(i, j)[0]), 6))
                else:
                    newRow.append(str(fisherResults.rx(i, j)[0]))
                j += 1
            otu = newRow[0]
            if int(user_request.level) == -1:
                hints[otu] = otu_to_genus[otu]
            i += 1
            results.append(newRow)

        cat1 = catVar1
        cat2 = catVar2
        abundancesObj = {}
        abundancesObj["results"] = results
        abundancesObj["hints"] = hints
        abundancesObj["cat1"] = cat1
        abundancesObj["cat2"] = cat2

        return abundancesObj
Example #16
def test_kw_r(cls,feats,p,factors):
    robjects.globalenv["y"] = robjects.FloatVector(feats)
    for i,f in enumerate(factors):
        robjects.globalenv['x'+str(i+1)] = robjects.FactorVector(robjects.StrVector(cls[f]))
    fo = "y~x1"
    #for i,f in enumerate(factors[1:]):
    #   if f == "subclass" and len(set(cls[f])) <= len(set(cls["class"])): continue
    #   if len(set(cls[f])) == len(cls[f]): continue
    #   fo += "+x"+str(i+2)
    kw_res = robjects.r('kruskal.test('+fo+',)$p.value')
    return float(tuple(kw_res)[0]) < p, float(tuple(kw_res)[0])
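The same test reduced to a self-contained sketch with invented data; kruskal.test ships with base R's stats package:

import rpy2.robjects as robjects

robjects.globalenv['y'] = robjects.FloatVector([1.2, 1.4, 3.1, 3.3])
robjects.globalenv['x1'] = robjects.FactorVector(robjects.StrVector(['a', 'a', 'b', 'b']))
p_value = robjects.r('kruskal.test(y~x1)$p.value')[0]
print(p_value)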
Example #17
def predictTestData(train, test):
    pandas2ri.activate()
    ro.conversion.py2ri = ro.numpy2ri
    ro.numpy2ri.activate()
    df1 = pd.read_csv(train)
    df2 = pd.read_csv(train)
    df3 = pd.read_csv(test)
    df31 = np.asarray(pd.read_csv(test))
    Xtrain = np.asarray(df1.iloc[:, :-1])
    Xtest = np.asarray(df3.iloc[:, :-1])
    ytr = df2.iloc[:, -1].values
    yte = np.random.randint(1, 3, len(df3))
    irf = importr("iRF")
    auc = importr("AUC")
    base = importr("base")
    graphics = importr("graphics")
    grdevices = importr("grDevices")
    R = ro.r
    Xtr_mat1 = numpy2ri(Xtrain)
    Xte_mat1 = numpy2ri(Xtest)
    ytr_mat = ro.FactorVector(ytr)
    yte_mat = ro.FactorVector(yte)
    Xtr_mat = r.assign("bar", Xtr_mat1)
    Xte_mat = r.assign("bar", Xte_mat1)
    tempyte_mat = ytr_mat
    ncol = robjects.r('ncol')
    rep = robjects.r('rep')
    p1 = ncol(df2)
    p = p1[0]
    selprob = rep(1 / p, p)
    rf = robjects.r('list()')
    b = irf.randomForest(Xtr_mat,
                         ytr_mat,
                         Xte_mat,
                         base.sample(yte_mat),
                         selprob,
                         ntree=400)
    print('Prediction finished')
    pred_1 = list(b[16][0])
    return pred_1
Example #18
    def create_r_pre_data_frame(self, model):
        pre_data_frame = {}
        for i in model.variables:
            if i in model.factors:
                pre_data_frame[i] = robjects.FactorVector(self.demographic_data[i])
            else:
            # Numeric columns map to Int/Float vectors; other non-factor columns are skipped
                if self.demographic_data[i][0].dtype.type is np.int64:
                    pre_data_frame[i] = robjects.IntVector(self.demographic_data[i])
                elif self.demographic_data[i][0].dtype.type is np.float64:
                    pre_data_frame[i] = robjects.FloatVector(self.demographic_data[i])

        return pre_data_frame
Example #19
    def _make_model_matrix(self, columns=None, formula='~0+x'):
        """
        Make the stats model matrix in R
        :param columns: optional sample labels; defaults to self.labels
        :param formula: str; R formula character used to create the model matrix
        :return:    R-matrix
        """
        # Make an robject for the model matrix
        if columns is not None:
            r_sample_labels = robjects.FactorVector(columns)
            str_set = sorted(list(set(columns)))
        else:
            r_sample_labels = robjects.FactorVector(self.labels)
            str_set = sorted(list(set(self.labels)))

        # Create R formula object, and change the environment variable
        fmla = robjects.Formula(formula)
        fmla.environment['x'] = r_sample_labels

        # Make the design matrix. stats is a bound R package
        design = stats.model_matrix(fmla)
        design.colnames = robjects.StrVector(str_set)
        return design
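A self-contained sketch of the same design-matrix construction with invented labels; stats here is base R loaded via importr:

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

stats = importr('stats')
labels = ['ctrl', 'ctrl', 'treat', 'treat']
fmla = robjects.Formula('~0+x')
fmla.environment['x'] = robjects.FactorVector(labels)
design = stats.model_matrix(fmla)
design.colnames = robjects.StrVector(sorted(set(labels)))
print(design)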
Example #20
def kNNClass(train_idx, test_idx, n_neighbors, k_mer_subset):
    logger.info('computing for %s' % (k_mer_subset))
    train_idx = train_idx
    test_idx = test_idx
    training_subset = normalized_counts.loc[train_idx][np.append(
        k_mer_subset, "class")]
    testing_subset = normalized_counts.loc[test_idx][np.append(
        k_mer_subset, "class")]
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform")
    clf.fit(training_subset[k_mer_subset], training_subset["class"])
    #print "predicting"
    predicted_classes = clf.predict(testing_subset[k_mer_subset])
    # compute kappa stat
    confusion_matrix(testing_subset["class"], predicted_classes)
    # make a mapping
    class_map = dict(zip(set(testing_subset["class"]), range(0, 4)))
    kapp = kappa([class_map[x] for x in testing_subset["class"]],
                 [class_map[x] for x in predicted_classes])
    cm = caret.confusionMatrix(robjects.FactorVector(predicted_classes),
                               robjects.FactorVector(testing_subset["class"]))
    logger.info("Finished for %s with kappa==%f" % (k_mer_subset, kapp))
    return kapp, cm
Example #21
def get_deseq2_stats(df: pd.DataFrame,
                     subsets: List[List[T]],
                     min_total_row_count: int = 0) -> pd.DataFrame:
    """Use the R bioconductor package 'limma' to perform a differential
    expression analysis of count like data (e.g. miRNA). See package
    documentation for more details.
    :param df: Matrix of counts, where each column is a sample and each row
    a feature.
    :param subsets: The two subsets to compare with each other.
    :param min_total_row_count: Drop rows that have in total less than than
        min_total_row_count reads
    :return: Results of the analysis in form of a Dataframe (p, logFC, ...)
    """
    logger.debug("Computing deseq2 stats")
    if len(subsets) != 2:
        error = "This method currently only supports exactly two " \
                "subsets as this is the most common use case. Support " \
                "for more subsets will be added later."
        logger.exception(error)
        raise ValueError(error)
    # flatten subset
    flattened_subsets = [x for subset in subsets for x in subset]
    # discard columns that are not in a subset
    df = df[flattened_subsets]
    # filter rows with too few reads
    total_row_counts = df.sum(axis=1)
    keep = total_row_counts[total_row_counts >= min_total_row_count].index
    df = df.loc[keep]
    # pandas df -> R df
    r_count_data = pandas2ri.py2ri(df)
    # py2ri is stupid and makes too many assumptions.
    # These two lines restore the column order
    r_count_data.colnames = list(OrderedDict.fromkeys(flattened_subsets))
    r_count_data = r_count_data.rx(robj.StrVector(flattened_subsets))

    # see package documentation
    condition = ['s{}'.format(i) for i, subset in enumerate(subsets)
                 for _ in subset]
    r_condition = robj.FactorVector(robj.StrVector(condition))
    r_col_data = r['DataFrame'](condition=r_condition)
    r_design = robj.Formula('~ condition')
    r_design.environment['condition'] = r_condition
    r_dds = r['DESeqDataSetFromMatrix'](r_count_data, r_col_data, r_design)
    r_dds = r['DESeq'](r_dds, parallel=True)
    r_res = r['results'](r_dds)

    # R result table to Python pandas
    r_res = r['as.data.frame'](r_res)
    results = pandas2ri.ri2py(r_res)
    results.insert(0, 'feature', list(r['row.names'](r_res)))
    return results
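A hypothetical call with simulated counts and invented sample names; it assumes DESeq2 is installed and that the module-level r and pandas2ri objects used above are in scope:

import numpy as np
import pandas as pd

counts = pd.DataFrame(np.random.poisson(20, size=(100, 6)),
                      index=['mir{}'.format(i) for i in range(100)],
                      columns=['a1', 'a2', 'a3', 'b1', 'b2', 'b3'])
results = get_deseq2_stats(counts,
                           subsets=[['a1', 'a2', 'a3'], ['b1', 'b2', 'b3']],
                           min_total_row_count=10)
print(results.head())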
Example #22
def mplotHis(moptions):
    perclist, rankgrouplist, rankperclist, split_points, myRankStr = group_rank(
        moptions)

    figname = moptions["FileID"]
    mresfolder = moptions['outFolder']

    ggplot = importr('ggplot2')
    importr('gridExtra')

    spvector = robjects.IntVector(split_points)
    rankstrvector = robjects.StrVector(myRankStr)

    moptions['CaseSizes'].sort()
    csvector = robjects.IntVector(moptions['CaseSizes'])

    #mdfperc = robjects.DataFrame({"MixedPerc":robjects.FactorVector(robjects.FloatVector(perclist), levels=percvector, labels=percvector), "Rank":robjects.FactorVector(robjects.StrVector(rankgrouplist), levels=rankstrvector, labels=rankstrvector), "Fraction":robjects.FloatVector(rankperclist)})
    mdfperc = robjects.DataFrame({
        "MixedPerc":
        robjects.FactorVector(robjects.IntVector(perclist),
                              levels=csvector,
                              labels=csvector),
        "Percentile":
        robjects.FactorVector(robjects.StrVector(rankgrouplist),
                              levels=rankstrvector,
                              labels=rankstrvector),
        "Fraction":
        robjects.FloatVector(rankperclist)
    })

    robjects.r(resource_string(__name__, 'Rscript/Hist_sim_plot.R'))
    robjects.r('pdf("' + mresfolder + '/hist2_' + figname + '.pdf", width=' +
               ("%.0f" % (len(moptions["CaseSizes"]) * 0.8)) +
               ', height=4, onefile = TRUE)')

    robjects.globalenv['Hist_sim_plot'](mdfperc, spvector, rankstrvector)

    robjects.r('dev.off()')
Example #23
    def createC50(self):
        # convert the class array to an R factor
        clas = robjects.FactorVector(self.clas)
        for i in self.trein.keys():
            self.trein[i] = robjects.FloatVector(self.trein[i])
        dataf = robjects.DataFrame(self.trein)
        ad = c50.C5_0(dataf, clas, trials=20,
                      control=c50.C5_0Control(minCases=2,
                                              noGlobalPruning=True,
                                              CF=1))
        self.ad = base.summary(ad)
        return ad
Example #24
def generate_histogram(subgroups_to_sses_to_n_count, tname, file_name):
    columns_to_data = {'subgroup': [], tname: [], 'count': []}
    max_count = 0
    for subgroup, sses_to_n_count in subgroups_to_sses_to_n_count.items():
        for ss, n_count in sses_to_n_count.items():
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data[tname].append(ss)
            columns_to_data['count'].append(n_count)
            if n_count > max_count:
                max_count = n_count
    r_columns_to_data = {
        'subgroup':
        ro.FactorVector(columns_to_data['subgroup'],
                        levels=ro.StrVector(
                            _sort_subgroup(set(columns_to_data['subgroup'])))),
        tname:
        ro.StrVector(columns_to_data[tname]),
        'count':
        ro.IntVector(columns_to_data['count'])
    }
    df = ro.DataFrame(r_columns_to_data)

    max_count = int(max_count / 1000 * 1000 + 1000)
    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(
        str.format("The Data Frame for file {}: \n{}", histogram_file_path,
                   df))

    grdevices.png(file=histogram_file_path, width=1200, height=800)
    gp = ggplot2.ggplot(df)
    pp = gp + \
         ggplot2.aes_string(x='subgroup', y='count', fill=tname) + \
         ggplot2.geom_bar(position="dodge",width=0.8, stat="identity") + \
         ggplot2.theme_bw() + \
         ggplot2.theme_classic() + \
         ggplot2.theme(**{'legend.title': ggplot2.element_blank()}) + \
         ggplot2.theme(**{'legend.text': ggplot2.element_text(size=40)}) + \
         ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=40,angle=45)}) + \
         ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=40)}) + \
         ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                    limits=ro.IntVector([0, max_count])) + \
         ggplot2.geom_text(ggplot2.aes_string(label='count'), size=6, angle=35, hjust=-0.1,
                           position=ggplot2.position_dodge(width=0.8),
                           vjust=-0.2)

    pp.plot()
    logging.info(str.format("Output step3 file {}", histogram_file_path))
    grdevices.dev_off()
Example #25
def test_kw_r(cls,feats,p,factors):
	### cls = class with all information
	### feats = relative abundance
	### p = p-value set in params
	### factors = information about the class components
	robjects.globalenv["y"] = robjects.FloatVector(feats)
	for i,f in enumerate(factors):
		robjects.globalenv['x'+str(i+1)] = robjects.FactorVector(robjects.StrVector(cls[f]))
	fo = "y~x1"
    #for i,f in enumerate(factors[1:]):
    #	if f == "subclass" and len(set(cls[f])) <= len(set(cls["class"])): continue
    #	if len(set(cls[f])) == len(cls[f]): continue
    #	fo += "+x"+str(i+2)
    ### Try to retrieve the p-value from here
	kw_res = robjects.r('kruskal.test('+fo+',)$p.value')
	###kw_res results in p-value
	return float(tuple(kw_res)[0]) < p, float(tuple(kw_res)[0])
Example #26
def dtk_multi_metrics(gather_df,
                      col_indices=list(range(10)) + list(range(15, 20)) + list(range(25, 65)) + list(range(145, 149)) + list(range(150, 183)),
                      alpha=0.05):
    metric_names = list(gather_df.columns.values[col_indices])
    model_names = list(gather_df.index.levels[0])
    dtk_dict = {}
    dtk_lib = rpackages.importr('DTK')
    #drop fold means and medians
    gather_df = gather_df[metric_names]
    gather_df = gather_df.xs('test_metrics', level='set')
    gather_df = gather_df.drop('Folds Mean', level='fold')
    gather_df = gather_df.drop('Folds Median', level='fold')        
    
    #get fold count
    model_names_rep = []
    for m in model_names:        
        k = gather_df.xs(m, level='model').shape[0]
        model_names_rep.extend([m for _ in range(k)])
    
    
    index_names_1 = []
    index_names_2 = []
    for i in range(len(model_names)):
        for j in range(i+1, len(model_names)):
            index_names_1.append(model_names[j])
            index_names_2.append(model_names[i])
       
    for i, metric in zip(range(len(metric_names)), metric_names):
        m_df = gather_df[metric]
        m_df.sort_index(inplace=True)
        m_df = m_df.loc[model_names]
        m_df_mat = np.around(m_df.as_matrix(), decimals=4)
        
        dtk_results = dtk_lib.DTK_test(robjects.FloatVector(m_df_mat), robjects.FactorVector(model_names_rep), alpha)
        dtk_results = np.array(dtk_results[1])        
        dtk_pd = pd.DataFrame(data=[index_names_1, index_names_2, list(dtk_results[:,0]),list(dtk_results[:,1]),list(dtk_results[:,2]), [False for _ in range(len(index_names_1))]]).T
        dtk_pd.columns = ['group1', 'group2', 'meandiff', 'Lower CI', 'Upper CI', 'reject'] 
        
        for j in range(dtk_pd.shape[0]):      
            if dtk_pd.iloc[j,3] > 0 or dtk_pd.iloc[j,4] < 0:
                dtk_pd.iloc[j,5] = True
                
        dtk_dict[metric] = dtk_pd
    
    return dtk_dict
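The underlying comparison reduced to a minimal sketch with invented per-fold scores (assumes the DTK R package is installed):

import rpy2.robjects as robjects
import rpy2.robjects.packages as rpackages

dtk_lib = rpackages.importr('DTK')
scores = robjects.FloatVector([0.81, 0.83, 0.70, 0.72, 0.90, 0.88])
models = robjects.FactorVector(['rf', 'rf', 'svm', 'svm', 'xgb', 'xgb'])
res = dtk_lib.DTK_test(scores, models, 0.05)
print(res[1])  # pairwise mean differences with lower/upper confidence bounds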
Example #27
    def analyse(self, user_request, otuTable, headers, metaVals, taxonomy_map):
        otu_to_genus = {}
        if int(user_request.level) == -1:
            # We want to display a short hint for the OTU using the genus (column 5)
            for header in headers:
                if header in taxonomy_map and len(taxonomy_map[header]) > 5:
                    otu_to_genus[header] = taxonomy_map[header][5]
                else:
                    otu_to_genus[header] = ""

        groups = robjects.FactorVector(robjects.StrVector(metaVals))

        allOTUs = []
        col = 0
        while col < len(otuTable[0]):
            allOTUs.append((headers[col], otuTable[:, col]))
            col += 1

        od = rlc.OrdDict(allOTUs)
        dataf = robjects.DataFrame(od)

        pval = user_request.get_custom_attr("pval")
        maxruns = user_request.get_custom_attr("maxruns")

        borutaResults = self.rStats.boruta(dataf, groups, float(pval),
                                           int(maxruns))

        assignments = {}
        hints = {}

        i = 0
        for lab in borutaResults.iter_labels():
            if lab in assignments:
                assignments[lab].append(allOTUs[i][0])
            else:
                assignments[lab] = [allOTUs[i][0]]
            if int(user_request.level) == -1:
                hints[allOTUs[i][0]] = otu_to_genus[allOTUs[i][0]]
            i += 1

        abundancesObj = {}
        abundancesObj["results"] = assignments
        abundancesObj["hints"] = hints

        return abundancesObj
Example #28
def _edger_func_fit(the_data, the_groups, the_method):
    if the_method not in {'GLM', 'QLGLM'}:
        raise NotImplementedError("Only GLM and QLGLM methods are supported at present")
    fit = None
    rdata = pandas2ri.py2ri(the_data)
    rgroups = robjects.FactorVector(the_groups)
    y = r("DGEList")(rdata)
    y = r("calcNormFactors")(y)
    formula = robjects.Formula("~0 + groups")
    formula.environment['groups'] = rgroups
    design = r("model.matrix")(formula)
    design.colnames = r('levels')(rgroups)
    y = r("estimateDisp")(y, design)
    if the_method == 'GLM':
        fit = r('glmFit')(y, design)
    elif the_method == 'QLGLM':
        fit = r('glmQLFit')(y, design)
    return fit, design
Example #29
def multi(jurisdictions, causes):
    with openrlib.rlock:
        r = ro.r
        ro.globalenv['jurisdictions'] = jurisdictions
        ro.globalenv['causes'] = causes
        r_df = r.source('farrington_multiselect.R')
        cause_fac = ro.FactorVector(r_df[0][1])
        cause_group = [cause_fac.levels[i - 1] for i in r_df[0][1]]
        week_ending = [serial_date_to_string(i) for i in r_df[0][6]]
        observed = [int(i) for i in r_df[0][7]]
        alarm = [['', 'x'][int(i)] for i in r_df[0][9]]
        upperbound = [float(i) for i in r_df[0][10]]
        lowerbound = [float(i) for i in r_df[0][11]]
        df = {
            'week_ending': week_ending,
            'observed': observed,
            'alarm': alarm,
            'upperbound': upperbound,
            'lowerbound': lowerbound
        }
        pass
    return df
Example #30
    def _filter_expression(self, data):
        """
        Filter low expressing genes
        :param data: R data.frame
        :return:
        """
        # Get the groups. Unclear if this is used
        groups = robjects.FactorVector(self.labels)

        # Make DGEList (Digital Gene Expression) from counts
        dge = edgeR.DGEList(counts=data, group=groups)

        # Decide which genes to filter
        keep = edgeR.filterByExpr(dge, self.design)

        # boolean filter, keep all columns, keep.lib.size=False
        dge = dge.rx(keep, True, False)

        # Renorm
        dge = edgeR.calcNormFactors(dge)

        return dge