def write(self, data):
    group = self.get_data_group(data)
    dest = robjects.globalenv
    if group == 'data':
        datavals = data['data']
        ordering = data['ordering']
        attrlist = []
        nameind = 0
        names = data['names']
        types = data['types']
        for cur_feat in ordering:
            if len(datavals[cur_feat].shape) > 1:
                for k in range(datavals[cur_feat].shape[0]):
                    if str(types[nameind]).startswith('nominal'):
                        attrlist.append((names[nameind],
                                         robjects.FactorVector(robjects.StrVector(datavals[cur_feat][k]))))
                    else:
                        attrlist.append((names[nameind], datavals[cur_feat][k]))
                    nameind += 1
            else:
                if str(types[nameind]).startswith('nominal'):
                    attrlist.append((names[nameind],
                                     robjects.FactorVector(robjects.StrVector(datavals[cur_feat]))))
                else:
                    attrlist.append((names[nameind], datavals[cur_feat]))
                nameind += 1
        dest[data['name']] = robjects.DataFrame(rlc.OrdDict(attrlist))
    elif group == 'task':
        d = data[group]
        for k in list(d.keys()):
            dest[k] = d[k]
    robjects.r.save(*list(robjects.r.ls(dest)), file=self.fname)
def analyse_permanova(self, user_request, otu_table, headers, sample_labels,
                      metadata_values, strata_values, sample_ids_from_metadata):
    print("Starting PERMANOVA")
    groups = robjects.FactorVector(robjects.StrVector(metadata_values))

    # Form an OTU-only table (without sample IDs)
    allOTUs = []
    col = 0
    while col < len(otu_table[0]):
        colVals = []
        row = 0
        while row < len(otu_table):
            sampleID = sample_labels[row]
            if sampleID in sample_ids_from_metadata:
                colVals.append(otu_table[row][col])
            row += 1
        allOTUs.append((headers[col], robjects.FloatVector(colVals)))
        col += 1

    od = rlc.OrdDict(allOTUs)
    dataf = robjects.DataFrame(od)

    if strata_values is None:
        permanova = self.veganR.betaDiversityPERMANOVA(dataf, groups)
    else:
        strata = robjects.FactorVector(robjects.StrVector(strata_values))
        permanova = self.veganR.betaDiversityPERMANOVAWithStrata(dataf, groups, strata)

    abundancesObj = {}
    abundancesObj["permanova"] = str(permanova)
    return abundancesObj
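# The betaDiversityPERMANOVA helpers above are project-specific R functions
# that are not shown here. A minimal sketch of a roughly equivalent direct
# call via the vegan package (an assumption: vegan must be installed in R):
def permanova_via_vegan(dataf, groups, permutations=999):
    from rpy2.robjects.packages import importr
    vegan = importr('vegan')
    fmla = robjects.Formula('x ~ g')
    fmla.environment['x'] = dataf   # the OTU abundance data.frame built above
    fmla.environment['g'] = groups  # the FactorVector of metadata groups
    return vegan.adonis2(fmla, permutations=permutations)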
def kNNClass(train_idx, test_idx, n_neighbors):
    # input_kmers_counts, kmer_colums, kappa, and caret come from module scope
    training_data = input_kmers_counts.loc[train_idx]
    testing_data = input_kmers_counts.loc[test_idx]
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform")
    clf.fit(training_data[kmer_colums], training_data["class"])
    predicted_classes = clf.predict(testing_data[kmer_colums])
    # compute the kappa statistic via a class -> integer mapping
    confusion_matrix(testing_data["class"], predicted_classes)  # result discarded
    class_map = dict(zip(set(testing_data["class"]), range(0, 4)))
    kapp = kappa([class_map[x] for x in testing_data["class"]],
                 [class_map[x] for x in predicted_classes])
    cm = caret.confusionMatrix(robjects.FactorVector(predicted_classes),
                               robjects.FactorVector(testing_data["class"]))
    return kapp, cm
def get_r_pre_data_frame(self, model):
    pre_data_frame = {}
    for i in self.phenotype_dataframe.columns:
        if i in model.factors:
            pre_data_frame[i] = robjects.FactorVector(self.phenotype_dataframe[i])
        else:
            # Pick an Int, Float, or Factor vector based on the column dtype
            if self.phenotype_dataframe[i].dtype.type in (np.int32, np.int64):
                pre_data_frame[i] = robjects.IntVector(self.phenotype_dataframe[i])
            elif self.phenotype_dataframe[i].dtype.type in (np.float32, np.float64):
                pre_data_frame[i] = robjects.FloatVector(self.phenotype_dataframe[i])
            else:
                pre_data_frame[i] = robjects.FactorVector(self.phenotype_dataframe[i])
    return robjects.DataFrame(pre_data_frame)
def convert_pandas_to_r_data_frame(self, model, dataframe):
    pre_data_frame = {}
    for i in dataframe.columns:
        if i in model.factors:
            pre_data_frame[i] = robjects.FactorVector(dataframe[i])
        else:
            # Pick an Int, Float, or Factor vector based on the column dtype
            if dataframe[i].dtype.type in (np.int32, np.int64):
                pre_data_frame[i] = robjects.IntVector(dataframe[i])
            elif dataframe[i].dtype.type in (np.float32, np.float64):
                pre_data_frame[i] = robjects.FloatVector(dataframe[i])
            else:
                pre_data_frame[i] = robjects.FactorVector(dataframe[i])
    return robjects.DataFrame(pre_data_frame)
def batch_adjust(expr_df, batch):
    # sample call: ir.batch_adjust(exprset, batch=[batch1.columns.to_list(), [1, 1, 1]])
    """
    TODO: a 0 or NA somewhere is causing rejection:
        Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]) :
        contrasts can be applied only to factors with 2 or more levels

    Args:
        expr_df: expression matrix
        batch: batch assignment; can be a pd.DataFrame, but it has to have
            multiple levels (multiple list dimensions)

    Returns:
        The batch-corrected matrix from sva::ComBat.
    """
    ro.pandas2ri.activate()
    sva = importr('sva')
    rbatch = ro.FactorVector(obj=ro.Vector(batch),
                             labels=ro.Vector(batch[0]),
                             levels=ro.Vector(batch[1:]))
    print(rbatch.slots['levels'].r_repr())
    # NAs in the corrected matrix can be caused by all-zero genes;
    # remove them first, e.g. batch1 = batch1[(batch1.T != 0).any()]
    batch_corrected = sva.ComBat(dat=expr_df, batch=rbatch)  # mod=ro.NULL, mean_only=True also possible
    ro.pandas2ri.deactivate()
    return batch_corrected
def fit(self, X, y):
    self.classes_ = np.unique(y)
    # R factors are 1-based, so shift the encoded class labels by one
    y = np.searchsorted(self.classes_, y) + 1
    X = numpy2ri(X)
    y = ro.FactorVector(numpy2ri(y))
    self.model_ = rf.randomForest(X, y, **self.params)
    return self
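# A companion `predict` sketch (not from the original source): assumes the
# same scikit-learn-style wrapper class, with `rf = importr('randomForest')`,
# `ro` as rpy2.robjects, and the numpy2ri converter used in `fit` above.
def predict(self, X):
    X = numpy2ri(X)
    pred = ro.r['predict'](self.model_, X)       # returns an R factor
    codes = np.array(pred, dtype=int)            # 1-based indices into pred.levels
    labels = np.array([int(lvl) for lvl in pred.levels])
    return self.classes_[labels[codes - 1] - 1]  # undo the +1 shift from fit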
def _preprocessing(self, data_column="", label_column="", save_file=""):
    data = self.r_df.rx2(data_column)
    label_data = ro.FactorVector(self.r_df.rx2(label_column))
    lm_name = f"{data_column} ~ label_data"
    self.save_file = save_file
    self._savemode()
    return label_data, data, lm_name
def _edger_func_exacttest(the_data, the_groups, fdr=0.01, lfc=1, pair=None, return_full=False):
    """
    Run edgeR DE analysis without fitting a GLM. Instead, we just compare two groups.
    Only a single factor is supported.
    :param the_data: Counts matrix (features on rows, samples on columns).
    :param the_groups: Group label for each sample.
    :param fdr: FDR threshold applied when return_full is False.
    :param lfc: Minimum absolute logFC required to retain a result row.
    :param pair: An iterable of two group names. If None, compare the first two groups.
    :return: DataFrame of DE results.
    """
    if pair is None:
        lvl, fct = pd.factorize(the_groups)
        pair = fct[:2]
    rpair = robjects.StrVector(pair)
    rdata = pandas2ri.py2ri(the_data)
    rgroups = robjects.FactorVector(the_groups)
    y = r("DGEList")(rdata, group=rgroups)
    y = r("calcNormFactors")(y)
    y = r("estimateDisp")(y)
    et = r('exactTest')(y, rpair)
    if return_full:
        toptags = r('topTags')(et, n=r('Inf'), **{'p.value': 1.})
    else:
        toptags = r('topTags')(et, n=r('Inf'), **{'p.value': fdr})
    if len(toptags) == 0:
        # toptags_cols is a module-level list of the expected result columns
        return pd.DataFrame(columns=toptags_cols)
    else:
        tt = pandas2ri.ri2py_dataframe(toptags[toptags.names.index('table')])
        if lfc is not None:
            tt = tt.loc[tt.loc[:, 'logFC'].abs() >= lfc]
        return tt
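# Usage sketch (illustrative, not from the original project): assumes the
# module-level `r` and `pandas2ri` used above, with edgeR installed in R.
import numpy as np
import pandas as pd

r("library(edgeR)")
counts = pd.DataFrame(np.random.poisson(10, size=(100, 6)),
                      index=['gene%d' % i for i in range(100)],
                      columns=['s%d' % i for i in range(6)])
groups = pd.Series(['ctrl'] * 3 + ['treat'] * 3, index=counts.columns)
de_table = _edger_func_exacttest(counts, groups, fdr=0.05, lfc=1)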
def _edger_func_fit_glm(the_data, the_method, the_formula, common_disp=False, **vars):
    if the_method not in {'GLM', 'QLGLM'}:
        raise NotImplementedError("Only GLM and QLGLM methods are supported at present")
    fit = None
    rdata = pandas2ri.py2ri(the_data)
    formula = robjects.Formula(the_formula)
    for k, v in vars.items():
        formula.environment[k] = robjects.FactorVector(v)
    y = r("DGEList")(rdata)
    y = r("calcNormFactors")(y)
    design = r("model.matrix")(formula)
    if common_disp:
        # Use a common estimate of the dispersion rather than the experimental
        # structure; this is helpful where we have no replicates.
        y = r("estimateGLMCommonDisp")(y, method='deviance', robust=True, subset=robjects.NULL)
    else:
        y = r("estimateDisp")(y, design)
    if the_method == 'GLM':
        fit = r('glmFit')(y, design)
    elif the_method == 'QLGLM':
        fit = r('glmQLFit')(y, design)
    return fit, design
def fit(self, X, y):
    # sanitize_column_names, additive, fix_types, STM, and NAN_ACTIONS_TO_R
    # are helpers from module scope.
    target_column_name = 'target__'
    if type(X) is pd.DataFrame:
        X.columns = sanitize_column_names(list(X))
        X[target_column_name] = y
    else:
        y = y.reshape((-1, 1))
        X = np.concatenate((y, X), axis=1)
        target_column_name = 'X0'
    X_r, X = fix_types(X)
    formula = robj.Formula(
        additive([target_column_name], None,
                 list(set(list(X)) - set([target_column_name]))))
    if self._categorical_target():
        X_r[X_r.colnames.index(target_column_name)] = robj.FactorVector(
            X_r.rx2(target_column_name))
    self.model = STM(self._utils.cv_glmnet,
                     init_prm_translate={
                         'use_model_frame': 'use.model.frame',
                         'nan_action': 'na.action'
                     })(formula,
                        data=X_r,
                        alpha=self.alpha,
                        family=self.family,
                        nan_action=NAN_ACTIONS_TO_R[self.nan_action],
                        intercept=self.fit_intercept,
                        thresh=self.epsilon,
                        maxit=self.max_iter)
    self.model.rclass = robj.StrVector(('cv.glmnet.formula', 'cv.glmnet'))
    return self
def get_tf_factor(var, from_to, value_col="IMPUTED"):
    # tf_factor_tbl is an R function already sourced into the session
    r_var = r['as.character'](robjects.FactorVector(var))
    r_from_to = robjects.IntVector(from_to)
    data = r['tf_factor_tbl'](r['as.character'](r_var), r_from_to, value_col)
    data = pandas2ri.ri2py_dataframe(data)
    print(var[0])
    gc.collect()
    return data
def _gen_LD(self, ld):
    self.origin = 'ld'
    ld = self.input  # NB: overrides the `ld` argument with self.input
    dd = {}
    if not self.cols:
        self.cols = self.getCols(self.allcols)
    if type(self.allcols) == type(2):
        self.trimCols(self.allcols)
    for i in range(len(self.cols)):
        k = self.cols[i]
        dd[k] = []
    rownames = []
    for x in ld:
        if self.rownamecol:
            rownames.append(x[self.rownamecol])
        for k in self.cols:
            try:
                value = x[k]
            except KeyError:
                try:
                    value = x[".".join(k.split(".")[1:])]
                except:
                    value = zero  # module-level default for missing values
            dd[k].append(value)
    for k, v in dd.items():
        if type(v[0]) == type(''):
            dd[k] = ro.StrVector(v)
            if self.factor:
                dd[k] = ro.FactorVector(dd[k])
        else:
            if self.z:
                v = zfy(v)
            dd[k] = ro.FloatVector(v)
    df = ro.DataFrame(dd)
    if self.rownamecol:
        rownames = ro.FactorVector(rownames)
        df.rownames = rownames
    self.df = df
def scott_knott_groups(treatments):
    # Descriptive names substituted for the machine-obfuscated originals.
    # `treatments` is an iterable of (name, values) pairs; `skpy`, `mean`,
    # and `namedtuple` come from module scope.
    Result = namedtuple('Result', 'regex mean rank')
    index_to_name = {}
    for idx, treatment in enumerate(treatments):
        index_to_name[idx + 1] = treatment[0]
    values = []
    names = []
    for treatment in treatments:
        for value in treatment[1]:
            values.append(value)
            names.append(treatment[0])
    robjects.globalenv['y'] = robjects.FloatVector(values)
    robjects.globalenv['xf'] = robjects.FactorVector(names)
    robjects.r('library(ScottKnott)')
    robjects.r('dfm <- data.frame(y,xf)')
    sk_summary = None
    results = []
    try:
        sk_summary = robjects.r('summary(SK(dfm, y=y, model="y ~ xf", which="xf"))')
    except Exception:
        # Fall back to the pure-Python Scott-Knott implementation
        sk_summary = skpy.scottknott([
            skpy.Num(t[0], t[1]) for t in treatments
        ])
        for rx in sk_summary:
            results.append(Result(rx.name, mean(rx.all), rx.rank + 1))
    else:
        for idx, level_code in enumerate(sk_summary[0]):
            results.append(Result(index_to_name[level_code],
                                  sk_summary[1][idx],
                                  sk_summary[2][idx]))
    return results
def analyse(self, user_request, otuTable, headers, metaVals, taxonomy_map):
    otu_to_genus = {}
    if int(user_request.level) == -1:
        # We want to display a short hint for the OTU using the genus (column 5)
        for header in headers:
            if header in taxonomy_map and len(taxonomy_map[header]) > 5:
                otu_to_genus[header] = taxonomy_map[header][5]
            else:
                otu_to_genus[header] = ""

    groups = robjects.FactorVector(robjects.StrVector(metaVals))

    # Form an OTU-only table (without sample IDs)
    allOTUs = []
    col = 0
    while col < len(otuTable[0]):
        allOTUs.append((headers[col], otuTable[:, col]))
        col += 1

    od = rlc.OrdDict(allOTUs)
    dataf = robjects.DataFrame(od)

    catVar1 = user_request.get_custom_attr("pwVar1")
    catVar2 = user_request.get_custom_attr("pwVar2")
    minthreshold = user_request.get_custom_attr("minthreshold")

    fisherResults = self.rStats.fisher_exact(dataf, groups, catVar1, catVar2,
                                             int(minthreshold))

    hints = {}
    results = []
    i = 1
    while i <= fisherResults.nrow:
        newRow = []
        j = 1
        while j <= fisherResults.ncol:
            if j > 1:
                newRow.append(round(float(fisherResults.rx(i, j)[0]), 6))
            else:
                newRow.append(str(fisherResults.rx(i, j)[0]))
            j += 1
        otu = newRow[0]
        if int(user_request.level) == -1:
            hints[otu] = otu_to_genus[otu]
        i += 1
        results.append(newRow)

    abundancesObj = {}
    abundancesObj["results"] = results
    abundancesObj["hints"] = hints
    abundancesObj["cat1"] = catVar1
    abundancesObj["cat2"] = catVar2
    return abundancesObj
def test_kw_r(cls, feats, p, factors):
    robjects.globalenv["y"] = robjects.FloatVector(feats)
    for i, f in enumerate(factors):
        robjects.globalenv['x' + str(i + 1)] = robjects.FactorVector(robjects.StrVector(cls[f]))
    fo = "y~x1"
    # for i, f in enumerate(factors[1:]):
    #     if f == "subclass" and len(set(cls[f])) <= len(set(cls["class"])): continue
    #     if len(set(cls[f])) == len(cls[f]): continue
    #     fo += "+x" + str(i + 2)
    kw_res = robjects.r('kruskal.test(' + fo + ',)$p.value')
    return float(tuple(kw_res)[0]) < p, float(tuple(kw_res)[0])
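# The same test can be run without mutating R's global environment (a sketch,
# assuming only rpy2): stats::kruskal.test accepts the sample vector and the
# grouping factor directly.
def kruskal_p(feats, labels):
    import rpy2.robjects as robjects
    kw = robjects.r['kruskal.test'](
        robjects.FloatVector(feats),
        robjects.FactorVector(robjects.StrVector(labels)))
    return kw.rx2('p.value')[0]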
def predictTestData(train, test):
    pandas2ri.activate()
    ro.conversion.py2ri = ro.numpy2ri
    ro.numpy2ri.activate()

    df1 = pd.read_csv(train)
    df2 = pd.read_csv(train)  # re-reads the training file for the label column
    df3 = pd.read_csv(test)
    Xtrain = np.asarray(df1.iloc[:, :-1])
    Xtest = np.asarray(df3.iloc[:, :-1])
    ytr = df2.iloc[:, -1].values
    # apparently placeholder test labels; shuffled below via base.sample
    yte = np.random.randint(1, 3, len(df3))

    irf = importr("iRF")
    auc = importr("AUC")
    base = importr("base")
    graphics = importr("graphics")
    grdevices = importr("grDevices")

    Xtr_mat1 = numpy2ri(Xtrain)
    Xte_mat1 = numpy2ri(Xtest)
    ytr_mat = ro.FactorVector(ytr)
    yte_mat = ro.FactorVector(yte)
    Xtr_mat = r.assign("bar", Xtr_mat1)
    Xte_mat = r.assign("bar", Xte_mat1)

    ncol = robjects.r('ncol')
    rep = robjects.r('rep')
    p = ncol(df2)[0]
    selprob = rep(1 / p, p)
    b = irf.randomForest(Xtr_mat, ytr_mat, Xte_mat, base.sample(yte_mat),
                         selprob, ntree=400)
    print('Prediction finished')
    pred_1 = list(b[16][0])  # magic index into the iRF result list (presumably the test predictions)
    return pred_1
def create_r_pre_data_frame(self, model):
    pre_data_frame = {}
    for i in model.variables:
        if i in model.factors:
            pre_data_frame[i] = robjects.FactorVector(self.demographic_data[i])
        else:
            # Pick an Int or Float vector based on dtype; note that columns
            # that are neither int64 nor float64 are silently skipped here.
            if self.demographic_data[i][0].dtype.type is np.int64:
                pre_data_frame[i] = robjects.IntVector(self.demographic_data[i])
            elif self.demographic_data[i][0].dtype.type is np.float64:
                pre_data_frame[i] = robjects.FloatVector(self.demographic_data[i])
    return pre_data_frame
def _make_model_matrix(self, columns=None, formula='~0+x'):
    """
    Make the stats model matrix in R.
    :param formula: str; R formula used to create the model matrix
    :return: R matrix
    """
    # Make an robject for the model matrix
    if columns is not None:
        r_sample_labels = robjects.FactorVector(columns)
        str_set = sorted(list(set(columns)))
    else:
        r_sample_labels = robjects.FactorVector(self.labels)
        str_set = sorted(list(set(self.labels)))

    # Create the R formula object and bind the labels into its environment
    fmla = robjects.Formula(formula)
    fmla.environment['x'] = r_sample_labels

    # Make the design matrix; stats is a bound R package
    design = stats.model_matrix(fmla)
    design.colnames = robjects.StrVector(str_set)
    return design
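# A standalone demonstration of the same construction (a sketch assuming only
# rpy2 and R's stats package, mirroring the bound `stats` object used above):
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr

stats = importr('stats')
fmla = robjects.Formula('~0+x')
fmla.environment['x'] = robjects.FactorVector(['a', 'a', 'b'])
design = stats.model_matrix(fmla)   # 3x2 indicator matrix, one column per level
print(robjects.r['colnames'](design))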
def kNNClass(train_idx, test_idx, n_neighbors, k_mer_subset):
    logger.info('computing for %s' % (k_mer_subset))
    training_subset = normalized_counts.loc[train_idx][np.append(k_mer_subset, "class")]
    testing_subset = normalized_counts.loc[test_idx][np.append(k_mer_subset, "class")]
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform")
    clf.fit(training_subset[k_mer_subset], training_subset["class"])
    predicted_classes = clf.predict(testing_subset[k_mer_subset])
    # compute the kappa statistic via a class -> integer mapping
    confusion_matrix(testing_subset["class"], predicted_classes)  # result discarded
    class_map = dict(zip(set(testing_subset["class"]), range(0, 4)))
    kapp = kappa([class_map[x] for x in testing_subset["class"]],
                 [class_map[x] for x in predicted_classes])
    cm = caret.confusionMatrix(robjects.FactorVector(predicted_classes),
                               robjects.FactorVector(testing_subset["class"]))
    logger.info("Finished for %s with kappa==%f" % (k_mer_subset, kapp))
    return kapp, cm
def get_deseq2_stats(df: pd.DataFrame,
                     subsets: List[List[T]],
                     min_total_row_count: int = 0) -> pd.DataFrame:
    """Use the R bioconductor package 'DESeq2' to perform a differential
    expression analysis of count-like data (e.g. miRNA). See the package
    documentation for more details.

    :param df: Matrix of counts, where each column is a sample and each row a feature.
    :param subsets: The two subsets to compare with each other.
    :param min_total_row_count: Drop rows that have in total fewer than min_total_row_count reads.
    :return: Results of the analysis in form of a DataFrame (p, logFC, ...)
    """
    logger.debug("Computing deseq2 stats")
    if len(subsets) != 2:
        error = "This method currently only supports exactly two " \
                "subsets as this is the most common use case. Support " \
                "for more subsets will be added later."
        logger.exception(error)
        raise ValueError(error)

    # flatten subsets
    flattened_subsets = [x for subset in subsets for x in subset]
    # discard columns that are not in a subset
    df = df[flattened_subsets]
    # filter rows with too few reads
    total_row_counts = df.sum(axis=1)
    keep = total_row_counts[total_row_counts >= min_total_row_count].index
    df = df.loc[keep]

    # pandas df -> R df
    r_count_data = pandas2ri.py2ri(df)
    # py2ri makes too many assumptions; these two lines restore the column order
    r_count_data.colnames = list(OrderedDict.fromkeys(flattened_subsets))
    r_count_data = r_count_data.rx(robj.StrVector(flattened_subsets))

    # see package documentation
    condition = ['s{}'.format(i) for i, subset in enumerate(subsets) for _ in subset]
    r_condition = robj.FactorVector(robj.StrVector(condition))
    r_col_data = r['DataFrame'](condition=r_condition)
    r_design = robj.Formula('~ condition')
    r_design.environment['condition'] = r_condition
    r_dds = r['DESeqDataSetFromMatrix'](r_count_data, r_col_data, r_design)
    r_dds = r['DESeq'](r_dds, parallel=True)
    r_res = r['results'](r_dds)

    # R result table to a pandas DataFrame
    r_res = r['as.data.frame'](r_res)
    results = pandas2ri.ri2py(r_res)
    results.insert(0, 'feature', list(r['row.names'](r_res)))
    return results
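# Usage sketch (illustrative): get_deseq2_stats expects raw counts with
# samples as columns; DESeq2 must be attached in the R session used by `r`
# above, e.g. via r('suppressMessages(library(DESeq2))').
import numpy as np
import pandas as pd

counts = pd.DataFrame(np.random.poisson(20, size=(50, 6)),
                      index=['f%d' % i for i in range(50)],
                      columns=['s%d' % i for i in range(6)])
stats_df = get_deseq2_stats(counts,
                            subsets=[['s0', 's1', 's2'], ['s3', 's4', 's5']],
                            min_total_row_count=10)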
def mplotHis(moptions):
    perclist, rankgrouplist, rankperclist, split_points, myRankStr = group_rank(moptions)
    figname = moptions["FileID"]
    mresfolder = moptions['outFolder']

    ggplot = importr('ggplot2')
    importr('gridExtra')

    spvector = robjects.IntVector(split_points)
    rankstrvector = robjects.StrVector(myRankStr)
    moptions['CaseSizes'].sort()
    csvector = robjects.IntVector(moptions['CaseSizes'])

    mdfperc = robjects.DataFrame({
        "MixedPerc": robjects.FactorVector(robjects.IntVector(perclist),
                                           levels=csvector, labels=csvector),
        "Percentile": robjects.FactorVector(robjects.StrVector(rankgrouplist),
                                            levels=rankstrvector, labels=rankstrvector),
        "Fraction": robjects.FloatVector(rankperclist)
    })

    robjects.r(resource_string(__name__, 'Rscript/Hist_sim_plot.R'))
    robjects.r('pdf("' + mresfolder + '/hist2_' + figname + '.pdf", width=' +
               ("%.0f" % (len(moptions["CaseSizes"]) * 0.8)) + ', height=4, onefile = TRUE)')
    robjects.globalenv['Hist_sim_plot'](mdfperc, spvector, rankstrvector)
    robjects.r('dev.off()')
def createC50(self):
    # convert the class array to an R factor
    clas = robjects.FactorVector(self.clas)
    for i in self.trein.keys():
        self.trein[i] = robjects.FloatVector(self.trein[i])
    dataf = robjects.DataFrame(self.trein)
    # trials=20 enables boosting with 20 iterations (was misspelled `triasl`)
    ad = c50.C5_0(dataf, clas, trials=20,
                  control=c50.C5_0Control(minCases=2, noGlobalPruning=True, CF=1))
    self.ad = base.summary(ad)
    return ad
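# Prediction sketch (illustrative), with `ad` and `dataf` as built in
# createC50 above: the fitted C5.0 model is queried through R's generic
# predict, and `newdata` must be a DataFrame with the training columns.
pred = robjects.r['predict'](ad, newdata=dataf)              # returns an R factor
predicted_labels = [pred.levels[code - 1] for code in pred]  # 1-based codes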
def generate_histogram(subgroups_to_sses_to_n_count, tname, file_name):
    columns_to_data = {'subgroup': [], tname: [], 'count': []}
    max_count = 0
    for subgroup, sses_to_n_count in subgroups_to_sses_to_n_count.items():
        for ss, n_count in sses_to_n_count.items():
            columns_to_data['subgroup'].append(subgroup)
            columns_to_data[tname].append(ss)
            columns_to_data['count'].append(n_count)
            if n_count > max_count:
                max_count = n_count
    r_columns_to_data = {
        'subgroup': ro.FactorVector(columns_to_data['subgroup'],
                                    levels=ro.StrVector(_sort_subgroup(set(columns_to_data['subgroup'])))),
        tname: ro.StrVector(columns_to_data[tname]),
        'count': ro.IntVector(columns_to_data['count'])
    }
    df = ro.DataFrame(r_columns_to_data)
    # round up to the next thousand for the y-axis limit
    max_count = int(max_count / 1000) * 1000 + 1000
    histogram_file_path = os.path.join(OUTPUT_PATH, file_name)
    logging.debug(str.format("The Data Frame for file {}: \n{}", histogram_file_path, df))

    grdevices.png(file=histogram_file_path, width=1200, height=800)
    gp = ggplot2.ggplot(df)
    pp = gp + \
        ggplot2.aes_string(x='subgroup', y='count', fill=tname) + \
        ggplot2.geom_bar(position="dodge", width=0.8, stat="identity") + \
        ggplot2.theme_bw() + \
        ggplot2.theme_classic() + \
        ggplot2.theme(**{'legend.title': ggplot2.element_blank()}) + \
        ggplot2.theme(**{'legend.text': ggplot2.element_text(size=40)}) + \
        ggplot2.theme(**{'axis.text.x': ggplot2.element_text(size=40, angle=45)}) + \
        ggplot2.theme(**{'axis.text.y': ggplot2.element_text(size=40)}) + \
        ggplot2.scale_y_continuous(expand=ro.IntVector([0, 0]),
                                   limits=ro.IntVector([0, max_count])) + \
        ggplot2.geom_text(ggplot2.aes_string(label='count'), size=6, angle=35,
                          hjust=-0.1, position=ggplot2.position_dodge(width=0.8),
                          vjust=-0.2)
    pp.plot()
    logging.info(str.format("Output step3 file {}", histogram_file_path))
    grdevices.dev_off()
def test_kw_r(cls, feats, p, factors):
    # cls: class metadata (one list of labels per factor)
    # feats: relative abundances
    # p: p-value threshold set in the params
    # factors: names of the class components
    robjects.globalenv["y"] = robjects.FloatVector(feats)
    for i, f in enumerate(factors):
        robjects.globalenv['x' + str(i + 1)] = robjects.FactorVector(robjects.StrVector(cls[f]))
    fo = "y~x1"
    # for i, f in enumerate(factors[1:]):
    #     if f == "subclass" and len(set(cls[f])) <= len(set(cls["class"])): continue
    #     if len(set(cls[f])) == len(cls[f]): continue
    #     fo += "+x" + str(i + 2)
    # kw_res holds the Kruskal-Wallis p-value retrieved from R
    kw_res = robjects.r('kruskal.test(' + fo + ',)$p.value')
    return float(tuple(kw_res)[0]) < p, float(tuple(kw_res)[0])
def dtk_multi_metrics(gather_df,
                      col_indices=list(range(10)) + list(range(15, 20)) +
                                  list(range(25, 65)) + list(range(145, 149)) +
                                  list(range(150, 183)),
                      alpha=0.05):
    metric_names = list(gather_df.columns.values[col_indices])
    model_names = list(gather_df.index.levels[0])
    dtk_dict = {}
    dtk_lib = rpackages.importr('DTK')

    # drop fold means and medians
    gather_df = gather_df[metric_names]
    gather_df = gather_df.xs('test_metrics', level='set')
    gather_df = gather_df.drop('Folds Mean', level='fold')
    gather_df = gather_df.drop('Folds Median', level='fold')

    # repeat each model name once per fold
    model_names_rep = []
    for m in model_names:
        k = gather_df.xs(m, level='model').shape[0]
        model_names_rep.extend([m for _ in range(k)])

    index_names_1 = []
    index_names_2 = []
    for i in range(len(model_names)):
        for j in range(i + 1, len(model_names)):
            index_names_1.append(model_names[j])
            index_names_2.append(model_names[i])

    for metric in metric_names:
        m_df = gather_df[metric]
        m_df.sort_index(inplace=True)
        m_df = m_df.loc[model_names]
        # DataFrame.as_matrix() was removed from pandas; to_numpy() replaces it
        m_df_mat = np.around(m_df.to_numpy(), decimals=4)
        dtk_results = dtk_lib.DTK_test(robjects.FloatVector(m_df_mat),
                                       robjects.FactorVector(model_names_rep),
                                       alpha)
        dtk_results = np.array(dtk_results[1])
        dtk_pd = pd.DataFrame(data=[index_names_1, index_names_2,
                                    list(dtk_results[:, 0]),
                                    list(dtk_results[:, 1]),
                                    list(dtk_results[:, 2]),
                                    [False for _ in range(len(index_names_1))]]).T
        dtk_pd.columns = ['group1', 'group2', 'meandiff', 'Lower CI', 'Upper CI', 'reject']
        # reject the null hypothesis when the confidence interval excludes zero
        for j in range(dtk_pd.shape[0]):
            if dtk_pd.iloc[j, 3] > 0 or dtk_pd.iloc[j, 4] < 0:
                dtk_pd.iloc[j, 5] = True
        dtk_dict[metric] = dtk_pd
    return dtk_dict
def analyse(self, user_request, otuTable, headers, metaVals, taxonomy_map):
    otu_to_genus = {}
    if int(user_request.level) == -1:
        # We want to display a short hint for the OTU using the genus (column 5)
        for header in headers:
            if header in taxonomy_map and len(taxonomy_map[header]) > 5:
                otu_to_genus[header] = taxonomy_map[header][5]
            else:
                otu_to_genus[header] = ""

    groups = robjects.FactorVector(robjects.StrVector(metaVals))

    allOTUs = []
    col = 0
    while col < len(otuTable[0]):
        allOTUs.append((headers[col], otuTable[:, col]))
        col += 1

    od = rlc.OrdDict(allOTUs)
    dataf = robjects.DataFrame(od)

    pval = user_request.get_custom_attr("pval")
    maxruns = user_request.get_custom_attr("maxruns")
    borutaResults = self.rStats.boruta(dataf, groups, float(pval), int(maxruns))

    assignments = {}
    hints = {}
    i = 0
    for lab in borutaResults.iter_labels():
        if lab in assignments:
            assignments[lab].append(allOTUs[i][0])
        else:
            assignments[lab] = [allOTUs[i][0]]
        if int(user_request.level) == -1:
            hints[allOTUs[i][0]] = otu_to_genus[allOTUs[i][0]]
        i += 1

    abundancesObj = {}
    abundancesObj["results"] = assignments
    abundancesObj["hints"] = hints
    return abundancesObj
def _edger_func_fit(the_data, the_groups, the_method):
    if the_method not in {'GLM', 'QLGLM'}:
        raise NotImplementedError("Only GLM and QLGLM methods are supported at present")
    fit = None
    rdata = pandas2ri.py2ri(the_data)
    rgroups = robjects.FactorVector(the_groups)
    y = r("DGEList")(rdata)
    y = r("calcNormFactors")(y)
    formula = robjects.Formula("~0 + groups")
    formula.environment['groups'] = rgroups
    design = r("model.matrix")(formula)
    design.colnames = r('levels')(rgroups)
    y = r("estimateDisp")(y, design)
    if the_method == 'GLM':
        fit = r('glmFit')(y, design)
    elif the_method == 'QLGLM':
        fit = r('glmQLFit')(y, design)
    return fit, design
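# Downstream sketch (illustrative, using edgeR's documented API): test a
# contrast between the first two group levels of the fit returned above.
# `counts` stands for a raw-counts DataFrame and `groups` for the per-sample
# labels (both assumptions, as in the earlier usage sketch).
fit, design = _edger_func_fit(counts, groups, 'QLGLM')
contrast = robjects.IntVector([-1, 1] + [0] * (len(design.colnames) - 2))
qlf = r('glmQLFTest')(fit, contrast=contrast)   # use glmLRT for 'GLM' fits
top = r('topTags')(qlf, n=r('Inf'))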
def multi(jurisdictions, causes):
    with openrlib.rlock:
        r = ro.r
        ro.globalenv['jurisdictions'] = jurisdictions
        ro.globalenv['causes'] = causes
        r_df = r.source('farrington_multiselect.R')
        cause_fac = ro.FactorVector(r_df[0][1])
        # map 1-based factor codes back to their level labels (currently unused)
        cause_group = [cause_fac.levels[i - 1] for i in r_df[0][1]]
        week_ending = [serial_date_to_string(i) for i in r_df[0][6]]
        observed = [int(i) for i in r_df[0][7]]
        alarm = [['', 'x'][int(i)] for i in r_df[0][9]]
        upperbound = [float(i) for i in r_df[0][10]]
        lowerbound = [float(i) for i in r_df[0][11]]
        df = {
            'week_ending': week_ending,
            'observed': observed,
            'alarm': alarm,
            'upperbound': upperbound,
            'lowerbound': lowerbound
        }
        return df
def _filter_expression(self, data):
    """
    Filter out low-expressing genes.
    :param data: R data.frame of counts
    :return: filtered and renormalized DGEList
    """
    # Get the groups. Unclear if this is used
    groups = robjects.FactorVector(self.labels)
    # Make a DGEList (Digital Gene Expression) from the counts
    dge = edgeR.DGEList(counts=data, group=groups)
    # Decide which genes to keep
    keep = edgeR.filterByExpr(dge, self.design)
    # Apply the boolean filter across rows, keep all columns, keep.lib.size=False
    dge = dge.rx(keep, True, False)
    # Recompute the normalization factors
    dge = edgeR.calcNormFactors(dge)
    return dge