def _load_folds_from_rdata(data_file, fold_id): """ (internal) returns folds from RData file in the pipeline :param data_file: :param fold_id: :param inner_fold_id: :return: """ fold_id = validate_fold_id(fold_id) r_variables = "data_file='%s'; fold_id='%s'" % (data_file, fold_id) import rpy2.robjects as rn from .rpy2_helper import r_clear if is_inner_fold_id(fold_id): r_cmd = """raw_data = new.env() load(data_file, envir=raw_data) folds = raw_data$cvindices[[fold_id]][,1] """ else: r_cmd = """raw_data = new.env() load(data_file, envir=raw_data) folds = raw_data$cvindices[[substring(fold_id, 1, 3)]][, as.double(substr(fold_id, 5, 6))] """ rn.reval(r_variables) rn.reval(r_cmd) folds = np.array(rn.r['folds']) folds = validate_folds(folds, fold_id) r_clear() return folds
def create_excel_file(self, filename: str) -> None: """ Create the Excel result file based on the `reactome_obj` in the R session :param filename: Path to the Excel file that will be created """ # inject the result path ri.globalenv["excel_result_file"] = ri.StrSexpVector([filename]) ro.reval(""" # get the pathways table pathway_result <- pathways(reactome_obj) # create the Excel file library(xlsx) # add the combined pathway data write.xlsx2(pathway_result, file = excel_result_file, sheetName = "Pathways", col.names = T, row.names = T, append = F) # add the expression values for every result if ("fold_changes" %in% result_types(reactome_obj)) { for (dataset_name in names(reactome_obj)) { fold_changes <- get_result(reactome_obj, type = "fold_changes", name = dataset_name) # add the fold-changes to the Excel file write.xlsx2(fold_changes, file = excel_result_file, sheetName = paste0(dataset_name, " - fold changes"), col.names = T, row.names = F, append = T) } } """)
def test_df_to_str(self):
    """Check data_frame_to_string's tab/newline layout and numeric precision."""
    # create the test data.frame
    ro.reval("""
        test_frame = data.frame(
            name = c("John", "Doe"),
            age = c(1, 2),
            row.names = c("Id1", "Id2")
        )
        test_frame_2 = data.frame(
            name = c("John", "Doe"),
            age = c(1.12345, 2.12345),
            row.names = c("Id1", "Id2")
        )
    """)

    r_data_frame = ri.globalenv["test_frame"]
    string_df = ReactomeRAnalyser.data_frame_to_string(r_data_frame)
    self.assertIsNotNone(string_df)
    # header row starts with an empty cell (row-name column)
    self.assertEqual("\\tname\\tage\\nId1\\tJohn\\t1.0\\nId2\\tDoe\\t2.0", string_df)

    # check the precision: fractional digits must survive the conversion
    r_data_frame2 = ri.globalenv["test_frame_2"]
    string_df2 = ReactomeRAnalyser.data_frame_to_string(r_data_frame2)
    self.assertIsNotNone(string_df2)
    self.assertEqual(
        "\\tname\\tage\\nId1\\tJohn\\t1.12345\\nId2\\tDoe\\t2.12345",
        string_df2)
def _mark_timestamp(self, blSegsL): """ mark segs in final sample """ # 此处应用R来进行求解 # 首先,求解每相邻数据的基线之差的集合 # # 或直接列出所有基线 # 然后,根据相邻数据的基线之差,映射到数据的非基线之上,确定归宿于哪一个 # 基线之差 # # 或找出落入基线之中的最大索引 # 最后,所有的数据点中最先落入基线之差的为目标时间戳 # # 根据该索引作为时间戳 from rpy2.robjects.packages import importr from rpy2.robjects import IntVector, StrVector, globalenv import rpy2.robjects as robjects GR = importr('GenomicRanges') IR = importr('IRanges') GRL = GR.GRangesList() globalenv["GRL"] = GRL for blSegs, idx in zip(blSegsL, range(len(blSegsL))): chromNames = StrVector([seg.chromName for seg in blSegs]) starts = IntVector([seg.start for seg in blSegs]) ends = IntVector([seg.end for seg in blSegs]) tempGR = GR.GRanges(seqnames = chromNames, ranges=IR.IRanges(starts, ends)) globalenv["tempGR"] = tempGR robjects.r("GRL[[{0}]]=tempGR".format(str(idx+1))) GRL = robjects.r["GRL"] # 此处由于list中保存的是指向目标Seg的指针,所以更新nonBLSegs即可 nonBlSegs = list(set(self._segPoolL[-1].segments) - set(blSegsL[-1])) chromNames = StrVector([seg.chromName for seg in nonBlSegs]) starts = IntVector([seg.start for seg in nonBlSegs]) ends = IntVector([seg.end for seg in nonBlSegs]) nonBlGR = GR.GRanges(seqnames = chromNames, ranges=IR.IRanges(starts, ends)) # fo = IR.findOverlaps(nonBlGR, GRL) # For large SCNA fo = IR.findOverlaps(nonBlGR, GRL, minoverlap=5000) globalenv["fo"] = fo robjects.reval("fom <- as.matrix(fo)") overlapIdx = np.array(list(robjects.r.fom)).reshape(tuple(reversed(robjects.r.fom.dim))) - 1 # [[2, 2, 3, 3], # [1, 2, 1, 2]] # print overlapIdx for index in set(overlapIdx[0,]): yIdxes = np.where(overlapIdx[0,]==index)[0] ts = np.max(overlapIdx[1,yIdxes]+1) nonBlSegs[index].tag = str(ts)
def _load_processed_data_rdata(file_name):
    """
    (internal) load a processed dataset from an RData file.

    :param file_name: path to the RData file
    :return: tuple (data, cvindices) where data has been normalized through
             set_defaults_for_data and cvindices comes from
             _load_cvindices_from_rdata
    """
    import rpy2.robjects as rn
    rn.reval("data = new.env(); load('%s', data)" % file_name)
    # bind the R environment once; all field reads below go through it
    # (the original mixed `r_data[...]` and `rn.r.data[...]` — same object)
    r_data = rn.r.data
    data_fields = list(r_data.keys())
    loaded_data = dict()

    # train / test / validation triplets of (X, Y, sample_weights) fields
    for xf, yf, sw in [('X', 'Y', 'sample_weights'),
                       ('X_test', 'Y_test', 'sample_weights_test'),
                       ('X_validation', 'Y_validation', 'sample_weights_validation')]:
        if xf in data_fields and yf in data_fields and len(np.array(r_data[yf])) > 0:
            loaded_data[yf] = np.array(r_data[yf]).flatten()
            # recode 0/1 labels as -1/+1
            loaded_data[yf][loaded_data[yf] == 0] = -1
            loaded_data[xf] = np.array(r_data[xf])
            # R may store X transposed (features x samples); fix orientation
            if loaded_data[xf].shape[1] == len(loaded_data[yf]):
                loaded_data[xf] = np.transpose(loaded_data[xf])
            if sw in data_fields:
                loaded_data[sw] = np.array(r_data[sw]).flatten()

    # variable names were stored under different keys in older files
    if 'variable_names' in data_fields:
        loaded_data['variable_names'] = np.array(r_data['variable_names']).tolist()
    elif 'X_headers' in data_fields:
        loaded_data['variable_names'] = np.array(r_data['X_headers']).tolist()
    elif 'X_header' in data_fields:
        loaded_data['variable_names'] = np.array(r_data['X_header']).tolist()

    # outcome name also has legacy aliases
    if 'outcome_name' in data_fields:
        loaded_data['outcome_name'] = np.array(r_data['outcome_name'])[0]
    elif 'Y_headers' in data_fields:
        loaded_data['outcome_name'] = np.array(r_data['Y_headers'])[0]
    elif 'Y_header' in data_fields:
        loaded_data['outcome_name'] = np.array(r_data['Y_header'])[0]

    if 'format' in data_fields:
        loaded_data['format'] = np.array(r_data['format'])[0]

    if 'partitions' in data_fields:
        loaded_data['partitions'] = np.array(r_data['partitions']).tolist()

    cvindices = _load_cvindices_from_rdata(file_name)
    data = set_defaults_for_data(loaded_data)
    return data, cvindices
def r_save_to_disk(file_name, variables_to_save=None):
    """Save variables from the R session into an RData file.

    :param file_name: target RData file path
    :param variables_to_save: iterable of R variable names; when None the
        bare `save(file=...)` form is issued
    :return: True once the file exists on disk
    """
    if variables_to_save is None:
        cmd = "save(file='%s')" % file_name
    else:
        cmd = "save(%s, file='%s')" % (", ".join(variables_to_save), file_name)
    rn.reval(cmd)
    assert (os.path.isfile(file_name))
    return True
def _save_data_as_rdata(file_name, data, cvindices):
    """
    (internal) save `data` and `cvindices` into an RData file.

    :param file_name: path of the RData file to create
    :param data: data dict; normalized via set_defaults_for_data and checked
    :param cvindices: cross-validation fold indices saved alongside the data
    :return: True on success (r_save_to_disk asserts the file exists)
    """
    import rpy2.robjects as rn
    from .rpy2_helper import r_assign, r_save_to_disk
    from rpy2.robjects import pandas2ri

    data = set_defaults_for_data(data)
    assert check_data(data)

    fields_to_save = [
        "format", "Y", "sample_weights", "outcome_name", "variable_names"
    ]

    try:
        for k in fields_to_save:
            r_assign(data[k], k)
    except:
        # NOTE(review): bare except drops into an interactive shell (ipsh)
        # — debugging aid; confirm it is intended to remain in production.
        from eqm.debug import ipsh
        ipsh()

    r_assign(cvindices, "cvindices")

    # feature matrix is passed as a pandas DataFrame so column names survive
    pandas2ri.activate()
    X_df = pd.DataFrame(data=data['X'])
    X_df.columns = data['variable_names']
    rn.r.assign('X', X_df)

    # test set
    has_test_set = ('X_test' in data) and ('Y_test' in data) and ('sample_weights_test' in data)
    if has_test_set:
        X_test_df = pd.DataFrame(data=data['X_test'])
        X_test_df.columns = data['variable_names']
        rn.r.assign('X_test', pandas2ri.py2ri(X_test_df))
        r_assign(data['Y_test'], 'Y_test')
        r_assign(data['sample_weights_test'], 'sample_weights_test')
    else:
        # create empty placeholders so the saved file always contains the
        # test-set variables, regardless of whether a test set exists
        rn.reval("""
        X_test = matrix(data=NA, nrow = 0, ncol = ncol(X));
        Y_test = matrix(data=NA, nrow = 0, ncol = 1);
        sample_weights_test = matrix(data=1.0, nrow = 0, ncol = 1);
        """)
    pandas2ri.deactivate()

    variables_to_save = fields_to_save + [
        "cvindices", "X", "X_test", "Y_test", "sample_weights_test"
    ]
    r_save_to_disk(file_name, variables_to_save)
    return True
def run(self) -> None:
    """Build the Excel, PDF, and R-script reports for the analysis result.

    Each generated filename is pushed onto `self.result_queue`; on failure
    the exception itself is queued instead. `self.on_complete` is always
    set when the method finishes.
    """
    try:
        # inject the analysis_result into the R session
        ri.globalenv["analysis_result_json"] = ri.StrSexpVector(
            [self.analysis_result.decode()])

        # inject the metadata
        ri.globalenv["include_interactors"] = ri.BoolSexpVector(
            [self.report_request.include_interactors])
        ri.globalenv["include_disease"] = ri.BoolSexpVector(
            [self.report_request.include_disease])

        # create the analysis result object
        LOGGER.debug("Creating result R object...")
        ro.reval("""
            library(ReactomeGSA)

            # convert the reactome object
            result_obj <- jsonlite::fromJSON(analysis_result_json)
            reactome_obj <- ReactomeGSA:::convert_reactome_result(result_obj)
        """)

        # all three artefacts share the same path prefix
        result_prefix = "/tmp/result_" + self.report_request.analysis_id

        # create the Excel file
        LOGGER.debug("Creating Excel file ...")
        excel_filename = result_prefix + ".xlsx"
        self.create_excel_file(excel_filename)
        self.result_queue.put(excel_filename)

        # create the PDF report
        LOGGER.debug("Creating PDF report...")
        pdf_filename = result_prefix + ".pdf"
        self.create_pdf_report(pdf_filename)
        self.result_queue.put(pdf_filename)

        # create the R script
        LOGGER.debug("Creating R script...")
        r_filename = result_prefix + ".r"
        self.create_r_script(r_filename)
        self.result_queue.put(r_filename)
    except Exception as e:
        # put the error message in the queue
        LOGGER.error("Error during report generation: " + str(e))
        self.result_queue.put(e)
    finally:
        LOGGER.debug("Setting on_complete")
        self.on_complete.set()
def install(download_dir):
    """Download the MEGSA beta archive, source its R code, and define the
    module-level `megsa(events)` significance test.

    :param download_dir: directory where MEGSA_beta.zip is cached
    """
    local_filename = "%s/MEGSA_beta.zip" % download_dir

    if not os.path.exists(local_filename):
        # NOTE(review): `urllib.urlretrieve` is the Python-2 API; under
        # Python 3 this would be `urllib.request.urlretrieve` — confirm the
        # project's target interpreter before changing.
        filename, response = urllib.urlretrieve(MEGSA_URL, local_filename)
    else:
        filename = local_filename

    nbsupport.util.check_digest(filename, "655e1ec48a67530672303d8ac7fbc925")

    with zipfile.ZipFile(local_filename) as archive:
        with io.TextIOWrapper(archive.open("version beta/MEGSA.R")) as stream:
            robjects.reval(stream.read())

    global megsa

    def megsa(events):
        """P-value for mutual exclusivity: 50:50 mixture of a point mass at
        0 and a 1-df chi-square for the likelihood-ratio statistic S."""
        s = robjects.r.funEstimate(events.T).rx2("S")[0]
        # scipy.stats.chisqprob was removed in SciPy 1.0; chi2.sf is the
        # documented equivalent survival function.
        return 0.5 * scipy.stats.chi2.sf(s, 1) + 0.5 * int(s == 0)
def scran_normalize(adata):
    """Normalize adata.X in place using scran pooling-based size factors.

    Stores the per-cell size factors in adata.obs['sf'], keeps the raw
    counts in adata.layers['counts'], and divides adata.X by the factors.

    :param adata: AnnData object; assumes adata.X holds raw counts
        (cells x genes) — TODO confirm with callers
    :return: the same AnnData object, modified in place
    """
    import numpy as np
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri
    from rpy2.robjects.packages import importr
    importr('scran')
    numpy2ri.activate()
    # scran expects a genes x cells matrix, hence the transpose
    ro.r.assign('mat', adata.X.T)
    qclust_params = 'mat'
    # qclust_params = f'mat, min.size={min_size}, max.size={max_size}'
    ro.reval(f'cl <- quickCluster({qclust_params})')
    csf_params = f'mat, clusters=cl'
    # csf_params = f'mat, clusters=cl, min.mean={min_mean}'
    sf = np.asarray(ro.reval(f'computeSumFactors({csf_params})'))
    adata.obs['sf'] = sf
    # preserve raw counts before normalizing
    adata.layers['counts'] = adata.X.copy()
    adata.X /= adata.obs['sf'].values[:, None]
    numpy2ri.deactivate()
    return adata
def create_pdf_report(self, pdf_filename) -> None: """" Create the PDF report based on the `reactome_obj` in the current R session. :param pdf_filename: The target filename of the PDF report. Will be overwritten if it exists. """ # inject the path used to store the xlsx file ri.globalenv["pdf_result_file"] = ri.StrSexpVector([pdf_filename]) try: ro.reval(""" library(ReactomeGSA.report) options(tinytex.verbose = TRUE) # create the report create_pdf_report(reactome_obj, pdf_result_file, include_disease = include_disease, include_interactors = include_interactors) """) except Exception as e: LOGGER.error("RRuntimeError: " + str(e))
def getProbFactors(fitted, events, evidence):
    """Run bnlearn's cpquery for each event under the given evidence.

    :param fitted: fitted bnlearn network, injected into the R environment
    :param events: iterable of R event expressions (strings)
    :param evidence: R evidence expression (string), shared by all queries
    :return: list of conditional probabilities, one per event
    """
    env = Environment()
    env['fitted'] = fitted
    template = "cpquery(fitted,({}),({}))"
    probs = []
    for event in events:
        probs.append(reval(template.format(event, evidence), envir=env)[0])
    return probs
def hugeLearnGraph(X, nonPara = True, method = "mb", nTunings = 20, modelSelectCrit = "ric", ebicTuning = 0.5, lambdaRatio = None, verbose = False):
    """Estimate an undirected graph from data X with the R `huge` package.

    :param X: (n, d) data matrix
    :param nonPara: apply the nonparanormal transform before estimation
    :param method: huge estimation method (e.g. "mb")
    :param nTunings: number of lambda values on the regularization path
    :param modelSelectCrit: selection criterion ("ric", "stars", "ebic",
        or "mbDefault" for the closed-form single-lambda MB estimate)
    :param ebicTuning: EBIC gamma tuning parameter
    :param lambdaRatio: optional lambda.min.ratio passed to huge
    :param verbose: forwarded to the R routines
    :return: (estG, sparsity) — adjacency matrix (ints) and sparsity values
    """
    n,d = X.shape
    if nonPara:
        #X = hugeR.huge_npn(X, verbose = verbose)
        # transform the data
        X = transform(X, returnNumpyArray= False, verbose = verbose)
    asR = robjects.r['as']
    if method == "mb" and modelSelectCrit == "mbDefault":
        # single value for the regularization parameter
        alpha = 0.05
        lambbda = 2/np.sqrt(n)*norm.ppf(1 - alpha/(2*d**2))
        # cannot pass "lambda = x" argument to hugeR.huge function directly because "lambda" is illegal variable name..
        lambda_r = robjects.FloatVector([lambbda])
        robjects.rinterface.globalenv['lambda_r'] = lambda_r # define variable in R environment
        lambdaArg = robjects.reval('lambda = lambda_r') # this can be passed
        est= hugeR.huge(X, lambdaArg, method = method, verbose = verbose, sym = "and")
        #print(robjects.r['get']('method',est))
        #print(robjects.r['get']('sym',est))
        path = robjects.r['get']('path',est)[0]
        # np.int was removed in NumPy 1.24; the builtin int is the
        # documented equivalent dtype specifier
        estG = np.array(asR(path,"matrix"),dtype = int)
    else:
        if lambdaRatio is not None:
            est= hugeR.huge(X,lambda_min_ratio = lambdaRatio, method = method, nlambda = nTunings, verbose = verbose)
        else:
            # estimate graphs for a range of hyperparameters
            est= hugeR.huge(X, method = method, nlambda = nTunings, verbose = verbose)
        # set seed to obtain reproducable results (depends on the numpy global seed) this affects only if criterion is "ric" or "stars"
        seed = np.random.randint(1,1e9)
        robjects.r['set.seed'](seed)
        #ebic_r = robjects.FloatVector([ebicTuning])
        #robjects.rinterface.globalenv['ebicGamma'] = ebic_r # define variable in R environment
        #ebicArg = robjects.reval('ebic.gamma = ebicGamma') # this can be passed
        # do model selection
        methodRes = hugeR.huge_select(est,ebic_gamma = ebicTuning, criterion = modelSelectCrit, verbose = verbose)
        G_R = robjects.r['get']('refit',methodRes)
        asR = robjects.r['as']
        estG = np.array(asR(G_R,"matrix"),dtype = int)
    # sparsity/ies for the estimated graph(s)
    sparsity = np.array(robjects.r['get']('sparsity',est))
    return(estG,sparsity)
def slingshot(adata, start, n_pcs=5, cl=None):
    """Run slingshot trajectory inference and attach pseudotimes to adata.

    Adds one obs column per lineage ("{cl}_lineage_{i}") and stores the
    lineage cluster paths under adata.uns['slingshot']['lineages'].

    :param adata: AnnData with adata.obsm['X_pca'] computed
    :param start: starting cluster label for slingshot (start.clus)
    :param n_pcs: number of principal components used
    :param cl: obs column holding the cluster labels
    :return: the same AnnData object, modified in place
    """
    import numpy as np
    import pandas as pd
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri, pandas2ri
    from rpy2.robjects.packages import importr
    importr('slingshot')
    numpy2ri.activate()
    pandas2ri.activate()
    ro.r.assign('pca', adata.obsm['X_pca'][:, :n_pcs])
    ro.r.assign('cl', adata.obs[cl])
    ro.reval('sds <- newSlingshotDataSet(pca, cl)')
    ro.reval(f'sce <- slingshot(sds, cl, start.clus="{start}")')
    # one pseudotime column per inferred lineage
    pt = pd.DataFrame(np.asarray(ro.reval('slingPseudotime(sce)')),
                      index=adata.obs_names)
    pt.columns = [f'{cl}_lineage_{c}' for c in pt.columns]
    # drop stale columns from a previous run before concatenating
    try:
        adata.obs = adata.obs.drop(pt.columns, axis=1)
    except KeyError:
        print('PT keys not dropped in obs dataframe: Not found.')
    adata.obs = pd.concat([adata.obs, pt], axis=1)
    adata.uns['slingshot'] = {}
    adata.uns['slingshot']['lineages'] = {}
    lineages = np.asarray(np.asarray(ro.reval('sce@lineages')))
    for i, l in enumerate(lineages):
        adata.uns['slingshot']['lineages'][i] = list(np.asarray(l))
    numpy2ri.deactivate()
    pandas2ri.deactivate()
    return adata
def __init__(self, mtype, seed, nmaxcomp):
    """Build a benchmark density-estimation dataset.

    :param mtype: 1, 2, 3 select simulated mixtures (sampled in R with a
        seeded RNG); "real" loads observations from data.csv
    :param seed: RNG seed (simulation) or group id column value (real data)
    :param nmaxcomp: number of Fourier components for the basis expansion
    """
    if mtype == 1:
        # 4-component Gaussian mixture, sampled in R for reproducibility
        ro.reval("""
        set.seed({})
        y <- rnorm(50, sample(c(-2, 0, 1, 2), 50, TRUE, c(.3, .2, .2, .3)))
        """.format(seed + 1))
        self.y = np.array(ro.r['y'], dtype=np.float64)
        self.transformation = "logit"
        def truedensity(y_):
            #y_unc = scipy.special.logit(y_)
            dnorm = stats.norm.pdf
            results = .3 * dnorm(y_, -2) + .2 * dnorm(y_, 0) +\
                .2 * dnorm(y_, 1) + .3 * dnorm(y_, 2)
            return results
            #abs jacobian transform
            #return results / abs(y_ - y_**2)
        self.truedensity = truedensity
    elif mtype == 2:
        # mixture of four Beta distributions with multinomial component sizes
        ro.reval("""
        set.seed({})
        Ns <- as.vector(rmultinom(1, 50, c(.2, .25, .35, .2)))
        y <- c(rbeta(Ns[1], 1.3, 1.3), rbeta(Ns[2], 1.1, 3),
               rbeta(Ns[3], 5, 1), rbeta(Ns[4], 1.5, 4))
        """.format(seed + 1))
        self.y = np.array(ro.r['y'], dtype=np.float64)
        self.transformation = None
        def truedensity(y_):
            dbeta = stats.beta.pdf
            return (.20 * dbeta(y_, 1.3, 1.3) + .25 * dbeta(y_, 1.1, 3) +
                    .35 * dbeta(y_, 5, 1) + .20 * dbeta(y_, 1.5, 4))
        self.truedensity = truedensity
    elif mtype == 3:
        # single Beta(2, 5)
        ro.reval("""
        set.seed({})
        y <- rbeta(50, 2, 5)
        """.format(seed + 1))
        self.y = np.array(ro.r['y'], dtype=np.float64)
        self.transformation = None
        def truedensity(y_):
            return stats.beta.pdf(y_, 2, 5)
        self.truedensity = truedensity
    elif mtype == "real":
        # real data: first column is the group id, second the observations
        tab = pd.read_csv("data.csv")
        tab.dropna(inplace=True)
        self.y = tab.loc[tab.iloc[:, 0] == seed].iloc[:, 1]
        self.y = np.array(self.y, dtype=np.float64)
        self.n = self.y.size
        self.transformation = {"transf": "fixed", "vmin": 50, "vmax": 107}
    # Fourier basis expansion of the observations (all branches)
    self.phi = npc.fourierseries(self.y, nmaxcomp)
def _load_folds_from_rdata(data_file, fold_id):
    """
    (internal) returns folds from RData file in the pipeline

    :param data_file: path to an RData file containing a `cvindices` object
    :param fold_id: fold identifier; normalized via validate_fold_id
    :return: numpy array of fold assignments, validated via validate_folds
    :raises IOError: if data_file does not exist
    """
    # fail fast with clear Python errors before touching R
    if os.path.isfile(data_file):
        file_extension = data_file.rsplit('.')[-1]
        assert file_extension.lower(
        ) == 'rdata', 'unsupported file extension: %r' % file_extension
    else:
        raise IOError('could not find data_file: %s' % data_file)

    fold_id = validate_fold_id(fold_id)
    r_variables = "data_file='%s'; fold_id='%s'" % (data_file, fold_id)

    import rpy2.robjects as rn
    from .rpy2_helper import r_clear

    if is_inner_fold_id(fold_id):
        # inner folds are stored as a one-column matrix under the full id
        r_cmd = """raw_data = new.env()
        load(data_file, envir=raw_data)
        folds = raw_data$cvindices[[fold_id]][,1]
        """
    else:
        # outer folds: first 3 chars name the matrix, chars 5-6 pick the column
        r_cmd = """raw_data = new.env()
        load(data_file, envir=raw_data)
        folds = raw_data$cvindices[[substring(fold_id, 1, 3)]][, as.double(substr(fold_id, 5, 6))]
        """

    rn.reval(r_variables)
    rn.reval(r_cmd)
    folds = np.array(rn.r['folds'])
    folds = validate_folds(folds, fold_id)
    r_clear()
    return folds
def r_assign(pval, rname, print_flag=False):
    """Assign a Python value to the R variable named `rname`.

    Dicts become R named lists (via r_assign_list); everything else goes
    through r_assign_value.

    :param pval: value to assign
    :param rname: non-empty name of the target R variable
    :param print_flag: when True, print the R variable after assignment
    :return: the flag returned by the underlying assignment helper
    """
    # isinstance instead of `type(...) is ...`: idiomatic, and also accepts
    # str/dict subclasses (e.g. OrderedDict), which behave identically here
    assert isinstance(rname, str)
    assert len(rname) > 0
    if isinstance(pval, dict):
        return_flag = r_assign_list(pval, rname)
    else:
        return_flag = r_assign_value(pval, rname)
    if print_flag:
        print(rn.reval("print(%s)" % rname)[0])
    return return_flag
def r_assign_list(pdict, rname):
    """Assign a Python dict to R as a named list called `rname`.

    Each entry is pushed through the temporary R variables `field_name` /
    `field_value`, which are removed afterwards.

    :param pdict: dict of field name -> value
    :param rname: name of the R list variable to create
    :return: True
    """
    rn.reval("%s = list();" % rname)
    for field, value in pdict.items():
        r_assign_str(field, "field_name")
        r_assign_value(value, "field_value")
        rn.reval("%s[[field_name]] = field_value" % rname)
    # drop the temporaries from the R environment
    rn.reval("rm(field_name); rm(field_value);")
    return True
def knitr(absfile):
    """Knit an R source file and return the rendered document's contents.

    :param absfile: absolute path of the file to knit (must contain '/'
        and an extension after the last '/')
    :return: contents of the file produced by knitr::knit
    """
    i = absfile.rindex('/')
    j = absfile.find('.', i)
    # figures go next to the source, in "<basename>_img/"
    figpath = absfile[0:j] + '_img/'
    # renamed from `reval` so it does not shadow robjects.reval conceptually
    r_code = """
    knitr::opts_chunk$set(echo=FALSE, fig.path='{figpath}')
    library('knitr')
    knit('{absfile}')
    """.format(figpath=figpath, absfile=absfile)
    print('--------------------------------------')
    print(r_code)
    print(type(r_code))
    r2 = robjects.reval(r_code)
    # knit() returns the output filename
    fname = r2[0]
    # with-statement: the handle is closed even if read() raises
    with open(fname, 'r') as f:
        return f.read()
def r_clear():
    """Remove every object from the R global environment (R `rm(list=ls())`)."""
    rn.reval('rm(list=ls());')
def analyse_request(self, request, gene_set_mappings, identifier_mappings, gene_set):
    """Run the gene-set analysis for every dataset of the request.

    :param request: analysis request with datasets, method name, and designs
    :param gene_set_mappings: per-dataset gene-set index mappings
    :param identifier_mappings: identifier mappings (unused here directly)
    :param gene_set: gene set object providing pathway id -> name mapping
    :return: list of AnalysisResultResults, one per dataset
    :raises AnalysisException: if a dataset has no experimental design
    """
    # clean the environment
    ro.reval("rm(list=ls())")

    # get the pathway id to name mapping
    pathway_names = self.dict_of_list_to_r(gene_set.gene_set_names)

    # process every dataset separately; progress runs from 0.3 to 1.0,
    # split evenly across datasets
    analysis_results = list()
    previous_progress = 0.3

    for dataset in request.datasets:
        # make sure the dataset has a design
        if dataset.design is None:
            raise AnalysisException(
                "Dataset '" + dataset.name +
                "' does not contain an experimental design.")

        LOGGER.debug("Analysing dataset " + dataset.name)

        # get the gene index
        gene_index = self.dict_of_list_to_r(
            gene_set_mappings[dataset.name].gene_set_indices, value_type=int)

        # prepare the dataset for the analysis - including pre-processing
        (expression_data, sample_data, design) = \
            self._prepare_dataset(dataset=dataset)

        self._update_status("Analysing dataset '{}' using {}".format(
            dataset.name, request.method_name),
            complete=previous_progress + (0.3 / len(request.datasets)))

        LOGGER.debug("Starting GSA...")
        result = self._perform_gsa(
            method=request.method_name,
            parameters=getattr(dataset, "parameter_dict", dict()),
            expression_data=expression_data,
            sample_data=sample_data,
            design=design,
            gene_index=gene_index,
            data_type=dataset.type,
            pathway_names=pathway_names,
            comparison_group_1=dataset.design.comparison.group1,
            comparison_group_2=dataset.design.comparison.group2)

        self._update_status("Analysing dataset '{}' using {}".format(
            dataset.name, request.method_name),
            complete=previous_progress + (0.5 / len(request.datasets)))

        LOGGER.debug("Estimating fold changes...")
        fold_changes = self._estimate_gene_fc(
            method=request.method_name,
            parameters=getattr(dataset, "parameter_dict", dict()),
            expression_data=expression_data,
            sample_data=sample_data,
            design=design,
            data_type=dataset.type,
            comparison_group_1=dataset.design.comparison.group1,
            comparison_group_2=dataset.design.comparison.group2)

        self._update_status("Analysing dataset '{}' using {}".format(
            dataset.name, request.method_name),
            complete=previous_progress + (0.7 / len(request.datasets)))

        LOGGER.debug("Adding pathway fold changes...")
        # add average fold-changes to the analysis result
        # pylint: disable=no-member
        result = ReactomeRAnalyser.preprocess.add_pathway_foldchanges(
            result, fold_changes, gene_index, expression_data)

        LOGGER.debug("Creating the analysis result...")
        analysis_result = AnalysisResultResults(
            name=dataset.name,
            pathways=ReactomeRAnalyser.data_frame_to_string(result),
            fold_changes=ReactomeRAnalyser.data_frame_to_string(fold_changes))

        analysis_results.append(analysis_result)
        previous_progress += 0.7 / len(request.datasets)
        LOGGER.debug("Dataset analysis complete")

    return analysis_results
""" import pandas as pd from rpy2.robjects import pandas2ri pandas2ri.activate() from rpy2.robjects import r r.data('prediction') df_iris = pandas2ri.ri2py(r[prediction]) from rpy2 import robjects Rdir = "I:/DOCUMENTS/WEGC/02_PhD_research/03_Data/ZAMG/SPARTACUS/TMAX/rda/Tx20130227.rda" f = r'/Tx20130227.rda' obj = Rdir + f m=robjects.r('matrix(1:6, nrow=2, ncol=3)') m = robjects.reval(obj) rdf = 'I:/DOCUMENTS/WEGC/02_PhD_research/03_Data/ZAMG/SPARTACUS/TMAX/rda/Tx20130227.rda' pandas2ri.ri2py(rdf) test = pd.read_csv(r'C:\Users\Kaddabadda\Documents\test.csv', index_col = [0])
def _save_data_as_rdata(file_name, data, cvindices):
    """
    (internal) save `data` and `cvindices` into an RData file (extended
    variant that also stores per-variable types/orderings and format-specific
    fields for the RULES / DCP formats).

    :param file_name: path of the RData file to create
    :param data: data dict; normalized via set_defaults_for_data and checked
    :param cvindices: cross-validation fold indices saved alongside the data
    :return: True on success (r_save_to_disk asserts the file exists)
    """
    import rpy2.robjects as rn
    from .rpy2_helper import r_assign, r_save_to_disk
    from rpy2.robjects import pandas2ri

    data = set_defaults_for_data(data)
    assert check_data(data)

    fields_to_save = [
        "format", "Y", "sample_weights", "outcome_name", "variable_names",
        "variable_types", "variable_orderings"
    ]

    # format-specific extra fields
    if data['format'] == FORMAT_NAME_RULES:
        fields_to_save += [
            "feature_groups", "feature_names", "feature_types",
            "feature_orderings", "feature_group_limits"
        ]
    elif data['format'] == FORMAT_NAME_DCP:
        fields_to_save += ['partitions']

    try:
        for k in fields_to_save:
            r_assign(data[k], k)
    except:
        # NOTE(review): bare except drops into an interactive shell (ipsh)
        # — debugging aid; confirm it is intended to remain in production.
        from dcptree.debug import ipsh
        ipsh()

    r_assign(cvindices, "cvindices")

    # map our variable types onto pandas column dtypes so the R data.frame
    # gets the intended column classes
    var_type_to_col_type = {
        'boolean': 'bool',
        'categorical': 'str',
        'numeric': 'float',
        'ordinal': 'str',
    }
    col_types = {
        n: var_type_to_col_type[data['variable_types'][n]]
        for n in data['variable_names']
    }

    pandas2ri.activate()
    X_df = pd.DataFrame(data=data['X'])
    X_df.columns = data['variable_names']
    X_df = X_df.astype(col_types)
    rn.r.assign('X', X_df)

    # test set
    has_test_set = ('X_test' in data) and ('Y_test' in data) and ('sample_weights_test' in data)
    if has_test_set:
        X_test_df = pd.DataFrame(data=data['X_test'])
        X_test_df.columns = data['variable_names']
        X_test_df = X_test_df.astype(col_types)
        rn.r.assign('X_test', pandas2ri.py2ri(X_test_df))
        r_assign(data['Y_test'], 'Y_test')
        r_assign(data['sample_weights_test'], 'sample_weights_test')
    else:
        # create empty placeholders so the saved file always contains the
        # test-set variables, regardless of whether a test set exists
        rn.reval("""
        X_test = matrix(data=NA, nrow = 0, ncol = ncol(X));
        Y_test = matrix(data=NA, nrow = 0, ncol = 1);
        sample_weights_test = matrix(data=1.0, nrow = 0, ncol = 1);
        """)
    pandas2ri.deactivate()

    variables_to_save = fields_to_save + [
        "cvindices", "X", "X_test", "Y_test", "sample_weights_test"
    ]
    r_save_to_disk(file_name, variables_to_save)
    return True
def anova2(x, print_summary=False):
    """
    Uses {car} Anova in R, via rpy2, to compute two-way repeated measures anova.

    ``x`` should be a pandas dataframe of the form::

        factor_a     a0            a1
        factor_b     b0     b1     b0     b1
        0           0.3   0.35   0.44   0.49
        1           0.5   0.47   0.92   1.20
        2          43.7  42.60  18.10  17.40
        3           3.8   4.50   9.20  10.40
        4          18.2  17.60  21.30  21.90
        5          22.4  23.10  19.30  19.80
        ...

    In the above, there are two "factors", which we have called "factor_a"
    and "factor_b". Here, each of the two factors has two "levels":
    ["a0", "a1"] and ["b0", "b1"]. Note that it is the fact that we have two
    *factors* that makes this a two-way anova, you *can* have more than two
    levels in each factor. This module includes ``_dummy_data_anova2``,
    which you can use here.

    Note on sphericity: The sphericity requirement is, roughly speaking,
    that all *pairs* of levels within a given factor must have roughly the
    same covariance. i.e. the "information" about a repeated measure is
    distributed evenly across all the levels rather than some of the levels
    being more correlated to each other than others. Note that if there are
    only two levels then there is only one covariance, so sphericity must be
    valid (see http://stats.stackexchange.com/a/59206). When sphericity is
    violated, the way to compensate is to reduce the number of degrees of
    freedom. There are three similar ways of doing this. The most
    conservative is the "lower bound", then "Greenhouse-G", then "Huynh-F".
    You are recommended to just look at the Greenhouse-G values.
    (see https://youtu.be/wkMwW_2_TzY?t=40m34s).

    Returns a namedtuple with three pandas dataframes: univariate_tests,
    sphericity_tests, and pval_adjustments. There is also an attribute
    "full_dict", the values of which are rpy2 objects and provide the full
    output of the anova.

    You need to install R, rpy2, and the car package in R. Good luck.
    DM, Jun 2015.
    """
    # NOTE(review): this function uses the Python-2 print statement below —
    # it will not parse under Python 3; confirm the target interpreter.
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as R
    pandas2ri.activate()
    car = importr("car")

    # recover the factor level combinations from the column MultiIndex
    level_values = x.columns.to_series().reset_index().drop(0, axis=1)
    level_names = x.columns.names
    x = x.copy()
    # flatten the two-level column index into single names (e.g. "a0b0")
    x.columns = [xx[0] + xx[1] for xx in x.columns]

    R.globalenv["data_x"] = R.r["as.matrix"](pandas2ri.py2ri(x))
    # intercept-only lm; the within-subject design comes from idata/idesign
    anova_r = car.Anova(R.r.lm("data_x ~ 1"),
                        idata=pandas2ri.py2ri(level_values),
                        idesign=R.reval("~" + "*".join(level_names)))
    R.r.rm("data_x")
    ret = R.r.summary(anova_r)
    if print_summary:
        print ret
    # R names use dots; make them attribute-friendly
    full_dict = {k.replace(".", "_"): v for k, v in ret.items()}

    def to_df(v):
        # some summary components are not matrix-like; return None for those
        try:
            return pd.DataFrame(pandas2ri.ri2py(v),
                                columns=v.colnames,
                                index=v.rownames)
        except TypeError:
            return None

    return ANOVA2(univariate_tests=to_df(full_dict["univariate_tests"]),
                  sphericity_tests=to_df(full_dict["sphericity_tests"]),
                  pval_adjustments=to_df(full_dict["pval_adjustments"]),
                  full_dict=full_dict)
def analyse_request(self, request: AnalysisInput, gene_set_mappings, identifier_mappings, gene_set):
    """Run the GSVA-style analysis for every dataset of the request.

    :param request: analysis request with datasets and method name
    :param gene_set_mappings: per-dataset gene-set index mappings
    :param identifier_mappings: identifier mappings (unused here directly)
    :param gene_set: gene set object providing pathway id -> name mapping
    :return: list of AnalysisResultResults, one per dataset
    :raises AnalysisException: if a required R package fails to load
    """
    # clean the environment
    ro.reval("rm(list=ls())")

    # get the pathway id to name mapping
    pathway_names = self.dict_of_list_to_r(gene_set.gene_set_names)

    # load the analyser R code
    LOGGER.debug("Processing data using R analysis code for {}".format(request.method_name.lower()))
    analysis_package = self.methods[request.method_name.lower()]
    LOGGER.debug("Retrieved analysis package")

    # load the libraries
    try:
        analysis_package.load_libraries()
    except Exception as e:
        LOGGER.critical("Failed to load required package: " + str(e))
        raise AnalysisException("Failed to load required R package")

    LOGGER.debug("R libraries loaded")

    # get the analysis-level parameters
    analysis_parameters = getattr(request, "parameter_dict", dict())

    # indicates whether the visualization should be disabled
    disable_visualization = False

    # if pathways are filtered using a list of pathways disable visualization
    if len(analysis_parameters.get("pathways", "")) > 0:
        disable_visualization = True

    # process every dataset separately; progress runs from 0.3 to 1.0
    analysis_results = list()
    previous_progress = 0.3

    for dataset in request.datasets:
        # create save sample names
        org_names = dataset.design.samples if dataset.design else dataset.df.dtype.names[1:]
        sample_names = self._create_save_names(org_names)

        # if there are more then MAX_SAMPLES, disable the visualization
        if len(sample_names) > ReactomeGSVARAnalyser.MAX_SAMPLES:
            disable_visualization = True

        LOGGER.debug("Converting expression data")
        # convert the expression data to an R matrix
        expression_data = self._convert_dataset(dataset, sample_names)

        # convert the fold_changes to the text representation
        # pylint: disable=no-member
        expression_data_id = ReactomeRAnalyser.preprocess.change_first_column(
            expression_data, rowname_column=ri.StrSexpVector(["Identifier"]))
        r_fold_change_text = ReactomeRAnalyser.data_frame_to_string(expression_data_id)

        LOGGER.debug("Converting gene_index")
        # get the gene index
        gene_index = self.dict_of_list_to_r(
            gene_set_mappings[dataset.name].gene_set_indices, value_type=int)

        self._update_status("Analysing dataset '{}' using {}".format(
            dataset.name, request.method_name),
            complete=previous_progress + (0.3 / len(request.datasets)))

        # perform the analysis
        LOGGER.debug("Starting GSVA analysis for {}".format(dataset.name))

        # use float before int to support scientific notation for max_size. This happens for large
        # numbers in the R package
        max_size = int(float(analysis_parameters.get("max_size", 1_000_000)))

        result = analysis_package.process(
            expression_data, gene_index,
            ri.StrSexpVector([dataset.type]),
            # These parameters are currently not visible to the user as
            # it might cause inconsistencies in the reactome result conversion
            ri.IntSexpVector([int(analysis_parameters.get("min_size", 0))]),
            ri.IntSexpVector([max_size]),
            ri.StrSexpVector([analysis_parameters.get("pathways", "")]))

        # add the pathway's name
        # pylint: disable=no-member
        result = ReactomeRAnalyser.preprocess.add_pathway_names(result, pathway_names)

        LOGGER.debug("GSVA analysis completed for {}".format(dataset.name))

        self._update_status("Analysing dataset '{}' using {}".format(
            dataset.name, request.method_name),
            complete=previous_progress + (0.6 / len(request.datasets)))

        # convert the data.frame to a string
        r_text_result = ReactomeRAnalyser.data_frame_to_string(result)

        # add the result
        analysis_results.append(AnalysisResultResults(
            name=dataset.name,
            pathways=r_text_result,
            fold_changes=r_fold_change_text))

        previous_progress += 0.7 / len(request.datasets)

    LOGGER.debug("Returning combined analysis result")

    # disable the visualization if set
    if disable_visualization:
        if not hasattr(request, "parameter_dict"):
            request.parameter_dict = dict()
        request.parameter_dict["create_reactome_visualization"] = "False"

    return analysis_results
# Scratch/demo script: create R objects via rpy2 and pull them into numpy.
import rpy2.robjects as r
import numpy as np

# NOTE(review): `r.require` / `r.bs` resolve attributes on the robjects
# module — confirm the rpy2 version in use exposes R functions this way.
r.require("splines")
r.reval("x = rnorm(5)")
r.reval("x = bs( x, df = 5)")
# fetch the R variable `x` as a flat numpy array
np.array(list(r.r.x))
# NOTE(review): Python-level `x` is undefined here — presumably scratch
# code meant to pass the R `x`; verify before reuse.
m = r.bs(x, df=5)
r.reval('m1 <- c(1:10)')
r.reval("m2 <- matrix(as.complex(m1), nrow=5)")
# reshape the flat R vector using the matrix's R dimensions
np.array(list(r.r.m2)).reshape(r.r.m2.dim)
def make_outfile(stan_file, data_file, author):
    """Parse a Stan model + R-dump data file and write a model-description
    JSON file ("<model_name>.json") in the current directory.

    :param stan_file: path to the .stan model (inside the example-models tree)
    :param data_file: path to the R-dump data file matching the model
    :param author: recorded in the JSON "added_by" field
    """
    print(stan_file)
    model_fname = stan_file[stan_file.rfind('/') + 1:-5]
    # fixed copy-paste bug: the original sliced `stan_file` with
    # data_file's slash index, producing a wrong data name
    data_fname = data_file[data_file.rfind('/') + 1:-5]

    f_stan = open(stan_file, 'r')
    stan_text = f_stan.readlines()

    # Parse stan file
    flag_data = False
    flag_parameters = False
    flag_model = False
    data_num = 0
    param_num = 0
    model_num = 0
    param_type = []
    data_type = []
    param_name = []
    data_name = []
    param_size = []
    data_size = []
    param_bounds = []
    data_bounds = []
    v_mu = []
    v_sigma = []
    v_bounds = []
    calc_order = []

    for text_line in stan_text:
        # highlight relevant sections
        if ('data {' in text_line):
            if ('transformed' not in text_line):
                flag_data = True
        elif ('parameters {' in text_line):
            if ('transformed' not in text_line):
                flag_parameters = True
        elif ('model {' in text_line):
            flag_model = True
        # handle each section
        elif (flag_data == True):
            if ('}' in text_line):
                flag_data = False
            else:
                [data_type, data_size, data_name, data_bounds,
                 data_num] = stan_read(text_line, data_type, data_size,
                                       data_name, data_bounds, data_num)
        elif (flag_parameters == True):
            if ('}' in text_line):
                flag_parameters = False
            else:
                [param_type, param_size, param_name, param_bounds,
                 param_num] = stan_read(text_line, param_type, param_size,
                                        param_name, param_bounds, param_num)
    f_stan.close()

    f_data = open(data_file, 'r')
    data_text = f_data.readlines()

    flag_array = False
    flag_start = False
    data_value = {}

    # Parse data file
    for text_line in data_text:
        if ('<-' in text_line):
            if (flag_array == True):
                # a new assignment terminates the running array literal
                flag_array = False
                data_value[index] = np.array(robjects.reval(array_string))
            elif (flag_start == True):
                # fixed: the original constructed this exception without
                # raising it, silently ignoring the malformed entry
                raise Exception('Variable has unassigned value in data file')
            name_end = text_line.find('<')
            name = remove_whitespace(text_line[0:name_end], False)
            index = try_find(name, data_name)
            if (index != -1):
                if (data_size[index] == 1):
                    if ('c(' in text_line):
                        raise Exception(
                            'Specified data sizes do not match those in the data file'
                        )
                    else:
                        value_start = text_line.rfind('-') + 1
                        value = remove_whitespace(text_line[value_start:], False)
                        if (is_number(value) == True):
                            if (value[-1:] == '\n'):
                                data_value[index] = value[:-1]
                            else:
                                data_value[index] = value
                        else:
                            # value continues on the next line
                            flag_start = True
                else:
                    if ('c(' in text_line):
                        # start collecting a (possibly multi-line) c(...) literal
                        flag_array = True
                        value_start = text_line.find('-') + 1
                        if (text_line[-2:] == '\n'):
                            array_string = text_line[value_start:-3]
                        else:
                            array_string = text_line[value_start:]
                    else:
                        flag_start = True
        elif (flag_array == True):
            # continuation line of a c(...) literal
            if (text_line[-2:] == '\n'):
                array_string += text_line[:-3]
            else:
                array_string += text_line
        elif (flag_start == True):
            if (data_size[index] == 1):
                if ('c(' in text_line):
                    raise Exception(
                        'Specified data sizes do not match those in the data file'
                    )
                else:
                    if (text_line[-1:] == '\n'):
                        value = remove_whitespace(text_line[:-1], False)
                    else:
                        value = remove_whitespace(text_line, False)
                    if (is_number(value) == True):
                        data_value[index] = value
                        flag_start = False
            else:
                if ('c(' in text_line):
                    flag_array = True
                    #value_start = text_line.rfind('c')
                    #array_string = text_line[value_start:]
                    if (text_line[-2:] == '\n'):
                        array_string = text_line[:-3]
                    else:
                        array_string = text_line
                    flag_start = False
    if (flag_array == True):
        # flush an array literal that ran to end-of-file
        data_value[index] = np.array(robjects.reval(array_string))
    f_data.close()

    #if len(data_value) != len(data_name):
    #    raise Exception('Specified data sizes do not match those in the data file')

    # evaluate each parameter's total size using the parsed data values
    eval_param_size = []
    if (param_num > 0):
        for size_string in param_size:
            if (isinstance(size_string, int)):
                eval_param_size.append(size_string)
            elif (isinstance(size_string, str)):
                # not list, only one element to deal with
                eval_param_size.append(
                    eval(get_eval_string(size_string, data_name, data_value)))
            else:
                eval_param_size.append(1)
                for size_string_element in size_string:
                    # evaluate elements and multiply together to get total size
                    #print(get_eval_string(size_string_element,data_name,data_value))
                    eval_param_size[-1] *= eval(
                        get_eval_string(size_string_element, data_name,
                                        data_value))

    # get keywords from directory
    index = -1
    ind = 0
    keywords = []
    url = 'https://github.com/stan-dev/example-models'
    while ind < len(stan_file) - 5:
        if (stan_file[ind] == '/'):
            ind += 1
            index += 1
            if (ind < len(stan_file) - 5):
                keywords.append(stan_file[ind])
                url += '/' + stan_file[ind]
                ind += 1
            else:
                url += '/'
                break
        elif (ind == 0):
            index += 1
            keywords.append(stan_file[ind])
            url += '/' + stan_file[ind]
            ind += 1
        else:
            keywords[index] += stan_file[ind]
            url += stan_file[ind]
            ind += 1

    outf = open("{}.json".format(model_fname), "w")
    # write .json file
    outf.write('{\n')
    outf.write('    "name": "{}",\n'.format(model_fname))
    outf.write('    "keywords": {},\n'.format(json.dumps(keywords)))
    outf.write('    "urls": "{}",\n'.format(url))
    outf.write('    "model_name": "{}",\n'.format(model_fname))
    outf.write('    "data_name": "{}",\n'.format(data_fname))
    outf.write('    "reference_posterior_name": null,\n')
    outf.write('    "references": [],\n')
    outf.write('    "dimensions": {\n')
    if (len(param_name) > 1):
        for n in range(0, len(param_name) - 1):
            outf.write('        "{}": {},\n'.format(param_name[n],
                                                    eval_param_size[n]))
        outf.write('        "{}": {}\n'.format(param_name[-1],
                                               eval_param_size[-1]))
    else:
        outf.write('        "{}": {}\n'.format(param_name[0],
                                               eval_param_size[0]))
    outf.write('    },\n')
    outf.write('    "added_date": "{}",\n'.format(date.today()))
    outf.write('    "added_by": "{}"\n'.format(author))
    outf.write('}')
    # fixed: the output file was never closed, leaking the handle and
    # risking unflushed output
    outf.close()
def mast_de(adata, key, perc=-1, covs=''):
    """Run one-vs-rest differential expression with MAST through rpy2.

    For every category in ``adata.obs[key]`` a MAST hurdle model
    (``zlm(~group + n_genes ...)``) is fitted comparing that group against
    all remaining cells; the per-gene logFC coefficient and the
    FDR-corrected likelihood-ratio p-value are collected.

    :param adata: expression container; assumes an AnnData-like object with
        ``.raw.X``, ``.obs_names``, ``.raw.var_names`` and a categorical
        column ``adata.obs[key]`` — TODO confirm with callers
    :param key: name of the ``adata.obs`` column holding the group labels
    :param perc: if > 0, a gene is kept when detected in more than
        ``length(colnames(mat)) * perc`` cells; otherwise genes must be
        detected in at least as many cells as the smallest group
    :param covs: extra covariate terms appended verbatim to the R model
        formula (e.g. ``' + batch'``); empty string for none
    :return: pandas DataFrame indexed by gene id, with ``<group>_coef`` and
        ``<group>_fdr`` columns for every group
    """
    import numpy as np
    import pandas as pd
    import rpy2.robjects as ro
    from rpy2.robjects import numpy2ri, pandas2ri
    from rpy2.robjects.packages import importr

    # start from an empty R workspace (this clobbers anything already
    # present in the session's global environment)
    ro.reval('rm(list=ls())')
    importr('MAST')
    numpy2ri.activate()
    pandas2ri.activate()
    ro.reval('options(mc.cores=2)')
    # push the raw cells x genes matrix into R as `mat`
    ro.r.assign(
        'mat',
        pd.DataFrame(adata.raw.X,
                     index=adata.obs_names,
                     columns=adata.raw.var_names))
    print('Filtering genes')
    if perc > 0:
        # NOTE(review): the threshold is `length(colnames(mat)) * perc`,
        # i.e. a fraction of the number of GENES, not of cells — confirm
        # this is intended (nrow(mat) would be the cell count).
        ro.reval('mat <- mat[,colSums(mat > 0) > length(colnames(mat)) * ' +
                 str(perc) + ']; dim(mat);')
    else:
        # keep genes detected in at least `smallest group size` cells
        # (strictly greater than min-1)
        ro.reval('mat <- mat[,colSums(mat > 0) > ' +
                 str(adata.obs[key].value_counts().min() - 1) +
                 ']; dim(mat);')
    # MAST's FromMatrix needs a per-cell annotation data.frame; make sure
    # it has at least one column
    if adata.obs.shape[1] == 0:
        adata.obs['Barcode'] = list(adata.obs.index)
    ro.r.assign('cdat', adata.obs)
    # per-gene annotation: one row per (filtered) column of `mat`
    ro.reval('fdat <- as.data.frame(colnames(mat)); \
        row.names(fdat) <- fdat[,1]')
    # build the SingleCellAssay object; MAST expects genes x cells, hence t()
    ro.reval('raw <- FromMatrix(t(mat), cdat, fdat)')
    print('Data loaded')
    de = None
    for group in adata.obs[key].cat.categories:
        print(f'Group {group}')
        # Relabel every other group as "-1" so the model reduces to a
        # one-vs-rest contrast, fit the hurdle model, then merge the
        # discrete-component LRT p-values with the continuous logFC terms.
        # NOTE(review): the formula hard-codes an `n_genes` covariate, so
        # `adata.obs` must contain that column — confirm.
        cmd = 'group <- colData(raw)$' + key + '; \
            levels(group) <- factor(c(unique(group), "-1")); \
            group[group != "' + group + '"] = "-1"; \
            colData(raw)$group <- group; \
            zlmCond <- zlm(~group + n_genes' + covs + ', raw); \
            summaryCond <- summary(zlmCond, doLRT="group' + group + '"); \
            summaryDT <- summaryCond$datatable; \
            fcHurdle <- merge(summaryDT[contrast=="group' + group + '" & component=="H",.(primerid, `Pr(>Chisq)`)], summaryDT[contrast=="group' + group + '" & component=="logFC", .(primerid, coef, ci.hi, ci.lo)], by="primerid"); \
            fcHurdle[,fdr:=p.adjust(`Pr(>Chisq)`, "fdr")];'
        ro.reval(cmd)
        # pull the merged result table back into python column by column
        index = list(ro.reval('data.frame(fcHurdle)$primerid'))
        coef = list(ro.reval('data.frame(fcHurdle)$coef'))
        fdr = list(ro.reval('data.frame(fcHurdle)$fdr'))
        group_de = pd.DataFrame(
            {
                f'{group}_coef': coef,
                f'{group}_fdr': fdr
            },
            index=index,
        )
        # accumulate columns; on the first pass `de` is None, which
        # pd.concat drops silently
        de = pd.concat([de, group_de], axis=1)
    return de
def _load_cvindices_from_rdata(data_file):
    """
    (internal) load the `cvindices` object stored in an RData file in the pipeline

    :param data_file: path to an RData file containing a `cvindices` list
    :return: dict mapping fold ids to flat numpy arrays of fold assignments,
             cast to the smallest sufficient unsigned integer dtype;
             K-by-R fold matrices (keys like 'K05') are expanded into one
             entry per repeat (keys like 'K05N00')
    """
    import rpy2.robjects as rn
    from .rpy2_helper import r_clear

    r_variables = "data_file = '%s'" % data_file
    r_cmd = """
    raw_data = new.env()
    load(data_file, envir=raw_data)
    all_fold_ids = names(raw_data$cvindices)
    list2env(raw_data$cvindices, globalenv())
    remove(raw_data, cvindices);
    """
    rn.reval(r_variables)
    rn.reval(r_cmd)
    all_fold_ids = np.array(rn.r['all_fold_ids'])

    cvindices = {}
    max_fold_value = 0
    for key in all_fold_ids:
        try:
            folds = np.array(rn.r[key]).flatten()
            max_fold_value = max(max_fold_value, np.max(folds))
            cvindices[key] = folds
        except Exception:
            # best-effort: a missing or malformed entry must not abort the
            # whole load, so warn and keep going
            warnings.warn('failed to load fold %s from file %s' %
                          (key, data_file))

    # cast as unsigned integers to save storage space
    # (<= is the correct boundary: a value of exactly 255 still fits uint8,
    #  65535 fits uint16, 4294967295 fits uint32)
    if max_fold_value <= 255:
        storage_type = 'uint8'
    elif max_fold_value <= 65535:
        storage_type = 'uint16'
    elif max_fold_value <= 4294967295:
        storage_type = 'uint32'
    else:
        storage_type = 'uint64'

    for key in cvindices.keys():
        cvindices[key] = cvindices[key].astype(storage_type)

    # break down fold matrices (keys like 'K05', one column per repeat)
    # into individual fold vectors keyed '<key>N<repeat>'
    all_keys = list(cvindices.keys())
    for key in all_keys:
        if key[0] == 'K' and len(key) == 3:
            fold_matrix = cvindices.pop(key)
            n_repeats = fold_matrix.shape[1]
            for r in range(n_repeats):
                # NOTE(review): repeats are numbered from N00 here, while the
                # R-side loader (_load_folds_from_rdata) treats the two digits
                # as a 1-based column index (N01 == first column) — confirm
                # which convention callers rely on.
                folds = fold_matrix[:, r].flatten().astype(storage_type)
                fold_id = '%sN%02d' % (key, r)
                cvindices[fold_id] = folds

    # cleanup in the R environment just in case
    r_clear()
    return cvindices
def r_assign_empty_list(rname):
    """Create an empty list in the R global environment under *rname*.

    :param rname: non-empty string naming the R variable to create
    :return: True on success
    :raises AssertionError: if *rname* is not a non-empty string
    """
    # Validate the type before the length so that a non-string argument
    # (e.g. an int) fails with the intended AssertionError instead of a
    # TypeError raised by len().
    assert isinstance(rname, str)
    assert len(rname) > 0
    # NOTE(review): relies on a module-level `rn` (rpy2.robjects) binding;
    # sibling helpers in this file import it function-locally — confirm it
    # exists at module scope.
    rn.reval('%s = list()' % rname)
    return True
def run(self) -> None:
    """Worker entry point: extract expression data from an ExpressionAtlas R file.

    Loads the RData file at ``self.r_file_path`` (expected to contain a
    one-element ``experimentSummary`` list), determines whether it holds
    RNA-seq counts or normalised microarray data, and puts a dict with keys
    ``data_type``, ``metadata`` and ``expression_values`` (the two frames
    serialised to strings) on ``self.result_queue``.  On failure the
    Exception object itself is queued instead.  ``self.on_complete`` is set
    on every exit path (the success-path set plus the one in ``finally``
    is harmless — setting an event twice is idempotent).
    """
    try:
        # inject the r_file_path into the R session
        ri.globalenv["filename"] = ri.StrSexpVector([self.r_file_path])

        # process the file using the R session
        LOGGER.debug("Extracting expression data from R file")

        ro.reval("""
            # load the file
            load(filename)

            # make sure it only contains 1 experiment
            if (length(experimentSummary) != 1) {
                stop("Error: Unexpected number of experiments")
            }

            # get the metadata
            metadata <- NA
            data_type <- NA
            expression_values <- NA

            # test if it's RNA-seq or microarray
            if (is(experimentSummary[[1]], "RangedSummarizedExperiment")) {
                # load the required library
                library(SummarizedExperiment)

                data_type <- "rnaseq_counts"
                metadata <- colData(experimentSummary[[1]])
                expression_values <- data.frame(assays(experimentSummary[[1]])$counts)
            } else if (is(experimentSummary[[1]], "ExpressionSet")) {
                # load the required library
                library(Biobase)

                # TODO: test for microarray normalisation
                data_type <- "microarray_norm"
                metadata <- pData(experimentSummary[[1]])
                expression_values <- data.frame(exprs(experimentSummary[[1]]))
            } else {
                stop("Error: Unknown assay type encountered.")
            }

            # add a "sample.id" column for the metadata
            metadata$sample.id <- rownames(metadata)
            metadata <- metadata[, c(ncol(metadata), 1:(ncol(metadata)-1))]

            # convert to a standard data.frame
            metadata <- data.frame(metadata)

            # add a "gene" column for the expression values
            expression_values$gene <- rownames(expression_values)
            expression_values <- expression_values[, c(ncol(expression_values), 1:(ncol(expression_values)-1))]
        """)

        # always check whether to exit
        if self.exit:
            return

        # convert the R objects to python objects
        data_type = str(ri.globalenv["data_type"][0])
        # presumably signals liveness to a supervising process — see base class
        self.heartbeat()

        LOGGER.debug("Converting metadata data.frame to string")
        metadata_string = ReactomeRAnalyser.data_frame_to_string(
            ri.globalenv["metadata"])
        self.heartbeat()

        # bail out quickly if a shutdown was requested meanwhile
        if self.exit:
            return

        LOGGER.debug("Converting expression data.frame to string")
        expression_value_string = ReactomeRAnalyser.data_frame_to_string(
            ri.globalenv["expression_values"])
        self.heartbeat()

        if self.exit:
            return

        # save the result and mark the on_complete event
        LOGGER.debug("Returning results through the queue")
        self.result_queue.put({
            'data_type': data_type,
            'metadata': metadata_string,
            'expression_values': expression_value_string
        })
        self.on_complete.set()
    except Exception as e:
        # put the error message in the queue so the consumer can re-raise it
        LOGGER.error("Error during loading of R file: " + str(e))
        self.result_queue.put(e)
    finally:
        LOGGER.debug("Setting on_complete")
        self.on_complete.set()