def convert_rdata_to_dataframe(filename):
    """Small helper that converts the objects in an RData file into pandas
    DataFrames, returned as a dict keyed by position. There is currently no
    value-error checking."""
    import pandas as pd
    from rpy2.robjects import r as R
    from rpy2.robjects import pandas2ri
    import rpy2.robjects as ro

    rd = R.load(filename)
    obj = R[rd[0]]  # fetched before activating pandas2ri: rpy2 types
    # An R matrix carries a single set of dimnames; a list of data.frames
    # carries one set per element.
    if 'matrix' in str(type(obj)).lower():
        column_names = [obj.colnames]
        index_names = [obj.rownames]
    else:
        column_names = [[c for c in rdf.colnames] for rdf in obj]
        index_names = [[i for i in rdf.rownames] for rdf in obj]
    pandas2ri.activate()
    converted = R[rd[0]]  # re-fetched after activating: numpy/pandas types
    raw_df_l = []
    if 'ndarray' in str(type(converted)).lower():
        raw_df_l.append(converted)
    else:
        for rdf in ro.vectors.DataFrame(converted):
            raw_df_l.append(rdf)
    full_df_dict = {}
    for i_, (raw_df, colnames, rownames) in enumerate(
            zip(raw_df_l, column_names, index_names)):
        full_df_dict[i_] = pd.DataFrame(raw_df, columns=colnames,
                                        index=rownames)
    pandas2ri.deactivate()
    return full_df_dict
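# Hypothetical usage of convert_rdata_to_dataframe: 'example.RData' is a
# placeholder path, assumed to contain either a named R matrix or a list of
# data.frames; each entry of the returned dict is one pandas DataFrame.
dfs = convert_rdata_to_dataframe('example.RData')
for idx, df in dfs.items():
    print(idx, df.shape)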
def tune_model(scenario, params, log):
    irace_command = create_command(scenario, params, log)
    # Run irace
    process = subprocess.Popen(irace_command.split(), stdout=subprocess.PIPE)
    output, error = process.communicate()
    # Load the irace output and transform it to a csv file
    pandas2ri.activate()
    r.load(log.as_posix())
    r.library('irace')
    b = r.getFinalElites(r.iraceResults, n=0)
    with localconverter(default_converter + pandas2ri.converter):
        r_pd_params = b
    out_csv = Path(log.parent, log.stem + '.csv')
    r_pd_params.to_csvfile(out_csv.as_posix(), sep=',')
    # Import into pandas for pythonic cleaning
    py_pd_params = pd.read_csv(out_csv)
    py_pd_params = py_pd_params.drop(columns=['.ID.', '.PARENT.'])
    # Give the rows their correct order
    correct_table = {}
    for c in py_pd_params.columns:
        rows = []
        for j in py_pd_params[c].keys():
            rows.append(py_pd_params[c][j])
        correct_table[c] = rows
    correct_table = pd.DataFrame(data=correct_table)
    correct_table.to_csv(out_csv)
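# Minimal sketch of driving tune_model, assuming pathlib.Path inputs; the
# scenario/parameter file names are placeholders, not taken from the source.
from pathlib import Path

tune_model(Path('scenario.txt'), Path('parameters.txt'), Path('irace.Rdata'))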
def some_rpy2():
    flash('Loading data...please wait')
    r.load('mtu_inf_111813.RData')
    pm = r['predictor.mats']
    pmm = pm.rx(1)
    dataframe = r['data.frame']
    df = dataframe(pmm)
    firstcol = df.rx(1)
    seccol = df.rx(2)
    lattice = importr('lattice')
    xyplot = lattice.xyplot
    rprint = robjects.globalenv.get("print")
    #formula = Formula('firstcol ~ seccol')
    #formula.getenvironment()['firstcol'] = df.rx2(1)
    #formula.getenvironment()['seccol'] = df.rx2(2)
    #p = lattice.xyplot(formula)
    grdevices = importr('grDevices')
    #filenm = app.config['IMGS_FOLDER'] + 'hist.png'
    filenm = 'hist.png'  # why is this in tmp still???
    grdevices.png(file=filenm, width=512, height=512)
    p = r.histogram(df.rx2(1))
    rprint(p)  # works
    grdevices.dev_off()
    return render_template("hist.html", image='static/tmp/hist.png')
def get_gene_model(gene, ref_panel, weights_dir='fusion_twas/WEIGHTS'):
    weight_file = f'{weights_dir}/{ref_panel}/{gene}_500kb.wgt.RDat'
    assert os.path.exists(os.path.dirname(weight_file))
    if not os.path.exists(weight_file):
        print(f'WARNING: weight file {weight_file} missing for gene {gene}! '
              f'Removing from the analysis.')
        return None
    r.load(weight_file)
    performance = r['cv.performance'][0]
    sorted_order = np.argsort(performance)
    best_model_index = sorted_order[-1]  # model with highest performance
    if best_model_index == 2:
        best_model_index = sorted_order[-2]  # if top1 is best, take 2nd-best
    model_weights = r['wgt.matrix'][:, best_model_index]
    if np.isnan(model_weights).all():
        print(f'WARNING: Best model for gene {gene} has all-nan weights! '
              f'Removing from the analysis.')
        return None
    if (model_weights == 0).all():
        print(f'WARNING: Best model for gene {gene} has all-0 weights! '
              f'Removing from the analysis.')
        return None
    rs_numbers = r.snps['V2'].values
    assert len(model_weights) == len(rs_numbers)
    model_weights = pd.Series(data=model_weights, index=rs_numbers)
    # Remove SNPs with 0 weight (or nan weight)
    model_weights = model_weights[model_weights != 0]
    return model_weights
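# Hedged usage sketch for get_gene_model; 'SORT1' and 'GTEx.Whole_Blood' are
# placeholder gene / reference-panel names, not taken from the source.
weights = get_gene_model('SORT1', 'GTEx.Whole_Blood')
if weights is not None:
    print(weights.head())  # nonzero SNP weights indexed by rs number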
def plotIDR(output_file, input_prefixes):
    '''create IDR plots.

    This code is taken from the R script batch-consistency-plot.r
    within the IDR package.
    '''
    dirname = os.path.dirname(__file__)
    R.source(os.path.join(dirname, "WrapperIDR.r"))
    R('''df.txt = 10''')
    R('''uri.list <- list()
         uri.list.match <- list()
         ez.list <- list()
         legend.txt <- c()
         em.output.list <- list()
         uri.output.list <- list()''')
    npair = len(input_prefixes)
    for x, input_prefix in enumerate(input_prefixes):
        R.load(input_prefix + "-uri.sav")
        R.load(input_prefix + "-em.sav")
        i = x + 1
        R('''uri.output.list[[%(i)i]] <- uri.output;
             em.output.list[[%(i)i]] <- em.output;
             # reverse =T for error rate;''' % locals())
        R('''ez.list[[%(i)i]] <- get.ez.tt.all(em.output,
                 uri.output.list[[%(i)i]]$data12.enrich$merge1,
                 uri.output.list[[%(i)i]]$data12.enrich$merge2);''' % locals())
        R('''# URI for all peaks
             uri.list[[%(i)i]] <- uri.output$uri.n;
             # URI for matched peaks
             uri.match <- get.uri.matched(em.output$data.pruned, df=df.txt);
             uri.list.match[[%(i)i]] <- uri.match$uri.n;
          ''' % locals())
        legend = "%(i)i = %(input_prefix)s" % locals()
        R('''legend.txt[%(i)i] <- '%(legend)s';''' % locals())
    R.pdf(output_file)
    R('''par(mfcol=c(2,3), mar=c(5,6,4,2)+0.1)''')
    R('''plot.uri.group(uri.list, NULL, file.name=NULL,
             c(1:%(npair)i), title.txt="all peaks");
         plot.uri.group(uri.list.match, NULL, file.name=NULL,
             c(1:%(npair)i), title.txt="matched peaks");
         plot.ez.group(ez.list, plot.dir=NULL, file.name=NULL,
             legend.txt=c(1:%(npair)i), y.lim=c(0, 0.6));
         plot(0, 1, type="n", xlim=c(0,1), ylim=c(0,1),
             xlab="", ylab="", xaxt="n", yaxt="n");
         legend(0, 1, legend.txt, cex=0.6);''' % locals())
    R["dev.off"]()
def pandas_load(name):
    '''Loads an .RData file (an R dataframe file) and returns it as a pandas
    dataframe.

    :param name: .RData filename (e.g. 'subset.Rdata')
    :return: pandas dataframe object
    '''
    pandas2ri.activate()
    r.load(name)
    # r.ls() lists the objects active in the R environment; the file is
    # assumed to contain a single dataframe, so take the first object.
    df = pandas2ri.ri2py(r[r.ls()[0]])
    return df
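# Minimal sketch, reusing the 'subset.Rdata' filename from the docstring;
# pandas_load converts whatever r.ls() lists first, so keep one object per file.
df = pandas_load('subset.Rdata')
print(df.head())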
def get_rdata(url):
    # For testing; probably want to do this a different way in production TODO
    response = urllib2.urlopen(url)
    html = response.read()
    local_path = "rdata" + url.replace(
        "http://data.war-on-ice.net", "").replace("http://war-on-ice.com", "")
    with open(local_path, "w") as fp:
        fp.write(html)
    robj = r.load(local_path)
    rdata = {}
    keys = {}
    for sets in robj:
        myRData = pandas2ri.ri2py(r[sets])
        rdata[sets] = []
        keys[sets] = set()
        # convert to DataFrame
        if not isinstance(myRData, pd.DataFrame):
            myRData = pd.DataFrame(myRData)
        for element in myRData:
            keys[sets].add(element)
            counter = 0
            for value in myRData[element]:
                if counter >= len(rdata[sets]):
                    rdata[sets].append({})
                rdata[sets][counter][element] = value
                counter += 1
    return rdata
def __init__(self, path):
    if not os.path.isfile(path):
        errormessage = "PEXO output not found in the specified path: {}".format(path)
        raise FileNotFoundError(errormessage)
    self.path = path
    self.contents = r.load(path)
def _download_and_import_RData_file(url):
    filename, headers = urlretrieve(url)
    # Load the RData file into R and get the name of the new variable created
    r_obj_name = r.load(filename)[0]
    # Load that variable and convert to a pandas DataFrame
    df = pandas2ri.ri2py(r[r_obj_name])
    return df
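# Hypothetical call; the URL is a placeholder for any RData file reachable
# over HTTP whose only object is a dataframe.
df = _download_and_import_RData_file('https://example.com/dataset.RData')
print(df.shape)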
def prelude1_Rdata(connection):
    # print(parse_csv_with_DatesAndIDs('subset.csv'))
    # fix the weather directly in sql (one time only :) ):
    # inserter(con_in, 'Weather', 'City_date')
    # load the datasets:
    connection.row_factory = sqlite3.Row
    cur = connection.cursor()
    r.load('flights.Rdata')  # name = 'subset.Rdata'
    # r('diablo <- df[1350500:5819079, ]')
    for i in range(5819079 // 10000 + 1):
        cur.execute("SELECT name FROM sqlite_master "
                    "WHERE type='table' AND name='Delays{}'".format(i))
        delays = pandas_load_by_parts(r, i)
        # weather = pd.read_sql('select * from Weather', con_in)
        # correct the column names:
        # di = {}
        # for item in weather.axes[1]:
        #     di[item] = item.strip()
        #     print(item.strip())
        # weather.rename(columns=di, inplace=True)
        # write to sql:
        # weather.to_sql('Weather', con_out)
        delays.to_sql('Delays_2nd_{}'.format(i), connection)
def _read_ExpressionSet_RData(RData):
    """Read ExpressionSet RData into Rpy2 robjects.

    RData: Path to the input RData file. The ExpressionSet must be the only
    object in the RData.

    Return Rpy2's eSet object, assayData, featureData and phenotypeData.
    """
    importr('Biobase')
    rdata = r.load(RData)
    eSet = r.get(rdata)            # rpy2 ExpressionSet object (assumed)
    assayData = r.assayData(eSet)  # rpy2 environment object
    fData = r.fData(eSet)          # rpy2 DataFrame object
    pData = r.pData(eSet)          # rpy2 DataFrame object
    return eSet, assayData, fData, pData
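# Sketch of unpacking the returned rpy2 objects; 'eset.RData' is a placeholder
# file whose only object is assumed to be a Biobase ExpressionSet.
eSet, assayData, fData, pData = _read_ExpressionSet_RData('eset.RData')
exprs = assayData['exprs']  # expression matrix, still an R object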
def get_rdata(url):
    try:
        robj = r.load(r.url(url))
        rdata = {}
        keys = {}
        for sets in robj:
            myRData = pandas2ri.ri2py(r[sets])
            rdata[sets] = []
            keys[sets] = set()
            # convert to DataFrame
            if not isinstance(myRData, pd.DataFrame):
                myRData = pd.DataFrame(myRData)
            for element in myRData:
                keys[sets].add(element)
                counter = 0
                for value in myRData[element]:
                    if counter >= len(rdata[sets]):
                        rdata[sets].append({})
                    rdata[sets][counter][element] = value
                    counter += 1
        r.closeAllConnections()
        return rdata
    except Exception:
        return None
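# Hypothetical call; the URL is a placeholder for an RData file that R's
# url() can open directly. The function returns None on any failure.
tables = get_rdata('https://example.com/games.RData')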
def fromRData(sce_class, rdata):
    rs4_object = robjects.r[r.load(rdata)[0]]
    sce = sce_class.fromRS4(rs4_object)
    sce.rs4 = rs4_object
    return sce
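# Hypothetical usage: SingleCellExperiment is a placeholder class exposing the
# fromRS4 constructor the snippet relies on; 'sce.rdata' holds one RS4 object.
sce = fromRData(SingleCellExperiment, 'sce.rdata')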
def matrix(self):
    pandas2ri.activate()
    matrix = robjects.r[r.load("./tests/gene_marker_matrix.rdata")[0]]
    print("RHO", matrix.shape)
    return matrix
weight_dir = f'fusion_twas/WEIGHTS/{ref_panel}'
trait_to_sumstats = {
    'CAD': 'fusion_twas/cardiogramplusc4d/cad.sumstats',
    'LDL': 'fusion_twas/LDL/ldl.sumstats',
    'Crohns': 'fusion_twas/crohns/crohns.sumstats',
}
sumstats_file = trait_to_sumstats[trait_name]
rs_to_z = pd.read_table(sumstats_file, usecols=['SNP', 'Z'],
                        index_col='SNP')['Z']
rs_to_p = pd.Series(2 * np.where(rs_to_z > 0, norm.sf(rs_to_z),
                                 norm.cdf(rs_to_z)),
                    index=rs_to_z.index)
dfs = {}
models = 'blup', 'lasso', 'top1', 'enet', 'prs'
for gene in locus_genes:
    weight_file = f'{weight_dir}/{gene}_500kb.wgt.RDat'
    r.load(weight_file)
    performance = r['cv.performance'][0]
    sorted_order = np.argsort(performance)
    best_model_index = sorted_order[-1]  # model with highest performance
    if best_model_index == 2:
        best_model_index = sorted_order[-2]  # if top1 is best, take 2nd-best
    df = pd.DataFrame({'weight': r['wgt.matrix'][:, best_model_index]},
                      index=r.snps['V2'].values)
    df = df[df['weight'] != 0]
    # map() doesn't work with dicts when used on indices
    df['p'] = df.index.map(rs_to_p.__getitem__)
    df = df.sort_values('p')
    if gene == causal_gene:
        GWAS_hits = df.index[:6]
    # noinspection PyUnboundLocalVariable
    for GWAS_hit in GWAS_hits:
        df[f'LD with {GWAS_hit}'] = df.index.map(
            lambda rs: get_LD(chrom, rs, GWAS_hit, use_STARNET=True))
    dfs[gene] = df
def items():
    r.load("{}.rdata".format("items123"))
    d = pandas2ri.ri2py(r["items123"])
    # The first half of the vector holds the (integer) index, the second
    # half holds the data.
    s = pd.Series(index=list(map(int, d[:len(d) // 2])),
                  data=d[len(d) // 2:])
    s.to_pickle("items.pd")
    return s
def convert_from_R(name):
    r.load("{}.RData".format(name))
    # NB: the loaded file is expected to define an R object named 'res_max2'.
    df = pandas2ri.ri2py(r["res_max2"])
    df = convert_to_data_format(df)
    df.to_pickle("{}.pd".format(name))
    return df
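# Placeholder call; note the file '<name>.RData' must define an R object
# literally named 'res_max2', since the function hard-codes that lookup.
df = convert_from_R('my_results')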
def read_comadre(self):
    r.load(self.path)
    self._raw_data = r.comadre
def get_data(cohort, assembly='tophatV04', normalization='raw', filter='none',
             counter='htseq', sync=True, metaCounts=False, countDir='data/in',
             phenoFile='data/in.RData', filterFile=None):
    """Loads data from RData count files in `countDir` and pheno data from
    `phenoFile`.

    Example
    -------
    >>> counts, pheno = get_data('ukd1')

    Parameters
    ----------
    cohort String
        Determines the cohort to load (e.g. 'ukd1', 'ukd2', 'ukd4', 'osr1',
        'osr2', 'ut1', 'prad', 'pcap').
    assembly String
        The assembly used to define the features that were counted
        (e.g. 'tophatV04' or 'stringtieV05').
    normalization String
        Which normalization of the counts to use (e.g. 'raw', 'cpm', 'tpm',
        'vst', 'rlog', 'normTransform').
    filter String
        The filter applied before normalizing the data
        (e.g. 'none' or 'q9').
    counter String
        The tool and summarization used to count the features, e.g. 'kalGene'
        stands for kallisto counts that were summarized to gene counts.
    sync Bool
        A boolean indicating whether counts and pheno data should be
        restricted to common samples and ordered equally.
    metaCounts Bool
        A boolean indicating whether features that start with '__'
        (e.g. '__alignment_not_unique') should be included.
    countDir Path
        A directory where the expression data is stored in files
        `<normalization>-<filter>-<cohort>-<assembly>-counts.RData`.
    phenoFile Path
        Path to a file that contains the pheno data as an R dataframe.
    filterFile Path
        A text file containing the names of features that should be used.

    Value
    -----
    The function returns the two objects `counts, pheno`.
    counts pandas.dataframe
        A pandas data frame containing the counts with samples as columns
        and features as rows.
    pheno pandas.dataframe
        A pandas data frame containing the phenotypic data with samples
        as rows.
    """
    # check types
    stringArgs = [cohort, assembly, normalization, filter, counter]
    isString = [isinstance(arg, str) for arg in stringArgs]
    if not all(isString):
        raise TypeError(
            "All arguments but `sync` and `metaCounts` must be strings.")
    if not isinstance(sync, bool):
        raise TypeError("The argument `sync` must be a boolean value.")
    if not isinstance(metaCounts, bool):
        raise TypeError("The argument `metaCounts` must be a boolean value.")
    # check normalization
    norms = [
        'raw', 'tpm-normalized', 'vst-normalized', 'rlog-normalized',
        'normTransform-normalized', 'cpm-normalized'
    ]
    normInd = [n.startswith(normalization) for n in norms]
    nHits = sum(normInd)
    if nHits < 1:
        raise ValueError(
            "The argument `normalization` needs to be an abbreviation for one of "
            + ', '.join(norms))
    elif nHits > 1:
        raise ValueError(
            "The abbreviation in the argument `normalization` is ambiguous.")
    else:
        normTerm = list(compress(norms, normInd))[0]
    if normTerm == 'cpm-normalized':
        normTerm = 'raw'
        doCPM = True
    else:
        doCPM = False
    if counter == 'kallisto':
        counter = 'kalGene'
    if counter == 'htseq':
        fnParts = [normTerm, filter, cohort, assembly]
    else:
        fnParts = [normTerm, filter, cohort, assembly, counter]
    # look for counts file
    cFileName = '-'.join(fnParts) + '-counts.RData'
    countsFile = os.path.join(countDir, cFileName)
    if not os.path.isfile(countsFile):
        raise IOError('There is no counts file for the given parameters: ' +
                      countsFile)
    pandas2ri.activate()
    # look for pheno file
    if sync and not os.path.isfile(phenoFile):
        raise IOError('The pheno data file could not be found: ' + phenoFile)
    elif not os.path.isfile(phenoFile):
        warnings.warn('The pheno data file could not be found: ' + phenoFile)
        pheno = pd.DataFrame()
    else:
        r.load(phenoFile)
        r('ind <- sapply(pheno, mode) == "logical"')
        # only float arrays have `NaN`
        r('pheno[ind] = lapply(pheno[ind], as.numeric)')
        # remove duplicate samples
        r('pheno <- subset(pheno, Cohort != "UKDP1")')
        pheno = r('pheno').set_index('ID')
    # load counts file
    r.load(countsFile)
    r('colnames(Counts) <- make.names(colnames(Counts))')
    if filterFile is not None:
        if not os.path.isfile(filterFile):
            raise IOError(
                f'The given filter file `{filterFile}` does not exist.')
        r(f'features <- readLines("{filterFile}")')
        r('Counts <- subset(Counts, rownames(Counts) %in% features)')
    counts = pd.DataFrame(r('Counts'),
                          index=r('rownames(Counts)'),
                          columns=r('colnames(Counts)'))
    if not metaCounts:
        normalFeatures = [not f.startswith('__') for f in counts.index.values]
        counts = counts.loc[normalFeatures, :]
    if doCPM:
        counts = 1e6 * counts / counts.apply(sum, axis=0)
    if sync:
        commonSample = sorted(set(pheno.index).intersection(counts.columns))
        counts = counts[commonSample]
        pheno = pheno.loc[commonSample]
    return counts, pheno