def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2): """" Invoke a named "correlation" R metric and convert the R dataframe result into a Pandas dataframe. :param r_stream1: an r_stream object :param r_stream2: an r_stream object :param metric_function_name: the name of the set of metrics :return: """ R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric') # NOTE: Conversion of dataframes only works if you activate but we don't want conversion # NOTE: to always be automatic so we deactivate() after we're done converting. pandas2ri.activate() r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1) r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2) pandas2ri.deactivate() # TODO: Can we just activate/deactivate before/after R_function() without converting # TODO: r_evalresp1/2 ahead of time? # Calculate the metric r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2) r_dataframe = _R_metricList2DF(r_metriclist) pandas2ri.activate() df = pandas2ri.ri2py_dataframe(r_dataframe) pandas2ri.deactivate() # Convert columns from R POSIXct to pyton UTCDateTime df.starttime = df.starttime.apply(UTCDateTime) df.endtime = df.endtime.apply(UTCDateTime) return df
def mbpls_ade4(data, num_comp=20):
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri
    import time

    x1, x2, y = data
    pandas2ri.activate()  # activate easy conversion from pandas to R dataframes
    # push the dataframes into the R global environment
    robjects.globalenv['x1'] = pandas2ri.py2ri_pandasdataframe(x1)
    robjects.globalenv['x2'] = pandas2ri.py2ri_pandasdataframe(x2)
    robjects.globalenv['ref'] = pandas2ri.py2ri_pandasdataframe(y)
    start_time = time.time()
    # execute R code using the global environment variables defined above
    robjects.r('''
        library(ade4)
        library(adegraphics)
        dudiY.act <- dudi.pca(ref, center = TRUE, scale = TRUE, scannf = FALSE, nf = 1)
        ktabX.act <- ktab.list.df(list(mainPerformance = x1, mainInput = x2))
        resmbpls.act <- mbpls(dudiY.act, ktabX.act, scale = TRUE,
                              option = "none", scannf = FALSE, nf = ''' + str(num_comp) + ''')
        bip <- resmbpls.act$bip
        ''')
    return start_time
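# Hedged usage sketch (not from the original project): mbpls_ade4 expects `data`
# to be a (x1, x2, y) tuple of pandas DataFrames with the same number of rows, and
# requires the R packages ade4 and adegraphics. The toy frames below are
# illustrative placeholders only.
def _example_mbpls_ade4():
    import numpy as np
    import pandas as pd
    rows = 50
    x1 = pd.DataFrame(np.random.rand(rows, 5),
                      columns=['perf_' + str(i) for i in range(5)])
    x2 = pd.DataFrame(np.random.rand(rows, 3),
                      columns=['input_' + str(i) for i in range(3)])
    y = pd.DataFrame({'ref': np.random.rand(rows)})
    started = mbpls_ade4((x1, x2, y), num_comp=2)
    return started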
def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2): """" Invoke a named "correlation" R metric and convert the R dataframe result into a Pandas dataframe. :param r_stream1: an r_stream object :param r_stream2: an r_stream object :param evalresp1: pandas DataFrame of evalresp FAP for r_stream1 :param evalresp2: pandas DataFrame of evalresp FAP for r_stream2 :return: """ R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric') # NOTE: Conversion of dataframes only works if you activate but we don't want conversion # NOTE: to always be automatic so we deactivate() after we're done converting. pandas2ri.activate() r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1) r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2) pandas2ri.deactivate() # TODO: Can we just activate/deactivate before/after R_function() without converting # TODO: r_evalresp1/2 ahead of time? # Calculate the metric r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2) r_dataframe = _R_metricList2DF(r_metriclist) pandas2ri.activate() df = pandas2ri.ri2py_dataframe(r_dataframe) pandas2ri.deactivate() # Convert columns from R POSIXct to pyton UTCDateTime df.starttime = df.starttime.apply(UTCDateTime) df.endtime = df.endtime.apply(UTCDateTime) return df
def getCorrelations(self, dataframe): """ Perform hierarchical clustering on a dataframe of expression values Arguments --------- dataframe: pandas.Core.DataFrame a dataframe containing gene IDs, sample IDs and gene expression values Returns ------- corr_frame: pandas.Core.DataFrame a dataframe of a pair-wise correlation matrix across samples. Uses the Pearson correlation. """ # set sample_id to index pivot = dataframe.pivot(index="sample_name", columns="transcript_id", values="TPM") transpose = pivot.T # why do I have to resort to R???? r_df = py2ri.py2ri_pandasdataframe(transpose) R.assign("p.df", r_df) R("""p.mat <- apply(p.df, 2, as.numeric)""") R("""cor.df <- cor(p.mat)""") r_cor = R["cor.df"] py_cor = py2ri.ri2py_dataframe(r_cor) corr_frame = py_cor return corr_frame
def pandas_to_dataframe(pd_frame, check_names=False):
    r_frame = pandas2ri.py2ri_pandasdataframe(pd_frame)
    r_frame.colnames = pd_frame.columns
    if not check_names:
        r_frame.rownames = pd_frame.index
    return r_frame
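# Hedged usage sketch (illustrative only): pandas_to_dataframe assumes pandas2ri is
# already imported in the calling module; the toy frame below is a placeholder.
def _example_pandas_to_dataframe():
    import pandas as pd
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]},
                      index=['r1', 'r2', 'r3'])
    r_df = pandas_to_dataframe(df)
    return r_df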
def to_trajr(trj): """Convert trajectory to R `trajr` object. Default fps is 30. Args: trajectory (:class:`~traja.main.TrajaDataFrame`): trajectory Returns: traj (:class:`rpy2.robjects.vectors.DataFrame`): column names are ['x', 'y', 'time', 'displacementTime', 'polar', 'displacement'] .. doctest:: >>> import traja >>> df = traja.TrajaDataFrame({'x':range(5),'y':range(5)}) >>> trjr_df = traja.rutils.to_trajr(df) # doctest: +SKIP >>> [x for x in trjr_df.names] # doctest: +SKIP ... ['x', 'y', 'id', 'time', 'displacementTime', 'polar', 'displacement'] """ from traja.trajectory import _get_time_col trajr = import_trajr() if "id" not in trj.__dict__.keys(): trj["id"] = 0 time_col = _get_time_col(trj) if time_col == "index": trj["time"] = trj.index time_col = "time" fps = trj.fps spatial_units = trj.spatial_units or "m" time_units = trj.time_units or "s" trj_rdf = rpandas.py2ri_pandasdataframe(trj) trajr_trj = trajr.TrajFromCoords( trj_rdf, xCol="x", yCol="y", timeCol=time_col or rpy2.rinterface.NULL, fps=fps or 30, spatialUnits=spatial_units, timeUnits=time_units, ) return trajr_trj
def LD_usingR(H0):
    import numpy as np
    import pandas as pd
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri

    robjects.r('options(warn=-1)')
    robjects.r('library(genetics)')
    # recode alleles (0 -> 'G', non-zero -> 'A') and collapse consecutive pairs of
    # haplotype rows into "A/G"-style genotype strings, one row per individual
    genotype = H0.applymap(lambda x: ('A', 'G')[x == 0])
    genotype = pd.concat(
        [pd.concat([genotype.iloc[i], genotype.iloc[i + 1]], axis=1).apply(lambda x: '/'.join(x), axis=1)
         for i in np.arange(0, H0.shape[0], 2)], axis=1).T
    c = robjects.r['LD'](robjects.r['makeGenotypes'](pandas2ri.py2ri_pandasdataframe(genotype.astype('str'))))
    c = pd.Series(map(lambda x: pd.DataFrame(pandas2ri.ri2py(x)), c[1:]), index=list(c.names[1:])).apply(
        lambda x: x.fillna(0))
    for x in c:
        x += x.T
        x.index = H0.columns
        x.columns = H0.columns
    c.apply(lambda x: np.fill_diagonal(x.values, None))
    return c
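# Hedged usage sketch (illustrative only): LD_usingR assumes H0 is a 0/1 haplotype
# DataFrame with an even number of rows (two consecutive rows per individual) and
# SNP identifiers as columns, and requires the R `genetics` package. The toy data
# below is a made-up placeholder.
def _example_LD_usingR():
    import numpy as np
    import pandas as pd
    rng = np.random.RandomState(0)
    # 20 individuals -> 40 haplotype rows, 4 SNP columns
    H0 = pd.DataFrame(rng.randint(0, 2, size=(40, 4)),
                      columns=['snp1', 'snp2', 'snp3', 'snp4'])
    # pandas Series of symmetric LD matrices keyed by LD statistic
    ld_tables = LD_usingR(H0)
    return ld_tables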
def pythonWrapper4Pet(dataframe, snps, covars, trait1, trait2,
                      model1, scriptsdir, model2, resamples=999):
    '''
    This is just a Python wrapper around the R code for the PET calculations
    '''

    py2ri.activate()
    E.info("Checking regression models")
    if model1 == "logistic":
        R('''trait1.mod <- binomial''')
        R('''trait1.link <- "logit" ''')
    elif model1 == "linear":
        R('''trait1.mod <- gaussian''')
        R('''trait1.link <- "identity" ''')

    if model2 == "logistic":
        R('''trait2.mod <- binomial''')
        R('''trait2.link <- "logit" ''')
    elif model2 == "linear":
        R('''trait2.mod <- gaussian''')
        R('''trait2.link <- "identity" ''')

    E.info("Running {} regression for trait 1: {}".format(model1, trait1))
    E.info("Running {} regression for trait 2: {}".format(model2, trait2))

    R('''source("%(scriptsdir)s/PET_functions.R")''' % locals())
    E.info("Pushing data objects into the R environment")
    # push everything into the R environment
    r_df = py2ri.py2ri_pandasdataframe(dataframe)
    R.assign("data.df", r_df)

    r_snps = ro.StrVector([sp for sp in snps])
    R.assign("snp.list", r_snps)

    E.info("Parsing covariates")
    covars = covars.split(",")
    r_covar = ro.StrVector([cv for cv in covars])
    R.assign("covar.list", r_covar)
    E.info("{} covariates found to adjust "
           "in regression models".format(len(covars)))

    # clean up, replacing "missing values" with NAs for R
    R('''data.df[data.df == -9] <- NA''')
    R('''pet_results <- list()''')

    # loop over all SNPs, calculate PCC and p-value
    # this takes a long time <- need to think of speed ups
    # possible Python-pure implementation, i.e. with LIMIX?
    E.info("Iteratively calculating PCC for all SNPs")
    R('''results <- loopPET(data.df=data.df, trait1="%(trait1)s", trait2="%(trait2)s", '''
      '''trait1.link=trait1.link, trait2.link=trait2.link, '''
      '''trait1.mod=trait1.mod, trait2.mod=trait2.mod, covars=covar.list,'''
      '''resamples=%(resamples)i, snp.list=snp.list)''' % locals())

    R('''out.res <- data.frame(do.call(rbind, results))''')
    R('''colnames(out.res) <- c("PCC", "pvalue")''')
    py_out = py2ri.ri2py_dataframe(R["out.res"])

    return py_out
def R_var_importance(nsamples=40000, data_store=None):
    base = importr('base')

    ###################################################
    # load dataframe
    store = pd.HDFStore(data_store)
    print(store)
    #pandas2ri.activate()
    Xtrain = store['Xtrain']
    ytrain = store['ytrain']
    #Xtest, Xtrain, ytrain, Xval, yval, test_idx, val_idx = prepareAllFeatures()

    # sample
    if nsamples != -1:
        if isinstance(nsamples, str) and 'shuffle' in nsamples:
            print("Shuffle train data...")
            rows = np.random.choice(len(Xtrain.index), size=len(Xtrain.index), replace=False)
        else:
            rows = np.random.choice(len(Xtrain.index), size=nsamples, replace=False)
        print("unique rows: %6.2f" % (float(np.unique(rows).shape[0]) / float(rows.shape[0])))
        Xtrain = Xtrain.iloc[rows, :]
        ytrain = ytrain.iloc[rows]

    store.close()

    pandas2ri.activate()
    print(Xtrain.info())
    print(Xtrain.describe(include='all'))
    Xtrain_R = pandas2ri.py2ri_pandasdataframe(Xtrain)
    ytrain_R = pandas2ri.py2ri_pandasseries(ytrain)
    #print Xtrain_R

    ###################################################
    # R-code
    # http://stackoverflow.com/questions/27801409/get-field-values-from-rpy2-random-forest-object
    r = robjects.r
    r['options'](warn=-1)
    r.library('randomForest')
    rf = r.randomForest(Xtrain_R, ytrain_R, ntree=250, importance=True, do_trace=1)
    df_imp_R = rf.rx("importance")
    df_imp_R = base.as_data_frame(df_imp_R)
    df_imp = pandas2ri.ri2py(df_imp_R)
    df_imp = df_imp.sort(columns=['importance.IncNodePurity'], ascending=False)
    print(df_imp)
    with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
        print(list(df_imp.index))
    #print r.dimnames(rf[8])
    r.varImpPlot(rf, sort=True, n_var=30)
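# Hedged usage sketch (illustrative only): R_var_importance expects an HDF5 store
# containing 'Xtrain' (features) and 'ytrain' (target) keys, PyTables for the store,
# and the R randomForest package. The store path and toy data below are placeholders.
def _example_R_var_importance(store_path='features.h5'):
    import numpy as np
    import pandas as pd
    X = pd.DataFrame(np.random.rand(500, 4), columns=['f1', 'f2', 'f3', 'f4'])
    y = pd.Series(np.random.rand(500), name='target')
    with pd.HDFStore(store_path) as store:
        store['Xtrain'] = X
        store['ytrain'] = y
    R_var_importance(nsamples=200, data_store=store_path)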
def testColoc(trait1, trait2, trait1_type, trait2_type, maf_table,
              gene_list=None, trait1_prev=None, trait2_prev=None,
              chromosome=None, start=None, end=None):
    '''
    Perform colocalization testing between two traits.

    Arguments
    ------
    trait1: pandas.core.dataframe
      A data frame containing the summary statistics for trait 1

    trait2: pandas.core.dataframe
      A data frame containing the summary statistics for trait 2

    trait1_type: string
      Either `cc` or `quant`, denoting the type of trait 1

    trait2_type: string
      Either `cc` or `quant`, denoting the type of trait 2

    maf_table: pandas.core.dataframe
      Data frame containing SNP IDs and MAF

    gene_list: list
      A list of genes to restrict analysis to.  Either trait 1 or
      trait 2 must be a quantitative trait

    trait1_prev: float
      Prevalence of trait1 if binary

    trait2_prev: float
      Prevalence of trait2 if binary

    chromosome: int
      Chromosome to restrict the colocalisation analysis to

    start: int
      start co-ordinate to restrict analysis to.  Must also provide
      `chromosome`.  1-based index, closed [start, end]

    end: int
      end co-ordinate to restrict analysis to.  Must also provide
      `chromosome` and `start`.  1-based index, closed [start, end]

    Returns
    -------
    coloc_results: pandas.core.dataframe
      A data frame containing each region (e.g. genes) and the posterior
      probability in favour of each hypothesis:
      H0 - no association with trait1 or trait2, and no colocalisation
      H1 - association with trait 1, but no colocalisation
      H2 - association with trait2, but no colocalisation
      H3 - association with trait1 and 2, but no colocalisation
      H4 - association with trait1 and 2, and colocalised
    '''

    # push all elements into the R environment
    R('''sink(file="sink.text")''')
    R('''suppressPackageStartupMessages(library(coloc))''')
    R('''source("/ifs/devel/projects/proj045/gwas_pipeline/R_scripts/coloQtl.R")''')

    E.info("Pushing results tables into R environment")
    py2ri.activate()
    r_trait1 = py2ri.py2ri_pandasdataframe(trait1)
    R.assign("r.trait1", r_trait1)

    r_trait2 = py2ri.py2ri_pandasdataframe(trait2)
    R.assign("r.trait2", r_trait2)

    r_maf = py2ri.py2ri_pandasdataframe(maf_table)
    R.assign("r.mafs", r_maf)

    if trait1_prev:
        R.assign("trait1.prev", trait1_prev)
    else:
        R('''trait1.prev <- NULL''')
    if trait2_prev:
        R.assign("trait2.prev", trait2_prev)
    else:
        R('''trait2.prev <- NULL''')

    E.info("Checking for gene list")
    if gene_list:
        E.info("Gene list contains {} genes".format(len(set(gene_list))))
        r_genes = ro.StrVector([rx for rx in set(gene_list)])
        R.assign("gene.list", r_genes)
        E.info("Iterating over gene list")
        R('''res.df <- geneListSnpColocQtl(gene_list=gene.list,'''
          '''results_table=r.trait1, MAF_table=r.mafs, '''
          '''eqtl_table=r.trait2, trait_type="%(trait1_type)s", '''
          '''prev=trait1.prev)''' % locals())
        R('''genes <- rownames(res.df)''')
        genes = [gx for gx in R["genes"]]
    else:
        R('''res.df <- TwoTraitSnpColocQtl(trait1_table=r.trait1,'''
          '''trait2_table=r.trait2, MAF_table=r.mafs, '''
          '''trait1_type="%(trait1_type)s", trait2_type="%(trait2_type)s",'''
          '''prev1=trait1.prev, prev2=trait2.prev)''' % locals())
        R('''genes <- dim(res.df)[1]''')
        genes = R["genes"]

    coloc_results = py2ri.ri2py_dataframe(R["res.df"])
    coloc_results.index = genes
    coloc_results.columns = ["nSNPs", "H0.PP", "H1.PP",
                             "H2.PP", "H3.PP", "H4.PP"]

    R('''sink(file=NULL)''')

    return coloc_results
def pandas_df_to_exprset(
        phenoData, featureData,
        dfs: pd.DataFrame or [pd.DataFrame]) -> pd.DataFrame or list:
    # TODO: !!!IMPORTANT!!! methods.new("ExpressionSet", exprs=base.as_matrix(df)) to make an exprset from df
    # object.r_repr() to print representation
    # phenoData = annotatedDF, featureData = AnnotatedDF, assayData = expression matrix (type matrix or AssayData)
    """A function to convert a pandas DataFrame to a Biobase ExpressionSet

    Takes a list of pandas DataFrames or a single pandas DataFrame and converts
    it to a Biobase ExpressionSet
    (source: https://bioconductor.org/packages/release/bioc/html/Biobase.html)

    References:
        https://rpy2.readthedocs.io/en/version_2.7.x/generated_rst/s4class.html
        https://rpy2.readthedocs.io/en/version_2.7.x/_static/notebooks/s4class.html
        https://rpy2.readthedocs.io/en/version_2.8.x/robjects_oop.html

    Args:
        phenoData: pd.DataFrame of sample annotations (becomes the phenoData AnnotatedDataFrame)
        featureData: pd.DataFrame of feature annotations (becomes the featureData AnnotatedDataFrame)
        dfs: pd.DataFrame(s) to convert to Biobase.ExpressionSet (single dataframe or a list)

    Returns:
        a Biobase.ExpressionSet object or a list of Biobase.ExpressionSet objects
    """
    # attributes can be accessed using .slots
    rpd.activate()
    data = importr('data.table')
    base = importr('base')
    ballgown = importr('ballgown')
    # dfs.dropna(inplace=True)
    if isinstance(dfs, list):
        exprset = list()
        for i in dfs:
            data = [i[col].values.tolist() for col in i]
            cols = [col for col in i]
            exprset.append(
                methods.new(
                    'ExpressionSet',
                    exprs=biobase.assayData(rpd.py2ri_pandasdataframe(i)),
                    phenoData=biobase.AnnotatedDataFrame(phenoData),
                    featureData=biobase.AnnotatedDataFrame(featureData)))
    else:
        # data = [dfs[col].values.tolist() for col in dfs]
        rphenoData = biobase.AnnotatedDataFrame(
            rpd.py2ri_pandasdataframe(phenoData))
        rfeatureData = biobase.AnnotatedDataFrame(
            rpd.py2ri_pandasdataframe(featureData))
        rdata = data.na_omit_data_table(rpd.py2ri_pandasdataframe(dfs))
        # used to be data.as_matrix
        exprs = base.as_matrix(rdata)
        # rdata = base.unname(rdata)
        # print(rdata)
        exprset = biobase.ExpressionSet()
        exprset.slots['assayData<-'] = exprs
        exprset.slots['phenoData'] = rphenoData
        exprset.slots['featureData'] = rfeatureData
        # print(ballgown.expr(exprset))
    # rpd.deactivate()
    return exprset
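# Hedged usage sketch (illustrative only): pandas_df_to_exprset assumes module-level
# imports such as biobase = importr('Biobase'), a `methods` handle exposing
# methods.new, and rpd = rpy2.robjects.pandas2ri. The toy expression matrix,
# phenotype table and feature table below are placeholders.
def _example_pandas_df_to_exprset():
    import numpy as np
    import pandas as pd
    exprs = pd.DataFrame(np.random.rand(10, 3),
                         index=['gene' + str(i) for i in range(10)],
                         columns=['s1', 's2', 's3'])
    pheno = pd.DataFrame({'condition': ['ctrl', 'ctrl', 'treated']},
                         index=['s1', 's2', 's3'])
    features = pd.DataFrame({'symbol': ['g' + str(i) for i in range(10)]},
                            index=exprs.index)
    return pandas_df_to_exprset(pheno, features, exprs)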
def deseqAnalysis(counts_table, design, reference, outfile):
    '''
    Perform differential expression analysis using DESeq2
    '''

    design_df = pd.read_table(design, sep="\t", header=0, index_col=0)
    counts_df = pd.read_table(counts_table, sep="\t", header=0,
                              index_col=0, compression="gzip")

    E.info("setting up counts table")
    py2ri.activate()
    r_design = py2ri.py2ri_pandasdataframe(design_df)
    r_counts = py2ri.py2ri_pandasdataframe(counts_df)
    R.assign("design", r_design)
    R.assign("counts", r_counts)

    R('''sink(file="/dev/null")''')
    E.info("loading required R packages")
    R('''suppressPackageStartupMessages(library(DESeq2))''')
    R('''suppressPackageStartupMessages(library(gplots))''')
    R('''suppressPackageStartupMessages(library(RColorBrewer))''')
    R('''suppressPackageStartupMessages(library(ggplot2))''')

    R('''notZero <- counts[rowMeans(counts) > 1,]''')
    R('''dds <- DESeqDataSetFromMatrix(countData=notZero,'''
      '''colData=design, design=~group)''')

    E.info("performing differential expression testing")
    R('''de <- DESeq(dds, fitType="parametric")''')
    R('''res <- results(de)''')

    E.info("generating MA plot")
    # generate MAplots
    R('''png("images.dir/%s-MAplot.png", height=480, width=480)''' % reference)
    R('''plotMA(res, alpha=0.05)''')
    R('''dev.off()''')

    E.info("performing variance stabilising transformation")
    R('''vst <- data.frame(getVarianceStabilizedData(de))''')

    E.info("clustering samples and plotting heatmap")
    R('''cors <- cor(vst)''')
    R('''hmcol <- colorRampPalette(brewer.pal(9, "PuOr"))(100)''')
    R('''png("images.dir/%s-sample_clustering-heatmap.png", height=480, '''
      '''width=480)''' % reference)
    R('''heatmap.2(as.matrix(cors), col=hmcol, trace="none", '''
      '''breaks=seq(0, 1, 0.01), margins=c(10,10), cexRow=0.8,'''
      '''cexCol=0.8)''')
    R('''dev.off()''')

    E.info("performing principal components analysis")
    R('''pca <- prcomp(data.frame(t(vst)), scale=T, center=T)''')
    R('''pcs <- data.frame(pca$x)''')
    R('''pcs$condition <- as.factor(design$group)''')
    R('''p_pca <- ggplot(pcs, aes(x=PC1, y=PC2, colour=condition)) + '''
      '''geom_point(size=6)''')
    R('''png("images.dir/%s-PCA_pc1-pc2.png", height=480, '''
      '''width=480)''' % reference)
    R('''print(p_pca)''')
    R('''dev.off()''')

    E.info("writing table of results")
    R('''res.df <- data.frame(res)''')
    R('''sink(file=NULL)''')

    out_df = com.load_data("res.df")
    out_df.to_csv(outfile, sep="\t", index_label="gene_id")
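# Hedged usage sketch (illustrative only): deseqAnalysis assumes module-level imports
# (pandas as pd, rpy2's R and py2ri, the E logger, and a `com` module providing
# load_data, e.g. the old pandas.rpy.common), a gzipped tab-separated counts table,
# a tab-separated design file with a `group` column, and an existing images.dir/
# output directory. The file paths below are placeholders.
# deseqAnalysis(counts_table="counts.tsv.gz",
#               design="design.tsv",
#               reference="ctrl_vs_treated",
#               outfile="ctrl_vs_treated_deseq2_results.tsv")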