def fit(self, indep_vars, dep_var):
    ro.globalenv['train'] = pandas2ri.py2ri(indep_vars)
    ro.globalenv[dep_var.name] = pandas2ri.py2ri(dep_var)
    # Build the parameters string
    param = utils.build_R_parameters(self.param)
    # In order to support neural networks from different packages it
    # was necessary to wrap their respective methods for the "fit" concept
    if self.param.get("algorithm") == "rprop+":
        formula = dep_var.name + "~" + "+".join(indep_vars.columns.tolist())
        ro.r("formula <- as.formula(%s)" % formula)
        return self.RReturn(
            ro.r("neuralnet(formula, data=train, %s)" % param),
            self.param.get("algorithm")
        )
    elif self.param.get("method") == "ADAPTgdwm":
        ro.r("fit <- newff(%s)" % param)
        return self.RReturn(
            ro.r("fit <- train(fit, train, %s, error.criterium='LMS', "
                 "report=TRUE, show.step=1000, n.shows=100)" % dep_var.name),
            self.param.get("method")
        )
def predict(self, newX):
    if self.model is None:
        print('model must first be fitted')
        return None
    if not isinstance(newX, pandas.DataFrame):
        newX = pandas.DataFrame(newX,
                                columns=['V%d' % i for i in range(newX.shape[1])])
    if self.modeltype == 'poisson':
        robjects.globalenv['newX'] = pandas2ri.py2ri(newX)
        robjects.r('newX = data.matrix(newX)')
        if self.lambda_preset is not None:  # heuristic for whether we are using zipath()
            robjects.r('pred = predict(fit, newX)')
            pred = robjects.r('pred').squeeze()
        else:
            pred = mpath.predict_glmreg(self.model[self.model.names.index('fit')],
                                        base.as_symbol('newX'),
                                        which=self.lambda_which)
    elif self.modeltype == 'ZINB' or self.modeltype == 'ZIpoisson':
        robjects.globalenv['newX'] = pandas2ri.py2ri(newX)
        #robjects.r('newX = data.matrix(newX)')
        if self.lambda_preset is not None:  # heuristic for whether we are using zipath()
            robjects.r('pred = predict(fit, newX)')
        else:
            robjects.r('pred = predict(fit$fit, newX, which=fit$lambda.which)')
        pred = robjects.r('pred').squeeze()
    return numpy.array(pred)
def plotPairwiseCorrelations(self, outfile, subset=False):
    '''use the R base pairs function to plot all pairwise correlations
    between the samples. subset will randomly subset n rows to speed
    up plotting'''

    plotGGpairs = R('''
    function(df){
      write.table(df, file="%(outfile)s.tsv", sep="\t")
      colnames(df) <- gsub("-", "_", colnames(df))
      width <- height <- length(colnames(df)) * 100
      png("%(outfile)s", width=width, height=height, units="px")
      panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
        usr <- par("usr"); on.exit(par(usr))
        par(usr = c(0, 1, 0, 1))
        r <- abs(cor(x, y))
        txt <- format(c(r, 0.123456789), digits = digits)[1]
        txt <- paste0(prefix, txt)
        if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
        text(0.5, 0.5, txt, cex = cex.cor * r * 50)}
      panel.hist = function(x, ...){
        par(new = TRUE)
        hist(x, breaks=30, col = "light blue", probability = TRUE,
             axes = FALSE, main = "")
        rug(x)}
      pairs(df, pch=20, cex=0.1,
            lower.panel = panel.smooth,
            upper.panel = panel.cor,
            diag.panel = panel.hist)
      dev.off()
    }''' % locals())

    if subset and len(self.table.index) > subset:
        rows = random.sample(list(self.table.index), subset)
        # .loc replaces the deprecated .ix indexer
        r_counts = pandas2ri.py2ri(self.table.loc[rows])
    else:
        r_counts = pandas2ri.py2ri(self.table)

    plotGGpairs(r_counts)
def SCCA_r(X, Y, n_components, pen):
    df_X = pd.DataFrame(X)
    df_Y = pd.DataFrame(Y)

    rmat_X = pandas2ri.py2ri(df_X)
    rmat_Y = pandas2ri.py2ri(df_Y)
    ri.globalenv['X'] = rmat_X
    ri.globalenv['Y'] = rmat_Y

    # pass the converted R matrices (the raw numpy arrays were previously
    # passed, leaving the conversions above unused)
    out = PMA.CCA(x=rmat_X, z=rmat_Y, K=n_components, niter=100,
                  standardize=False, penaltyx=pen[0], penaltyz=pen[1])

    df_u = pandas2ri.ri2py(out[1])
    df_v = pandas2ri.ri2py(out[2])
    cors = pandas2ri.ri2py(out[15])

    loadings = (np.asmatrix(df_u), np.asmatrix(df_v))
    return loadings, cors
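# A minimal usage sketch for SCCA_r (hedged): the random matrices, seed and
# penalty values below are illustrative assumptions, and PMA is assumed to be
# available as an importr("PMA") handle, as the function above implies.
import numpy as np

def _demo_scca():
    rng = np.random.RandomState(42)
    X = rng.normal(size=(50, 10))
    Y = rng.normal(size=(50, 8))
    loadings, cors = SCCA_r(X, Y, n_components=2, pen=(0.5, 0.5))
    print(loadings[0].shape, loadings[1].shape)  # (10, 2) and (8, 2)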
def RDCC(data):
    cols = data.columns
    data = pandas2ri.py2ri(data)
    rpy2.robjects.globalenv["data"] = data
    armaspec = (1, 1, 1, 1, 1, 1)
    rscript = """
    suppressMessages(library(rugarch))
    suppressMessages(library(rmgarch))
    ###data <- matrix(rnorm(2200),200,11)
    spec <- ugarchspec(variance.model = list(model = "sGARCH",
        garchOrder = c(%s, %s), submodel = NULL,
        external.regressors = NULL, variance.targeting = FALSE),
        mean.model = list(armaOrder = c(%s, %s),
        external.regressors = NULL, distribution.model = "norm",
        start.pars = list(), fixed.pars = list()))
    dccspec <- dccspec(uspec=multispec(replicate(11, spec)),
                       dccOrder = c(%s, %s), distribution="mvnorm")
    dccgarch <- dccfit(dccspec, data = data)
    dccsimdata <- dccsim(dccgarch, n.sim=1000)
    dccgarch
    fitted(dccsimdata)
    """ % (
        armaspec[0], armaspec[1], armaspec[2],
        armaspec[3], armaspec[4], armaspec[5],
    )
    print rscript
    b = r(rscript)
    b = pd.DataFrame(b)
    b.plot()
    # plt.show()
    return b
def computeMnnBatchCorrection(counts):
    """Computes batch correction for a list of batches (data frames)
    where each data frame represents a batch (an animal, for instance).
    The batch correction is computed using Scran::mnnCorrect()
    from Marioni et al.
    :param counts: a list of matrices of counts
    :return returns a list of batch corrected matrices of counts
    """
    pandas2ri.activate()
    as_matrix = r["as.matrix"]
    meta = [(x.index, x.columns) for x in counts]
    r_counts = [as_matrix(pandas2ri.py2ri(x)) for x in counts]
    RimportLibrary("scran")
    r_call = """
        function(counts) {
          norm_counts = do.call(mnnCorrect, c(counts, cos.norm.out=FALSE));
          return(lapply(norm_counts$corrected, as.data.frame))
        }
    """
    r_func = r(r_call)
    norm_counts = list()
    for i, x in enumerate(r_func(r_counts)):
        norm_c = pandas2ri.ri2py(x)
        norm_c.index = meta[i][0]
        norm_c.columns = meta[i][1]
        norm_counts.append(norm_c)
    pandas2ri.deactivate()
    return norm_counts
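# A hedged usage sketch for computeMnnBatchCorrection: two toy batches sharing
# a gene index. The dimensions and RNG seed are made-up assumptions, and scran
# must be installed in the R library path for the call to succeed.
import numpy as np
import pandas as pd

def _demo_mnn():
    rng = np.random.RandomState(0)
    genes = ["gene_%d" % i for i in range(100)]
    batch1 = pd.DataFrame(rng.poisson(4, size=(100, 30)), index=genes)
    batch2 = pd.DataFrame(rng.poisson(6, size=(100, 25)), index=genes)
    corrected = computeMnnBatchCorrection([batch1, batch2])
    print([c.shape for c in corrected])  # one corrected frame per batch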
def computeSumFactors(counts, scran_clusters=True):
    """ Compute normalization factors using the deconvolution method
    described in Marioni et al.
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors as a vector
    """
    n_cells = len(counts.columns)
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    if scran_clusters and n_cells >= 50:
        r_clusters = scran.quickCluster(as_matrix(r_counts),
                                        min(n_cells/10, 10), method="igraph")
        min_cluster_size = min(Counter(r_clusters).values())
        sizes = list(range(min(int(min_cluster_size/4), 10),
                           min(int(min_cluster_size/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts),
                                      clusters=r_clusters, sizes=sizes)
    else:
        sizes = list(range(min(int(n_cells/4), 10),
                           min(int(n_cells/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts), sizes=sizes)
    pandas_sf = pandas2ri.ri2py(dds)
    pandas2ri.deactivate()
    return pandas_sf
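# A minimal usage sketch (hedged): builds a small random gene-by-cell count
# matrix and computes deconvolution size factors with the function above. The
# toy dimensions and seed are illustrative assumptions, not part of the
# original code; scran must be installed on the R side.
import numpy as np
import pandas as pd

def _demo_sum_factors():
    rng = np.random.RandomState(0)
    toy_counts = pd.DataFrame(rng.poisson(5, size=(200, 60)),
                              index=["gene_%d" % i for i in range(200)],
                              columns=["cell_%d" % j for j in range(60)])
    size_factors = computeSumFactors(toy_counts, scran_clusters=False)
    print(len(size_factors))  # one factor per cell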
def RCopula(data, sim):
    cols = data.columns
    data2 = pandas2ri.py2ri(data)
    data = np.array(data)
    rpy2.robjects.globalenv["data"] = data2
    # N is the number of simulated observations (previously an undefined
    # name; the function argument is called sim)
    rpy2.robjects.globalenv["N"] = sim
    rscript = """
    suppressMessages(library(copula))
    nAssets <- ncol(data)
    u <- pobs(data, N)
    clayton.cop <- claytonCopula(2, dim=nAssets)
    a <- fitCopula(clayton.cop, u, method="mpl")
    y <- rCopula(copula=claytonCopula(a@estimate, nAssets), n=N)
    y"""
    print rscript
    b = r(rscript)
    # map the uniform copula samples back onto each column's marginal
    # distribution via the normal quantile function
    for j in range(b.shape[1]):
        mean = np.mean(data[:, j])
        std = np.std(data[:, j])
        for i in range(b.shape[0]):
            b[i, j] = scipy.stats.norm.ppf(b[i, j], loc=mean, scale=std)
    pd.DataFrame(b).iloc[:, 1].plot()
    plt.show()
    b = pd.DataFrame(b)
    return b
def heatmap(self, plotfile):
    ''' plots a heatmap '''
    # to do: add option to parse design file and add coloured row for
    # variable specified in design file.
    plotHeatmap = R('''
    function(df){
      library("Biobase")
      library("RColorBrewer")
      library("gplots")
      hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
      png("%(plotfile)s", width=1000, height=1000, units="px")
      write.table(df, file="%(plotfile)s.tsv", sep="\t")
      heatmap.2(as.matrix(df),
                col = hmcol,
                scale="none",
                trace="none",
                margin=c(18, 10),
                dendrogram="column",
                cexCol=2,
                labRow = "",
                hclustfun = function(x) hclust(x, method = 'average'),
                distfun = function(x) as.dist(1 - cor(t(x), method="spearman")))
      dev.off()
    }''' % locals())

    r_counts = pandas2ri.py2ri(self.table)
    plotHeatmap(r_counts)
def RCopulaGarch(data, sim):
    data = pandas2ri.py2ri(data)
    rpy2.robjects.globalenv["data"] = data
    rpy2.robjects.globalenv["simulations"] = sim
    rscript = """
    suppressMessages(library(rugarch))
    suppressMessages(library(rmgarch))
    ###data <- matrix(rnorm(2200),200,11)  # test data; keep commented out so the
    ###simulations <- 5000                 # passed-in data and simulation count are used
    nassets <- ncol(data)
    nperiods <- 390
    spec <- ugarchspec(variance.model = list(model = "sGARCH",
        garchOrder = c(1, 1), submodel = NULL, external.regressors = NULL,
        variance.targeting = FALSE),
        mean.model = list(armaOrder = c(1, 1), external.regressors = NULL,
        distribution.model = "norm", start.pars = list(), fixed.pars = list()))
    dccspec <- dccspec(uspec=multispec(replicate(ncol(data), spec)),
                       dccOrder = c(1, 1), distribution="mvnorm")
    mspec <- multispec(replicate(ncol(data), spec))
    cspec <- cgarchspec(mspec, VAR = FALSE, robust = FALSE, lag = 1,
        lag.max = NULL, lag.criterion = "AIC", external.regressors = NULL,
        robust.control = list(gamma = 0.25, delta = 0.01, nc = 10, ns = 500),
        dccOrder = c(1, 1), asymmetric = FALSE,
        distribution.model = list(copula = "mvt", method = "Kendall",
        time.varying = FALSE, transformation = "parametric"),
        start.pars = list(), fixed.pars = list())
    copgarch <- cgarchfit(cspec, data,
        spd.control = list(lower = 0.1, upper = 0.9, type = "pwm",
                           kernel = "epanech"),
        fit.control = list(eval.se = TRUE, stationarity = TRUE, scale = FALSE),
        solver = "solnp", solver.control = list(), out.sample = 0,
        cluster = NULL, fit = NULL, VAR.fit = NULL, realizedVol = NULL)
    simfit <- cgarchsim(copgarch, n.sim = nperiods, n.start = 0,
        m.sim = simulations, startMethod = "sample", presigma = NULL,
        preresiduals = NULL, prereturns = NULL, preR = NULL, preQ = NULL,
        preZ = NULL, rseed = NULL, mexsimdata = NULL, vexsimdata = NULL,
        cluster = NULL, only.density = FALSE, prerealized = NULL)
    simdata <- fitted(simfit)
    t <- array(rep(nperiods*nassets*simulations), c(nperiods, nassets, simulations))
    for (i in 1:simulations) {
      t[,,i] <- fitted(simfit, i)
    }
    t
    """
    print rscript
    b = r(rscript)
    return b
def mca(distance_matrix, dim=2):
    """ calculate MCA matrix using R's FactoMineR """
    # build up haplotype dataframe
    from fatools.lib.utils import acquire_R, release_R
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri

    acquire_R()
    r_df = pandas2ri.py2ri(distance_matrix.H)
    robjects.globalenv['haplo_data'] = r_df
    marker_len = len(distance_matrix.H.columns)
    arguments = ','.join('as.factor(haplo_data[,%d])' % x
                         for x in range(1, marker_len + 1))
    robjects.r('haplo_df <- data.frame(%s)' % arguments)
    robjects.r('library(FactoMineR)')
    mca_res = robjects.r('MCA(haplo_df, graph=FALSE)')

    # get the individual coordinates
    coord = pandas2ri.ri2py(mca_res.rx('ind')[0].rx('coord')[0])
    release_R()

    return (coord, None)
def extract_typ_real_curve(df, discarded_seasons=None, wdw_method=2,
                           lower_bound=5.0):
    seasons = sorted(list(df.columns.drop(['UF', 'epiweek'])))[:-1]
    # guard against the default None so the set difference does not raise
    seasons = sorted(set(seasons).difference(discarded_seasons or []))
    rdf = pandas2ri.py2ri(df)
    rseasons = ro.StrVector(seasons)
    ro.globalenv['df'] = rdf
    ro.globalenv['seasons'] = rseasons
    ro.globalenv['par.method'] = wdw_method
    ro.globalenv['par.type.curve'] = 2
    ro.globalenv['par.level.curve'] = 0.95

    epimemrslt = ro.r('t(apply(subset(df, select=seasons), 1, memci, '
                      'i.type.curve=par.type.curve, '
                      'i.level.curve=par.level.curve))')

    # Pre-epidemic threshold:
    typrealcurve = pd.DataFrame(epimemrslt)

    # Store results in a python dictionary of objects
    pyepimemrslt = {}

    # typ.real.curve is the typical curve without time shift, i.e. it respects
    # the original weeks from the data. This curve is better kept over all
    # seasons, not only the epidemic ones.
    pyepimemrslt['typ.real.curve'] = typrealcurve.copy()
    pyepimemrslt['typ.real.curve'].rename(
        columns={0: 'baixo', 1: 'mediano', 2: 'alto'}, inplace=True)
    pyepimemrslt['typ.real.curve']['mediano'].fillna(0, inplace=True)
    pyepimemrslt['typ.real.curve'].loc[
        pyepimemrslt['typ.real.curve']['baixo'] < 0, 'baixo'] = 0
    # ~ replaces the deprecated unary minus on boolean Series
    pyepimemrslt['typ.real.curve']['baixo'] = \
        pyepimemrslt['typ.real.curve']['baixo'].where(
            (~pyepimemrslt['typ.real.curve']['baixo'].isnull()),
            other=pyepimemrslt['typ.real.curve']['mediano'])
    pyepimemrslt['typ.real.curve']['alto'] = \
        pyepimemrslt['typ.real.curve']['alto'].where(
            (~pyepimemrslt['typ.real.curve']['alto'].isnull()),
            other=pyepimemrslt['typ.real.curve']['mediano'])

    return pyepimemrslt
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Makes a call to DESeq2 with SCRAN to perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Returns a list of DESeq2 results for each comparison
    """
    results = list()
    n_cells = len(counts.columns)
    try:
        pandas2ri.activate()
        deseq2 = RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count() - 1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
              sce = SingleCellExperiment(assays=list(counts=r_counts))
              return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list(set([round((min_cluster_size/2) / i) for i in [5, 4, 3, 2, 1]]))
            sce = scran.computeSumFactors(sce, clusters=r_clusters,
                                          sizes=sizes, positive=True)
        else:
            sizes = list(set([round((n_cells/2) * i) for i in [0.1, 0.2, 0.3, 0.4, 0.5]]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
              colData(dds)$conditions = as.factor(conditions)
              design(dds) = formula(~ conditions)
              return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to
            # pandas so we do it manually
            result.index = genes
            results.append(result)
        pandas2ri.deactivate()
    except Exception as e:
        raise e
    return results
def logCountsWithFactors(counts, size_factors):
    """ Uses the R package scater to log a matrix of counts (genes as rows)
    and a vector of size factors using the method normalize().
    :param counts: a matrix of counts (genes as rows)
    :param size_factors: a vector of size factors
    :return the normalized log counts (genes as rows)
    """
    columns = counts.columns
    indexes = counts.index
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    # scater provides normalize()/logcounts() for SingleCellExperiment
    # objects, matching the docstring (this previously loaded "scran")
    scater = RimportLibrary("scater")
    r_call = """
        function(counts, size_factors){
          sce = SingleCellExperiment(assays=list(counts=as.matrix(counts)))
          sizeFactors(sce) = size_factors
          sce = normalize(sce)
          norm_counts = logcounts(sce)
          return(as.data.frame(norm_counts))
        }
    """
    r_func = r(r_call)
    r_norm_counts = r_func(r_counts, size_factors)
    pandas_norm_counts = pandas2ri.ri2py(r_norm_counts)
    pandas_norm_counts.index = indexes
    pandas_norm_counts.columns = columns
    pandas2ri.deactivate()
    return pandas_norm_counts
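# A hedged sketch chaining the two scran/scater helpers above: size factors
# from computeSumFactors feed logCountsWithFactors. The toy matrix is an
# assumption; scran, scater and SingleCellExperiment must be installed in R.
import numpy as np
import pandas as pd

def _demo_log_counts():
    rng = np.random.RandomState(1)
    counts = pd.DataFrame(rng.poisson(5, size=(150, 40)),
                          index=["gene_%d" % i for i in range(150)],
                          columns=["cell_%d" % j for j in range(40)])
    size_factors = computeSumFactors(counts, scran_clusters=False)
    log_counts = logCountsWithFactors(counts, size_factors)
    print(log_counts.shape)  # same genes-by-cells shape as the input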
def predict(self, xtest):
    """Predicts class via majority vote.

    Parameters
    ----------
    xtest : pd.DataFrame
        features for test set
    """
    if new_pandas_flag:
        r_xtest = pandas2ri.py2ri(xtest)
    else:
        r_xtest = com.convert_to_r_dataframe(xtest)
    pred = self.rf_pred(self.rf, r_xtest)
    if new_pandas_flag:
        tmp_genes = pred[1]
        tmp_pred_class = pred[0]
        genes = pandas2ri.ri2py(tmp_genes)
        pred_class = pandas2ri.ri2py(tmp_pred_class)
    else:
        py_pred = com.convert_robj(pred)
        genes, pred_class = zip(*py_pred.items())
    tmp_df = pd.DataFrame({'pred_class': pred_class}, index=genes)
    tmp_df = tmp_df.reindex(xtest.index)
    tmp_df -= 1  # class numbers are 1-based on the R side
    return tmp_df['pred_class']
def set_cv_fold(self, df):
    """Send which genes are valid test sets for each CV fold."""
    if new_pandas_flag:
        r_df = pandas2ri.py2ri(df)
    else:
        r_df = com.convert_to_r_dataframe(df)
    ro.globalenv['cvFoldDf'] = r_df
def conditionDESeq2(data_frame, header, alpha, res_dir):
    '''
    Perform DESeq2-based analysis of condition:time interaction
    dependent differential expression
    '''

    E.info("Differential expression testing for %s" % header)
    cols = data_frame.columns
    # py2ri requires activation
    pandas2ri.activate()
    counts = pandas2ri.py2ri(data_frame)
    des_times = ro.IntVector([x.split(".")[1] for x in cols])
    des_reps = ro.StrVector([x.split(".")[2] for x in cols])
    des_cond = ro.StrVector([x.split(".")[0] for x in cols])
    genes = ro.StrVector([x for x in data_frame.index])

    # setup counts table and design frame
    R('''suppressPackageStartupMessages(library("DESeq2"))''')
    R('''sink(file="/dev/null")''')
    R('''times <- as.factor(%s)''' % des_times.r_repr())
    R('''reps <- c(%s)''' % des_reps.r_repr())
    R('''condition <- c(%s)''' % des_cond.r_repr())
    R('''design <- data.frame(times, reps, condition)''')
    R('''counts <- data.frame(%s)''' % counts.r_repr())
    R('''genes <- c(%s)''' % genes.r_repr())
    R('''rownames(counts) <- genes''')
    R('''rownames(design) <- colnames(counts)''')

    # use DESeq() with LRT and reduced formula. Use effect
    # size moderation
    R('''dds <- DESeqDataSetFromMatrix(countData=counts, '''
      '''colData=design, '''
      '''design=~reps + times + condition + times:condition)''')
    R('''dds <- DESeq(dds, test="LRT", '''
      '''reduced=~reps + times + condition, betaPrior=T)''')
    R('''res <- results(dds)[order(results(dds)$padj, na.last=T), ]''')
    R('''res.df <- data.frame(res)''')

    # generate dispersion and MA plots
    R('''png("%s/%s-dispersions.png")''' % (res_dir, header))
    R('''plotDispEsts(dds)''')
    R('''dev.off()''')
    R('''png("%s/%s-MAplot.png")''' % (res_dir, header))
    R('''plotMA(res, alpha=%0.3f, ylim=c(-5,5))''' % alpha)
    R('''dev.off()''')
    R('''sink(file=NULL)''')

    df = pandas2ri.ri2py(R['res.df'])
    return df
def treeCutting(infile,
                expression_file,
                cluster_file,
                cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])

    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(df)

    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')
    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())

    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
def predict(self, indep_vars):
    ro.globalenv['test'] = pandas2ri.py2ri(indep_vars)
    ro.globalenv['fit'] = self.fitted_model
    if self.algorithm == "rprop+":
        return pandas2ri.ri2py(
            ro.r("compute(fit,test)$net.result")
        )
    elif self.algorithm == "ADAPTgdwm":
        return pandas2ri.ri2py(
            ro.r("sim(fit$net, test)")
        )
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters from the data
    using Scran::quickCluster"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    clusters = scran.quickCluster(as_matrix(r_counts), min_size,
                                  method="igraph")
    n_clust = len(set(clusters))
    pandas2ri.deactivate()
    return n_clust
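# Hedged usage sketch for computeNClusters. The toy matrix has spots/cells as
# rows, since the function transposes before calling quickCluster; the
# dimensions and seed are assumptions for illustration only.
import numpy as np
import pandas as pd

def _demo_n_clusters():
    rng = np.random.RandomState(2)
    counts = pd.DataFrame(rng.poisson(3, size=(120, 80)))  # spots x genes
    print(computeNClusters(counts, min_size=20))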
def fit(self, relationship, df):
    """
    relationship: string of the form: a~b+c
    df: Pandas Dataframe
    """
    # Get R dataframe
    r_df = pandas2ri.py2ri(df)
    # Create linear fit (use the converted R data frame; the pandas
    # frame was previously passed, leaving r_df unused)
    fit = stats.lm(relationship, data=r_df)
    # note: assigning self.fit shadows the bound fit() method on the instance
    self.fit = fit
    python_fit = self.convert_fit_to_python(fit)
    return python_fit
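# A hedged sketch of calling the lm wrapper above. The instance, column names
# and data are illustrative assumptions; stats is assumed to be an
# importr("stats") handle, as stats.lm implies.
import pandas as pd

def _demo_lm(model):
    df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0],
                       'b': [2.1, 3.9, 6.2, 8.1],
                       'c': [0.5, 0.4, 0.6, 0.5]})
    # R's lm() accepts a character formula like the docstring's "a~b+c"
    return model.fit('a ~ b + c', df)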
def draw_survival_curves_mpl(fit, ax=None, title=None, colors=None, ms=80,
                             alpha=1):
    """
    Takes an R survfit.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(4, 3))
    s = base.summary(fit)
    tab = pd.DataFrame({v: s.rx2(v) for v in s.names
                        if len(s.rx2(v)) == len(s.rx2('time'))},
                       index=s.rx2('time'))
    call = pandas2ri.py2ri(fit.rx2('call')[2])
    groups = robjects.r.sort(robjects.r.c(*call.feature.unique()))
    if 'strata' not in tab:
        groups = [0]
        tab['strata'] = 1
    elif len(tab.strata.unique()) != len(groups):
        gg = list(call[call.event > 0].feature.unique())
        gg = [g for g in groups if g in gg]
        bg = [g for g in groups if g not in gg]
        groups = gg + bg
    for i, group in enumerate(groups):
        censoring = call[(call.event == 0) & (call.feature == group)].days
        surv = tab[tab.strata == (i + 1)].surv
        surv = surv.copy().set_value(0., 1.)
        surv = surv.sort_index()
        if surv.index[-1] < censoring.max():
            surv = surv.set_value(censoring.max(), surv.iget(-1)).sort_index()
        censoring_pos = get_markers(censoring, surv)
        ax.step(surv.index, surv, lw=3, where='post', alpha=alpha, label=group)
        if colors is not None:
            try:
                # fix for R-Python str-to-int conversion
                color = colors[group]
            except:
                color = colors[i]
            ax.lines[-1].set_color(color)
        if len(censoring_pos) > 0:
            ax.scatter(*zip(*censoring_pos), marker='|', s=ms,
                       color=ax.lines[-1].get_color())
    ax.set_ylim(0, 1.05)
    # ax.set_xlim(0, max(surv.index)*1.05)
    ax.set_xlim(0, max(call.days) * 1.05)
    ax.legend(loc='best')
    ax.set_ylabel('Survival')
    ax.set_xlabel('Years')
    if title:
        ax.set_title(title)
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters from the data
    using Scran::quickCluster"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    multicore = RimportLibrary("BiocParallel")
    multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count() - 1))
    as_matrix = r["as.matrix"]
    clusters = scran.quickCluster(as_matrix(r_counts), min_size)
    n_clust = len(set(clusters))
    pandas2ri.deactivate()
    return n_clust
def transform(self, method="vst", inplace=True): ''' perform transformation on counts table current methods are: - deseq2 variance stabalising transformation - deseq rlog transformation ''' assert method in ["vst", "rlog"], ("method must be one of" "[vst, rlog]") method2function = {"vst": "varianceStabilizingTransformation", "rlog": "rlog"} t_function = method2function[method] transform = R(''' function(df){ suppressMessages(library('DESeq2')) design = data.frame(row.names = colnames(df), condition = seq(1, length(colnames(df)))) dds <- suppressMessages(DESeqDataSetFromMatrix( countData= df, colData = design, design = ~condition)) transformed <- suppressMessages(%(t_function)s(dds)) transformed_df <- as.data.frame(assay(transformed)) return(transformed_df) }''' % locals()) r_counts = pandas2ri.py2ri(self.table) df = pandas2ri.ri2py(transform(r_counts)) # losing rownames for some reason during the conversion?! df.index = self.table.index if inplace: self.table = df # R replaces "-" in column names with ".". Revert back! self.table.columns = [x.replace(".", "-") for x in self.table.columns] else: tmp_counts = self.clone() tmp_counts.table = df tmp_counts.table.columns = [x.replace(".", "-") for x in tmp_counts.table.columns] return tmp_counts
def computeRLEFactors(counts):
    """ Compute normalization size factors using the RLE method
    described in EdgeR and returns them as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors as a vector
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    edger = RimportLibrary("edgeR")
    as_matrix = r["as.matrix"]
    dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE")
    pandas_sf = pandas2ri.ri2py(dds)
    pandas_cm = pandas2ri.ri2py(r.colSums(counts))
    pandas2ri.deactivate()
    return pandas_sf * pandas_cm
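# Hedged usage sketch for computeRLEFactors; the toy counts are an assumption
# and edgeR must be installed. Per the function above, the result is the edgeR
# normalization factor scaled by each column's library size.
import numpy as np
import pandas as pd

def _demo_rle():
    rng = np.random.RandomState(3)
    counts = pd.DataFrame(rng.poisson(10, size=(200, 12)))
    factors = computeRLEFactors(counts)
    print(len(factors))  # one factor per column (sample)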
def computeSizeFactors(counts):
    """ Computes size factors using DESeq
    for the counts matrix given as input (genes as rows
    and spots as columns).
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors as a vector
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    deseq2 = RimportLibrary("DESeq2")
    dds = deseq2.estimateSizeFactorsForMatrix(r_counts)
    pandas_sf = pandas2ri.ri2py(dds)
    pandas2ri.deactivate()
    return pandas_sf
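# Hedged sketch for computeSizeFactors (DESeq median-of-ratios); the toy
# matrix is an assumption and DESeq2 must be installed on the R side.
import numpy as np
import pandas as pd

def _demo_size_factors():
    rng = np.random.RandomState(4)
    counts = pd.DataFrame(rng.poisson(8, size=(300, 10)))
    print(computeSizeFactors(counts))  # one size factor per column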
def plotDendogram(self, plot_filename=None,
                  distance_method="euclidean",
                  clustering_method="ward.D2"):

    r_counts = pandas2ri.py2ri(self.table)

    makeDendogram = R('''
    function(counts){
      png("%(plot_filename)s")
      par(mar = c(1,4,1,1))
      plot(hclust(dist(t(counts), method = "%(distance_method)s"),
                  method = "%(clustering_method)s"), main="")
      dev.off()
    }''' % locals())

    makeDendogram(r_counts)
def fit(self, xtrain, ytrain):
    """The fit method trains R's random forest classifier.

    NOTE: the method name ("fit") and method signature were chosen
    to be consistent with scikit-learn's fit method.

    Parameters
    ----------
    xtrain : pd.DataFrame
        features for training set
    ytrain : pd.DataFrame
        true class labels (as integers) for training set
    """
    label_counts = ytrain.value_counts()
    if self.is_onco_pred and self.is_tsg_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.onco_num],
                    label_counts[self.tsg_num]]
    elif self.is_onco_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.onco_num]]
    elif self.is_tsg_pred:
        sampsize = [label_counts[self.other_num],
                    label_counts[self.tsg_num]]
    self.set_sample_size(sampsize)

    ytrain.index = xtrain.index  # ensure indexes match
    xtrain['true_class'] = ytrain

    # convert
    if new_pandas_flag:
        r_xtrain = pandas2ri.py2ri(xtrain)
    else:
        r_xtrain = com.convert_to_r_dataframe(xtrain)
    #ro.globalenv['trainData'] = r_xtrain
    self.rf = self.rf_fit(r_xtrain, self.ntrees, self.sample_size)
    r_imp = self.rf_imp(self.rf)  # importance dataframe in R
    if new_pandas_flag:
        self.feature_importances_ = pandas2ri.ri2py(r_imp)
    else:
        self.feature_importances_ = com.convert_robj(r_imp)
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Makes a call to DESeq2 to perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Can be given size factors.
    Returns a list of DESeq2 results for each comparison
    """
    results = list()
    try:
        pandas2ri.activate()
        deseq2 = RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count() - 1))
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts, colData=cond,
                                       design=design)
        if size_factors is None:
            dds = r.DESeq(dds, parallel=True, useT=True,
                          minmu=1e-6, minReplicatesForReplace=np.inf)
        else:
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds, value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B),
                               alpha=alpha, parallel=True)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to
            # pandas so we do it manually
            result.index = genes
            results.append(result)
        pandas2ri.deactivate()
    except Exception as e:
        raise e
    return results
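# Hedged usage sketch for deaDESeq2: a two-condition toy experiment with a
# single A-vs-B contrast. All names and sizes are assumptions; DESeq2 and
# BiocParallel must be installed on the R side.
import numpy as np
import pandas as pd

def _demo_dea():
    rng = np.random.RandomState(5)
    counts = pd.DataFrame(rng.poisson(6, size=(100, 8)),
                          index=["gene_%d" % i for i in range(100)])
    conds = ["A"] * 4 + ["B"] * 4
    results = deaDESeq2(counts, conds, comparisons=[("A", "B")], alpha=0.05)
    print(results[0].head())  # one result frame per comparison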
def Rtsne(counts, dimensions, theta=0.5, dims=50, perplexity=30, max_iter=1000):
    """Performs dimensionality reduction using the R package Rtsne"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    tsne = RimportLibrary("Rtsne")
    multicore = RimportLibrary("BiocParallel")
    multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count() - 1))
    as_matrix = r["as.matrix"]
    # use the converted R data frame (the pandas frame was previously
    # passed, leaving r_counts unused)
    tsne_out = tsne.Rtsne(as_matrix(r_counts), dims=dimensions, theta=theta,
                          check_duplicates=False, pca=True, initial_dims=dims,
                          perplexity=perplexity, max_iter=max_iter,
                          verbose=False)
    pandas_tsne_out = pandas2ri.ri2py(tsne_out.rx2('Y'))
    pandas2ri.deactivate()
    return pandas_tsne_out
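# Hedged usage sketch for the Rtsne wrapper; data shape, perplexity and seed
# are illustrative assumptions (Rtsne and BiocParallel must be installed).
import numpy as np
import pandas as pd

def _demo_tsne():
    rng = np.random.RandomState(6)
    counts = pd.DataFrame(rng.normal(size=(200, 50)))
    embedding = Rtsne(counts, dimensions=2, perplexity=20)
    print(embedding.shape)  # (200, 2)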
# Extract required arguments.
data = pd.read_table(snakemake.input.data, index_col=0)  # Input gene-by-sample raw count data.
condition = pd.read_table(snakemake.input.condition, index_col=0,
                          names=['condition'])  # Input condition file indicating to which condition each sample belongs.
logger.info('%d(genes) x %d(samples) data matrix and %d sample conditions are given.'
            % (data.shape[0], data.shape[1], len(condition.index)))
logger.debug('Headers: %s...' % ' '.join(data.columns[:3]))
logger.debug('Gene identifiers: %s...' % ' '.join(data.index[:3]))

intersecting_samples = [sample for sample in data.columns if sample in condition.index]
data = data[intersecting_samples]
condition = list(condition.loc[intersecting_samples].condition.values)
logger.info('%d samples will be used for DEG discovery.' % len(intersecting_samples))

r_data_matrix = r['data.matrix'](pandas2ri.py2ri(data))
r_samples = r.colnames(r_data_matrix)
r_conditions = ro.FactorVector(condition)

logger.debug('Computing size factors.')
r_size_factors = ebseq.MedianNorm(r_data_matrix)

logger.info('Discovering DEGs.')
logger.info('Running EBTest.')
num_iteration = 0
while True:
    # Increase the iteration number if the convergence conditions are not met.
    # Hopefully, most of the time 10 iterations will be enough for convergence.
    num_iteration += 10
    r_eb_out = ebseq.EBTest(Data=r_data_matrix, Conditions=r_conditions,
                            sizeFactors=r_size_factors, maxround=num_iteration)
# %%
# %load_ext rpy2.ipython

# %%
# Convert data to R format
from rpy2.robjects import pandas2ri

R_data = no_repeat_data >> select(
    X["Subject ID"],
    X["Condition"],
    X["Stimulus Type"],
    X["Switch Rate"],
    X["Rating"],
)
R_data = pandas2ri.py2ri(R_data)
R_data.head()

# %% {"magic_args": "-i R_data -o anova_model,anova_model_summary,ref_poly,poly_contrasts", "language": "R"}
# library(afex)
# library(lsmeans)
#
# afex_options(emmeans_model = "multivariate")
# # afex_options("emmeans_mode")
# # model = afex_options("emmeans_model")
#
# # Convert to numeric, due to pandas converting everything to strings
# R_data <- transform(R_data, Subject.ID = as.numeric(Subject.ID))
# R_data <- transform(R_data, Rating = as.numeric(Rating))
# R_data <- transform(R_data, Switch.Rate = as.numeric(Switch.Rate))
# anova_model <- aov_ez(
#     "Subject.ID",
def main():
    '''
    maine
    '''
    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']
    threads = myCommandLine.args['threads']

    print("running DRIMSEQ %s" % prefix, file=sys.stderr)

    # import
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    drim = importr('DRIMSeq')

    # get quant table and formula table
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    formulaDF = pd.read_csv(formula, header=0, sep="\t")
    pydf = pandas2ri.py2ri(formulaDF)

    # Convert pandas to R data frame.
    samples = pydf
    counts = df

    # DRIMSEQ part.
    # Formula
    if "batch" in list(formulaDF):
        R.assign('batch', samples.rx2('batch'))
    R.assign('condition', samples.rx2('condition'))
    R.assign('counts', counts)
    R.assign('samples', samples)
    R.assign('numThread', threads)
    R.assign("cooef", "condition%s" % group2)
    R('data <- dmDSdata(counts = counts, samples = samples)')
    R('filtered <- dmFilter(data, min_samps_gene_expr = 6, min_samps_feature_expr = 3, min_gene_expr = 15, min_feature_expr = 5)')

    if "batch" in list(formulaDF):
        R('design_full <- model.matrix(~ condition + batch, data = samples(filtered))')
    else:
        R('design_full <- model.matrix(~ condition, data = samples(filtered))')

    R('set.seed(123)')
    R('d <- dmPrecision(filtered, design = design_full, BPPARAM=BiocParallel::MulticoreParam(numThread))')
    R('d <- dmFit(d, design = design_full, verbose = 1, BPPARAM=BiocParallel::MulticoreParam(numThread))')
    R('contrast <- grep("condition",colnames(design_full),value=TRUE)')
    R('d <- dmTest(d, coef = contrast, verbose = 1, BPPARAM=BiocParallel::MulticoreParam(numThread))')
    res = R('merge(proportions(d),results(d,level="feature"), by=c("feature_id","gene_id"))')

    data_folder = os.path.join(os.getcwd(), outdir)
    resOut = os.path.join(data_folder, "%s_%s_v_%s_drimseq2_results.tsv" % (prefix, group1, group2))
    res.to_csv(resOut, sep='\t')
    sys.exit(1)

    # NOTE: the stageR stage-wise analysis below is unreachable because of the
    # sys.exit(1) above
    R('library(stageR)')
    R('pScreen <- results(d)$pvalue')
    R('names(pScreen) <- results(d)$gene_id')
    # Assign transcript-level pvalues to the confirmation stage
    R('pConfirmation <- matrix(results(d, level = "feature")$pvalue, ncol = 1)')
    R('rownames(pConfirmation) <- results(d, level = "feature")$feature_id')
    # Create the gene-transcript mapping
    R('tx2gene <- results(d, level = "feature")[, c("feature_id", "gene_id")]')
    # Create the stageRTx object and perform the stage-wise analysis
    R('stageRObj <- stageRTx(pScreen = pScreen, pConfirmation = pConfirmation, pScreenAdjusted = FALSE, tx2gene = tx2gene)')
    R('stageRObj <- stageWiseAdjustment(object = stageRObj, method = "dtu", alpha = 0.05)')
    R('getSignificantGenes(stageRObj)')
    R('getSignificantTx(stageRObj)')
    R('padj <- getAdjustedPValues(stageRObj, order = TRUE, onlySignificantGenes = FALSE)')
    R('head(padj)')
print "++++++++++++++++++++++++ tSeries ",colId, param, "++++++++++++++++++++++++" df1 = pd.DataFrame(columns=grpLbls) prmValDict = {x:[] for i,x in enumerate(grpLbls)} nFly=0 for g_,gtype in enumerate(genotypes): dSets = [[x[colId] for i_,x in enumerate(pooledTotalDataTmSrs[gtype][tmPt])] for tmPt in xrange(nTmpts)] for i,d in enumerate(dSets): flyNum = list(np.arange(nFly,nFly+len(d))) tPts = list(np.zeros(len(d))+i) gtypeList = [gtype for x in xrange(len(d))] dfData = {'result':d, 'timePoint': tPts,'flyNumber': flyNum, 'genotype': gtypeList} for j,l in enumerate(grpLbls): prmValDict[l].extend(dfData[l]) nFly+=len(d) df = pd.DataFrame(prmValDict, columns=grpLbls) descStats = pd.DataFrame(pandas2ri.ri2py(fsa.Summarize(statsFormula, data = pandas2ri.py2ri(df)))) ll = nparLD.f1_ld_f1(pandas2ri.py2ri(df['result']), pandas2ri.py2ri(df['timePoint']),\ pandas2ri.py2ri(df['genotype']),\ pandas2ri.py2ri(df['flyNumber']), **{'description': 'FALSE',\ 'plot_RTE':'FALSE', 'order.warning':'FALSE', }) pdWald = r_matrix_to_data_frame(ll.rx2('Wald.test'), getLabels = True) pdAnova = r_matrix_to_data_frame(ll.rx2('ANOVA.test'), getLabels = True) pdPairComp = r_matrix_to_data_frame(ll.rx2('pair.comparison'), getLabels = False) #print ('Wald test\n%r'%pdWald) #print ('ANOVA test\n%r'%pdAnova) #print ('Pariwise Comparison\n%r'%pdPairComp) tmPts = [str(i) for i in xrange(nTmpts)]
last_s2oe_id = 0
s2oe_sql = ("SELECT objectid FROM tmp_csci_suppl2_oe WHERE objectid IN "
            "(SELECT MAX(objectid) FROM tmp_csci_suppl2_oe)")
s2oe_result = smc_eng.execute(s2oe_sql)
for c in s2oe_result:
    last_s2oe_id = c[0]
# group stationcode to get just one
single_station = group.StationCode.unique()
# to do - check to make sure there is only one, but there should only be one
print "stations_grouped: %s" % single_station[0]
# find stationcode that matches between the bug record and what is in stations
station = stations.loc[stations['stationcode'] == single_station[0]]
# convert station to r dataframe
station = pandas2ri.py2ri(station)
# copy of group
group_copy = group
# make pandas dataframe to r dataframe
group = pandas2ri.py2ri(group)
# Only run cleandata if every FinalID in group is found in master list
# (metadata.rdata in BMIMetrics/inst). -Jordan 5/6/2019
finalids = set(pandas2ri.ri2py(loadMetaData()).FinalID.tolist())
group_ids = set(group_copy.FinalID.tolist())
unknown_ids = group_ids - finalids
if unknown_ids:
    errorLog("Sample %s has the following unknown FinalIDs: %s"
             % (bug_sample_id, list(unknown_ids)))
    msgs.append("Sample %s has the following unknown FinalIDs:\n" % bug_sample_id)
def cellcyclemodel(data, datarange, bins):
    global g1modelr
    global g2modelr
    histogram = np.histogram(data, range=datarange, bins=bins)
    histogramx = solim._middler(histogram[1])
    histogramy = histogram[0]
    xdis = histogramx[1] - histogramx[0]
    # find G1 max
    g1maxy = np.max(histogramy)
    g1maxset = np.where(histogramy > (0.8 * g1maxy))
    g1maxleft = g1maxset[0][0]
    g1maxright = g1maxset[0][-1]
    g1maxxin = np.argmax(histogramy)
    g1maxx = histogramx[g1maxxin]
    g1mean = (g1maxleft + 1 + (g1maxright - g1maxleft) / 2) * xdis
    robjects.r(f'g1maxy = {g1maxy}')
    robjects.r(f'g1mean = {g1mean}')
    # pick valleydivide
    valleydivide = float(input(
        'type the x-coordinate of what looks to be the low point between the two peaks: '))
    valleydividein = np.abs(histogramx - valleydivide).argmin()
    valleydivide = histogramx[valleydividein]
    valleyheight = histogramy[valleydividein]
    robjects.r(f'valleyheight = {valleyheight}')
    robjects.r(f'valleydivide = {valleydivide}')
    # find G2 max
    g2maxy = np.max(histogramy[histogramx > valleydivide])
    g2maxxin = int(2 * (g1maxleft + 1 + (g1maxright - g1maxleft) / 2))
    g2maxx = histogramx[g2maxxin]
    robjects.r(f'g2maxy = {g2maxy}')
    robjects.r(f'g2mean = {g2maxx}')
    # pick bigdivide
    bigdivide = float(input(
        'type the x-coordinate of where you think the upper boundary of the G2 peak is: '))
    bigdividein = np.abs(histogramx - bigdivide).argmin()
    bigdivide = histogramx[bigdividein]
    # pick debrisdivide
    debrisdivide = float(input(
        'type the x-coordinate of where you think the boundary between the G1 peak and debris is: '))
    debrisdividein = np.abs(histogramx - debrisdivide).argmin()
    debrisdivide = histogramx[debrisdividein]
    # find g1 modeldata
    g1startsd = np.std(data[np.logical_and(
        debrisdivide < data,
        data < (debrisdivide + (g1maxx - debrisdivide) * 2))])
    robjects.r(f'g1startsd = {g1startsd}')
    g1modelx = histogramx[debrisdividein:g1maxright + (g1maxleft - debrisdividein) + 1]
    g1modely = histogramy[debrisdividein:g1maxright + 1]
    g1modelyright = np.array(list(reversed(histogramy[debrisdividein:g1maxleft])))
    g1modely = np.concatenate((g1modely, g1modelyright))
    g1model = pd.DataFrame({'x': g1modelx, 'y': g1modely})
    g1modelr = pandas2ri.py2ri(g1model)
    # find g2 modeldata
    g2startsd = np.std(data[np.logical_and(
        valleydivide < data,
        data < (valleydivide + (g2maxx - valleydivide) * 2))])
    robjects.r(f'g2startsd = {g2startsd}')
    g2modelx = histogramx[g2maxxin - (bigdividein - g2maxxin):bigdividein + 1]
    g2modely = histogramy[g2maxxin:bigdividein + 1]
    g2modely = np.concatenate([np.array(list(reversed(g2modely[1:]))), g2modely])
    g2model = pd.DataFrame({'x': g2modelx, 'y': g2modely})
    g2modelr = pandas2ri.py2ri(g2model)
    fulldata = pd.DataFrame({'x': histogramx, 'y': histogramy})
    fulldatar = pandas2ri.py2ri(fulldata)
    modelwindow = pd.DataFrame({
        'x': histogramx[g1maxxin:g2maxxin + 1],
        'y': histogramy[g1maxxin:g2maxxin + 1]
    })
    modelwindowr = pandas2ri.py2ri(modelwindow)
    rinterface.globalenv.do_slot_assign('modelwindowr', modelwindowr)
    robjects.r('modelwindowr = attr(globalenv(), "modelwindowr")')
    rinterface.globalenv.do_slot_assign('fulldatar', fulldatar)
    robjects.r('fulldatar = attr(globalenv(), "fulldatar")')
    rinterface.globalenv.do_slot_assign('g1modelr', g1modelr)
    robjects.r('g1modelr = attr(globalenv(), "g1modelr")')
    rinterface.globalenv.do_slot_assign('g2modelr', g2modelr)
    robjects.r('g2modelr = attr(globalenv(), "g2modelr")')
    robjects.r("""
    library(purrr)
    dnormmodel = function(data, params, mean){
      return(params['height']*dnorm(x=data$x, sd=params['sd'], mean=mean))
    }
    g1model = partial(dnormmodel, mean=g1mean)
    g2model = partial(dnormmodel, mean=g2mean)
    measure_distance = function(params, data, model){
      diff = data$y - abs(model(data, params))
      return(sqrt(mean(diff^2)))
    }
    g1startheight = g1maxy / dnorm(0, sd=g1startsd)
    g2startheight = g2maxy / dnorm(0, sd=g2startsd)
    g1startparams = c(height=g1startheight, sd=g1startsd)
    # start the G2 fit from the G2 values (the original reused g1startheight
    # here and g1startparams below, leaving g2startheight unused)
    g2startparams = c(height=g2startheight, sd=g2startsd)
    g1res = optim(g1startparams, measure_distance, data = g1modelr, model=g1model)
    g2res = optim(g2startparams, measure_distance, data = g2modelr, model=g2model)
    g1est = g1model(data=fulldatar, params=g1res$par)
    g2est = g2model(data=fulldatar, params=g2res$par)
    sest = fulldatar$y - (g1est + g2est)
    """)
    return (robjects.r('g1res$par'), robjects.r('g2res$par'),
            robjects.r('g1est'), robjects.r('g2est'),
            robjects.r('sest'), histogramy, histogramx)
motif1 = pd.DataFrame()
motif2 = pd.DataFrame()
motif3 = pd.DataFrame()
motif4 = pd.DataFrame()
for tf in TFs:
    mo = os.popen("sed -n '%ip' %stf_ru_max_top4_rank_largespace/%s "
                  % (int(start1 + 1), path1, tf))
    m = re.split('\t|\n', mo.read())
    m.pop()
    m = list(map(float, m))
    motif1[tf] = m[:8]
    motif2[tf] = m[8:16]
    motif3[tf] = m[16:24]
    motif4[tf] = m[24:]
motif1_r = pandas2ri.py2ri(motif1)
motif2_r = pandas2ri.py2ri(motif2)
motif3_r = pandas2ri.py2ri(motif3)
motif4_r = pandas2ri.py2ri(motif4)
robjects.r['plot_motif'](pair, motif1_r, pair + '_' + 'chr' + str(chro) + '_' + str(start) + '_tf_top1')
robjects.r['plot_motif'](pair, motif2_r, pair + '_' + 'chr' + str(chro) + '_' + str(start) + '_tf_top2')
robjects.r['plot_motif'](pair, motif3_r, pair + '_' + 'chr' + str(chro) + '_' + str(start) + '_tf_top3')
robjects.r['plot_motif'](pair, motif4_r, pair + '_' + 'chr' + str(chro) + '_' + str(start) + '_tf_top4')
def transform(self, method="vst", design=None, inplace=True, blind=True): ''' perform transformation on counts table current methods are: - deseq2 variance stabalising transformation - deseq rlog transformation Need to supply a design table if not using "blind" ''' assert method in ["vst", "rlog"], ("method must be one of" "[vst, rlog]") method2function = { "vst": "varianceStabilizingTransformation", "rlog": "rlog" } t_function = method2function[method] r_counts = pandas2ri.py2ri(self.table) if not blind: assert design, ("if not using blind must supply a design table " "(a CGAT.Expression.ExperimentalDesign object") # currently this only accepts "~group" design transform = R(''' function(df, design){ suppressMessages(library('DESeq2')) dds <- suppressMessages(DESeqDataSetFromMatrix( countData= df, colData = design, design = ~group)) transformed <- suppressMessages(%(t_function)s(dds, blind=FALSE)) transformed_df <- as.data.frame(assay(transformed)) return(transformed_df) }''' % locals()) r_design = pandas2ri.py2ri(design.table) df = pandas2ri.ri2py(transform(r_counts, r_design)) else: transform = R(''' function(df){ suppressMessages(library('DESeq2')) design = data.frame(row.names = colnames(df), group = seq(1, length(colnames(df)))) dds <- suppressMessages(DESeqDataSetFromMatrix( countData= df, colData = design, design = ~group)) transformed <- suppressMessages(%(t_function)s(dds, blind=TRUE)) transformed_df <- as.data.frame(assay(transformed)) return(transformed_df) }''' % locals()) df = pandas2ri.ri2py(transform(r_counts)) # losing rownames for some reason during the conversion?! df.index = self.table.index if inplace: self.table = df # R replaces "-" in column names with ".". Revert back! self.table.columns = [ x.replace(".", "-") for x in self.table.columns ] else: tmp_counts = self.clone() tmp_counts.table = df tmp_counts.table.columns = [ x.replace(".", "-") for x in tmp_counts.table.columns ] return tmp_counts
def plotPCA(self, design,
            variance_plot_filename=None, pca_plot_filename=None,
            x_axis="PC1", y_axis="PC2",
            colour="group", shape="group"):
    '''
    use the prcomp function in base R to perform principal components
    analysis. Can specify colour and shape as either variables from the
    design table or sample names (separated into id_1, id_2, id_3 based
    on samples having names formatted e.g Tissue-Treatment-Replicate)
    '''
    # TS: swap this for regexes
    assert (x_axis[0:2] == "PC" and y_axis[0:2] == "PC"), \
        "x_axis and y_axis names must start with 'PC'"

    r_counts = pandas2ri.py2ri(self.table)
    r_design = pandas2ri.py2ri(design.table)

    pc_number_1 = int(x_axis.replace("PC", ""))
    pc_number_2 = int(y_axis.replace("PC", ""))

    makePCA = R('''
    function(counts, design){

      suppressMessages(library(ggplot2))
      suppressMessages(library(grid))

      gene_pca <- prcomp(t(counts), center = TRUE)

      m_text = element_text(size=12)
      s_text = element_text(size=8)

      variance = gene_pca$sdev^2
      variance_explained = round(variance/sum(variance), 5)

      variance_df = data.frame("Variance_explained" = variance_explained,
                               "PC" = seq(1, length(variance)))
      p_variance = ggplot(variance_df, aes(x=PC, y=Variance_explained))+
        geom_point()+
        geom_line()+
        theme_bw()+
        ylab("Variance explained (%%)")+
        theme(axis.text.x = m_text,
              axis.title.y = m_text,
              axis.title.x = m_text,
              axis.text.y = m_text)

      ggsave("%(variance_plot_filename)s", width=10, height=10, unit="cm")

      PCs_df = data.frame(gene_pca$x)
      PCs_df['sample'] <- rownames(PCs_df)
      design['sample'] <- gsub("-", ".", rownames(design))

      PCs_df = merge(PCs_df, design)

      PCs_df$id_1 = sapply(strsplit(PCs_df$sample, "\\\."), "[", 1)
      PCs_df$id_2 = sapply(strsplit(PCs_df$sample, "\\\."), "[", 2)
      PCs_df$id_3 = sapply(strsplit(PCs_df$sample, "\\\."), "[", 3)

      p_pca = ggplot(PCs_df, aes(x=%(x_axis)s, y=%(y_axis)s)) +
        geom_point(size=3,
                   aes(shape=as.factor(%(shape)s),
                       colour=as.factor(%(colour)s))) +
        scale_colour_discrete(name=guide_legend(title='%(colour)s')) +
        scale_shape_discrete(name=guide_legend(title='%(shape)s')) +
        xlab(paste0('PC%(pc_number_1)i (Variance explained = ',
                    round(100 * variance_explained[%(pc_number_1)i], 1), '%%)')) +
        ylab(paste0('PC%(pc_number_2)i (Variance explained = ',
                    round(100 * variance_explained[%(pc_number_2)i], 1), '%%)')) +
        theme_bw() +
        theme(axis.text.x = s_text, axis.text.y = s_text,
              title = m_text, legend.text = m_text,
              legend.title = m_text)

      ggsave("%(pca_plot_filename)s", width=10, height=10, unit="cm")
    }''' % locals())

    makePCA(r_counts, r_design)
def main():
    '''
    maine
    '''
    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    print("running DESEQ2 %s" % prefix, file=sys.stderr)

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    if "batch" in list(formulaDF):
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    ### RUN DESEQ2 ###
    R.assign('df', df)
    R.assign('sampleTable', sampleTable)
    R.assign('design', design)
    R('dds <- DESeqDataSetFromMatrix(countData = df, colData = sampleTable, design = design)')
    R('dds <- DESeq(dds)')
    R('name <- grep("condition", resultsNames(dds), value=TRUE)')

    # Get Results and shrinkage values
    res = R('results(dds, name=name)')
    resLFC = R('lfcShrink(dds, coef=name)')
    vsd = R('vst(dds, blind=FALSE)')
    resdf = robjects.r['as.data.frame'](res)
    reslfc = robjects.r['as.data.frame'](resLFC)
    dds = R('dds')

    ### Plotting section ###
    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']

    # get pca data
    if "batch" in list(formulaDF):
        pcaData = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")),
                          returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    else:
        print(vsd)
        pcaData = plotPCA(vsd, intgroup="condition", returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    data_folder = os.path.join(os.getcwd(), outdir)
    qcOut = os.path.join(data_folder, "%s_QCplots_%s_v_%s.pdf" % (prefix, group1, group2))
    grdevices.pdf(file=qcOut)

    # single format expression so "%%" is rendered as a literal "%"
    x = "PC1: %d%% variance" % int(percentVar[0] * 100)
    y = "PC2: %d%% variance" % int(percentVar[1] * 100)

    if "batch" in list(formulaDF):
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
    else:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()

    pp.plot()
    plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFCShrinkage")
    plotQQ(reslfc.rx2('pvalue'), main="LFCShrinkage pvalue QQ")
    hh = ggplot2.ggplot(resdf) + \
        ggplot2.aes_string(x="pvalue") + \
        ggplot2.geom_histogram() + \
        ggplot2.theme_classic() + \
        ggplot2.ggtitle("pvalue distribution")
    hh.plot()
    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()

    data_folder = os.path.join(os.getcwd(), outdir)
    lfcOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (prefix, group1, group2))
    resOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results.tsv" % (prefix, group1, group2))

    robjects.r['write.table'](reslfc, file=lfcOut,
quote=False, sep="\t") robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
def taxonomy(all_dataframes, sql_match_tables, errors_dict, project_code, login_info):
    errorLog("Function - taxonomy")
    message = "Custom Taxonomy: Start checks."
    statusLog("Starting Taxonomy Checks")
    errorLog(message)
    errorLog("project code: %s" % project_code)

    login_info = login_info.strip().split("-")
    login = str(login_info[0])
    agency = str(login_info[1])
    owner = str(login_info[2])
    year = str(login_info[3])
    project = str(login_info[4])

    assignment_table = ""
    custom_checks = ""
    summary_checks = ""
    summary_results_link = ""
    custom_redundant_checks = ""
    custom_errors = []
    custom_warnings = []
    custom_redundant_errors = []
    custom_redundant_warnings = []

    TIMESTAMP = str(session.get('key'))

    ### get date and time
    gettime = int(time.time())
    timestamp_date = datetime.datetime.fromtimestamp(gettime)

    # add submitted table names to list
    tables = []
    # match tablenames to tabs
    errorLog(all_dataframes.keys())
    for dataframe in all_dataframes.keys():
        df_sheet_and_table_name = dataframe.strip().split(" - ")
        errorLog(df_sheet_and_table_name)
        table_name = str(df_sheet_and_table_name[2])
        errorLog(table_name)
        if table_name == "tbl_taxonomysampleinfo":
            tables.append("sampleinfo")
            sampleinfo = all_dataframes[dataframe]
            sampleinfo['tmp_row'] = sampleinfo.index
        if table_name == "tbl_taxonomyresults":
            tables.append("result")
            result = all_dataframes[dataframe]
            result['tmp_row'] = result.index

    try:
        #####################
        ## CHECK FUNCTIONS ##
        #####################
        def checkData(statement, column, warn_or_error, error_label, human_error, dataframe):
            errorLog("checkData warn_or_error: %s" % error_label)
            for item_number in statement:
                unique_error = '{"column": "%s", "error_type": "%s", "error": "%s"}' % (
                    column, warn_or_error, human_error)
                if error_label == 'error':
                    addErrorToList("custom_errors", item_number, unique_error, dataframe)
                    errorsCount(errors_dict, 'custom')
                if error_label == 'warning':
                    addErrorToList("custom_errors", item_number, unique_error, dataframe)
                    # do not count warnings as errors - submission allowed - errorsCount('custom')

        def checkLogic(statement, column, warn_or_error, error_label, human_error, dataframe):
            for item_number in statement:
                unique_error = '{"column": "%s", "error_type": "%s", "error": "%s"}' % (
                    column, warn_or_error, human_error)
                addErrorToList("custom_errors", item_number, unique_error, dataframe)
                errorsCount(errors_dict, 'custom')

        ##################
        ## LOGIC CHECKS ##
        ##################
        errorLog("Starting Taxonomy Logic Checks")
        statusLog("Starting Taxonomy Logic Checks")
        # each sampleinfo record must have a corresponding result record.
        # records are matched on stationcode, sampledate, fieldreplicate.
        errorLog(
            "## EACH SAMPLEINFO INFORMATION RECORD MUST HAVE A CORRESPONDING RESULT RECORD. "
            "RECORDS ARE MATCHED ON STATIONCODE, SAMPLEDATE, FIELDREPLICATE ##"
        )
        errorLog(sampleinfo[
            ~sampleinfo[['stationcode', 'sampledate', 'fieldreplicate']]
            .isin(result[['stationcode', 'sampledate', 'fieldreplicate']]
                  .to_dict(orient='list')).all(axis=1)])
        checkLogic(
            sampleinfo[
                ~sampleinfo[['stationcode', 'sampledate', 'fieldreplicate']]
                .isin(result[['stationcode', 'sampledate', 'fieldreplicate']]
                      .to_dict(orient='list')).all(axis=1)].index.tolist(),
            'StationCode/SampleDate/FieldReplicate', 'Logic Error', 'error',
            'Each Taxonomy SampleInfo record must have a corresponding Taxonomy Result record. Records are matched on StationCode, SampleDate, and FieldReplicate.',
            sampleinfo)
        errorLog(result[
            ~result[['stationcode', 'sampledate', 'fieldreplicate']]
            .isin(sampleinfo[['stationcode', 'sampledate', 'fieldreplicate']]
                  .to_dict(orient='list')).all(axis=1)])
        checkLogic(
            result[~result[['stationcode', 'sampledate', 'fieldreplicate']]
                   .isin(sampleinfo[['stationcode', 'sampledate', 'fieldreplicate']]
                         .to_dict(orient='list')).all(axis=1)].index.tolist(),
            'StationCode/SampleDate/FieldReplicate', 'Logic Error', 'error',
            'Each Taxonomy Result record must have a corresponding Taxonomy SampleInfo record. Records are matched on StationCode, SampleDate, and FieldReplicate.',
            result)

        ###################
        ## CUSTOM CHECKS ##
        ###################
        message = "Starting Custom Taxonomy Checks"
        errorLog(message)
        statusLog(message)

        ## Jordan - Taxonomicqualifier Multi Value Lookup List: check to make sure
        ## taxonomicqualifier field data is valid (multiple values may be accepted).
        errorLog(result['taxonomicqualifier'])
        errorLog(
            "Taxonomicqualifier Multi Value Lookup List: check to make sure taxonomicqualifier field data is valid (multiple values may be accepted)."
        )
        nan_rows, invalid_codes, subcodes = dcValueAgainstMultipleValues(
            current_app.eng, 'lu_taxonomicqualifier', 'taxonomicqualifiercode',
            result, 'taxonomicqualifier')
        errorLog("Check submitted data for at least one code:")
        checkData(
            nan_rows, 'TaxonomicQualifier', 'Custom Error', 'error',
            'At least one TaxonomicQualifier code is required; please check the list: <a href=http://smcchecker.sccwrp.org/smc/scraper?action=help&layer=lu_taxonomicqualifier target=_blank>lu_taxonomicqualifier</a>.',
            result)
        errorLog("Check submitted data for invalid code (or code combination):")
        checkData(
            invalid_codes, 'TaxonomicQualifier', 'Custom Error', 'error',
            'At least one TaxonomicQualifier code is invalid; please check the list: <a href=http://smcchecker.sccwrp.org/smc/scraper?action=help&layer=lu_taxonomicqualifier target=_blank>lu_taxonomicqualifier</a>',
            result)

        ## Jordan - Sample/Result SampleDate field - make sure user did not accidentally drag down date
        errorLog('Sample/Result SampleDate field - make sure user did not accidentally drag down date')
        # If every date submitted is consecutive from the first, it will error
        # out every row. Otherwise, no error is thrown.
        if sampleinfo.sampledate.diff()[1:].sum() == pd.Timedelta('%s day' % (len(sampleinfo) - 1)):
            checkData(
                sampleinfo.loc[sampleinfo.sampledate.diff() == pd.Timedelta('1 day')].tmp_row.tolist(),
                'SampleDate', 'Custom Error', 'Error',
                'Consecutive Dates. Make sure you did not accidentally drag down the date',
                sampleinfo)
        if result.sampledate.diff()[1:].sum() == pd.Timedelta('%s day' % (len(result) - 1)):
            checkData(
                result.loc[result.sampledate.diff() == pd.Timedelta('1 day')].tmp_row.tolist(),
                'SampleDate', 'Custom Error', 'Error',
                'Consecutive Dates. Make sure you did not accidentally drag down the date',
                result)
        ## Jordan - FinalID / LifeStageCode combination must match combination
        ## found in vw_organism_lifestage_lookup
        errorLog('FinalID / LifeStageCode combination must match combination found in vw_organism_lifestage_lookup')
        # build list of FinalID/LifeStageCode combinations from lookup lists
        eng = create_engine('postgresql://*****:*****@192.168.1.17:5432/smc')
        lu_organisms = "SELECT organismcode, finalid, lifestagecode FROM vw_organism_lifestage_lookup;"
        #lu_organismdetaillookup = "SELECT organismcode, lifestagecode FROM lu_organismdetaillookup;"
        organisms = pd.read_sql_query(lu_organisms, eng)
        #organismdetaillookup = pd.read_sql_query(lu_organismdetaillookup, eng)
        #valid_pairs = organisms.merge(organismdetaillookup, on=['organismcode'], how='inner')
        valid_pairs_list = list(organisms['finalid'] + '_' + organisms['lifestagecode'])
        # compare pairs of submitted FinalID / LifeStageCode to valid pairings
        # from the lookup lists
        errorLog("result where FinalID/LifeStageCode does not match pair from lookup list:")
        errorLog(result[pd.Series(result.finalid + '_' + result.lifestagecode).isin(valid_pairs_list)])
        # perform check on data
        checkData(
            result[~pd.Series(result.finalid + '_' + result.lifestagecode).isin(valid_pairs_list)].tmp_row.tolist(),
            'FinalID/LifeStageCode', 'Undefined Error', 'error',
            'FinalID/LifeStageCode pair is not valid. Refer to <a href=http://smcchecker.sccwrp.org/smc/scraper?action=help&layer=vw_organism_lifestage_lookup target=_blank>vw_organism_lifestage_lookup</a> for valid pairings',
            result)

        #####################
        ## START MAP CHECK ##
        #####################
        # get a unique list of stations from results file
        rlist_of_stations = pd.unique(result['stationcode'])
        result_unique_stations = ','.join("'" + s + "'" for s in rlist_of_stations)

        ################
        ## NEW FIELDS ##
        ################
        sampleinfo['project_code'] = project_code
        result['project_code'] = project_code

        ############################ Note: failure to run csci should not result in a failure
        ## BUILD and Process CSCI ## to submit data - csci status should always = 0
        ############################
        # Dont run csci code if there are custom errors - data must be clean
        total_count = errors_dict['total']
        errorLog("total error count: %s" % total_count)
        errorLog("project code: %s" % project_code)
        if total_count == 0:
            message = "Starting CSCI Processing..."
errorLog(message) statusLog(message) msgs = [] # combine results and sampleinfo on stationcode we want to get collectionmethod field from sampleinfo bugs = pd.merge( result, sampleinfo[[ 'stationcode', 'fieldsampleid', 'fieldreplicate', 'collectionmethodcode' ]], on=['stationcode', 'fieldsampleid', 'fieldreplicate'], how='left') # original submitted stations list_of_original_unique_stations = pd.unique(bugs['stationcode']) errorLog("list_of_original_unique_stations:") errorLog(list_of_original_unique_stations) unique_original_stations = ','.join( "'" + s + "'" for s in list_of_original_unique_stations) # concatenate stationcode, sampledate, collectionmethod, fieldreplicate into one field called sampleid errorLog("create sampleid:") # first get adjusted date bugs["samplerealdate"] = bugs["sampledate"].dt.strftime( '%m%d%Y').map(str) bugs["samplemonth"] = bugs["sampledate"].dt.strftime('%m').map(str) bugs["sampleday"] = bugs["sampledate"].dt.strftime('%d').map(str) bugs["sampleyear"] = bugs["sampledate"].dt.strftime('%Y').map(str) # merge two bugs["codeanddate"] = bugs.stationcode.astype(str).str.cat( bugs['samplerealdate'], sep='_') # merge two bugs["collectionandreplicate"] = bugs.collectionmethodcode.astype( str).str.cat(bugs['fieldreplicate'].astype(str), sep='_') # merge both bugs["sampleid"] = bugs.codeanddate.str.cat( bugs.collectionandreplicate, sep='_') # drop temp columns bugs.drop( ['samplerealdate', 'codeanddate', 'collectionandreplicate'], axis=1, inplace=True) # BUGS IS BUILT OFF THE MERGENCE OF BUG FILE AND GISSTATIONCODEXWALK # BUT STATIONCODE SHOULD ACTUALLY BE GISCODE NOT STATIONCODE # ResultsTable:StationCode links to Crosswalk:StationCode, which links to GISMetrics:GISCode # call gisxwalk table using unique stationcodes and get databasecode and giscode errorLog("building xwalk...") eng = create_engine( 'postgresql://*****:*****@192.168.1.17:5432/smc') sqlwalk = 'select stationcode,databasecode,giscode from lu_newgisstationcodexwalk where stationcode in (%s)' % unique_original_stations gisxwalk = pd.read_sql_query(sqlwalk, eng) #bugs = pd.merge(bugs,gisxwalk[['stationcode','giscode','databasecode']], on = ['stationcode'], how='inner') bugs = pd.merge( bugs, gisxwalk[['stationcode', 'giscode', 'databasecode']], on=['stationcode'], how='inner') # only process stations that have associated gismetric data missing_bugs_xwalk = set(list_of_original_unique_stations) - set( bugs.stationcode.tolist()) # send email if stations missing GIS Metric data. 
if missing_bugs_xwalk: bad_stations = '\n'.join(str(x) for x in missing_bugs_xwalk) msgs.append('CSCI Error:\n') msgs.append( 'The following stations are missing GISXWalk data:\n') msgs.append(bad_stations) print msgs # original stations translated to smc stations using giscode list_of_unique_stations = pd.unique(bugs['giscode']) errorLog("list_of_unique_stations:") errorLog(list_of_unique_stations) unique_stations = ','.join("'" + s + "'" for s in list_of_unique_stations) #### STATIONS IS BUILT OFF THE MERGENCE OF BUG FILE AND GISMETRICS errorLog("building gismetrics...") sqlmetrics = 'select * from tbl_newgismetrics' gismetrics = pd.read_sql_query(sqlmetrics, eng) # merge gismetrics and gisxwalk to get giscode into dataframe # merge bugs/stationcode and gismetrics/giscode # check stations test_stations = pd.unique(bugs['stationcode']) # problem - gismetrics stationcode is replacing bugs-originalsubmission stationcode thats a problem errorLog(test_stations) # copy bugs.stationcode to retain in stations below bugs['original_stationcode'] = bugs['stationcode'] stations = pd.merge(gismetrics, bugs[['giscode', 'original_stationcode']], left_on=['stationcode'], right_on=['giscode'], how='inner') # drop gismetrics stationcode stations.drop(['stationcode'], axis=1, inplace=True) stations.rename(columns={'original_stationcode': 'stationcode'}, inplace=True) eng.dispose() # check stations test2_stations = pd.unique(stations['stationcode']) errorLog(test2_stations) # only process stations that have associated gismetric data missing_bugs_stations = set(list_of_unique_stations) - set( bugs.giscode.tolist()) missing_stations_stations = set(list_of_unique_stations) - set( stations.giscode.tolist()) # send email if stations missing GIS Metric data. if missing_bugs_stations | missing_stations_stations: bad_stations = '\n'.join( str(x) for x in missing_bugs_stations.union( missing_stations_stations)) msgs.append('CSCI Error:\n') msgs.append( 'The following stations are missing GISMetric data:\n') msgs.append(bad_stations) print msgs # drop unnecessary columns bugs.drop(bugs[[ 'fieldsampleid', 'unit', 'excludedtaxa', 'personnelcode_labeffort', 'personnelcode_results', 'enterdate', 'taxonomicqualifier', 'qacode', 'resqualcode', 'labsampleid', 'benthicresultscomments', 'agencycode_labeffort', 'tmp_row', 'result' ]], axis=1, inplace=True) # if row exists drop row, errors, and lookup_error if 'row' in bugs.columns: bugs.drop(bugs[['row', 'errors']], axis=1, inplace=True) if 'lookup_error' in bugs.columns: bugs.drop(bugs[['lookup_error']], axis=1, inplace=True) stations.drop(stations[['objectid', 'gdb_geomattr_data', 'shape']], axis=1, inplace=True) # rename field bugs = bugs.rename( columns={ 'stationcode': 'StationCode', 'sampledate': 'SampleDate', 'fieldreplicate': 'FieldReplicate', 'collectionmethodcode': 'CollectionMethodCode', 'finalid': 'FinalID', 'lifestagecode': 'LifeStageCode', 'baresult': 'BAResult', 'databasecode': 'DatabaseCode', 'sampleid': 'SampleID', 'distinctcode': 'Distinct' }) errorLog(bugs) # drop all duplicates stations.drop_duplicates(inplace=True) errorLog(stations) bugs_count = len(bugs.index) stations_count = len(stations.index) errorLog("bugs_count:") errorLog(bugs_count) errorLog("stations_count:") errorLog(stations_count) # UPDATE: If bugs or stations are empty, CSCI cannot be processed. -Jordan 4/23/2019 if bugs_count == 0 or stations_count == 0: errorLog( "bugs and stations could not be built. Do not process CSCI." 
) checkData( sampleinfo.tmp_row.tolist(), 'stationcode', 'Undefined Warning', 'warning', 'The data you submitted does not meet the minimum requirements to process CSCI. You may continue submitting, but CSCI Reports cannot be generated at this time.', sampleinfo) else: # Import and Execute cleanData and CSCI functions import rpy2 import rpy2.robjects as robjects from rpy2.robjects import pandas2ri import rpy2.robjects.packages as rpackages from rpy2.robjects.packages import importr import rpy2.rinterface as rinterface # shortens notation for accessing robjects r = robjects.r # imports R package: CSCI CSCI = importr('CSCI') # convert cleanData() and CSCI() functions from CSCI package to python cd = CSCI.cleanData csci = CSCI.CSCI # collect errors and error counts for each group error_count = {'clean data': 0, 'CSCI': 0} cd_group_errors = [] csci_group_errors = [] # process cleanData and CSCI for each Sample bugs_grouped = bugs.groupby(['SampleID']) # open log file for printing status TIMESTAMP = str(int(round(time.time() * 1000))) logfile = '/var/www/smc/testfiles/' + TIMESTAMP + '.log' # Activate R to Python DataFrame conversions pandas2ri.activate() start_time = int(time.time()) count = 0 for name, group in bugs_grouped: # print current group print "group name: %s" % (name) bug_sample_id = name # group stationcode to get just one single_station = group.StationCode.unique() # check to makesure there is only one print "stations_grouped: %s" % single_station[0] # find stationcode that matches between the bugs record and what is in stations station = stations.loc[stations['stationcode'] == single_station[0]] # convert group, station to R dataframe errorLog("convert group, station to R dataframe") group = pandas2ri.py2ri(group) station = pandas2ri.py2ri(station) # copy of group errorLog( "make a copy of group and adjust sampledate fields") group_copy = group #errorLog("group_copy:") #errorLog(group_copy) group_copy = pandas2ri.ri2py(group_copy) #errorLog("list group copy:") #errorLog(list(group_copy)) group_copy.columns = [ x.lower() for x in group_copy.columns ] # get samplemonth, sampleday, sampleyear for later use #group_copy["sampledate"] = pd.datetime.strptime(group_copy['sampledate'], '%Y-%m-%d') #group_copy["samplemonth"] = group_copy.sampledate.dt.month #group_copy["sampleday"] = group_copy.sampledate.dt.day #group_copy["sampleyear"] = group_copy.sampledate.dt.year ''' # clean group with cleanData() cd_list = cd(group,msgs=True) group = cd_list[0] warn_msg = cd_list[1] # if data cannot be cleaned, prepare email message if warn_msg[0] != 'Data already clean': errorLog('cleanData Failed:\n') bad_station = 'cleanData failed on station %s:\n' %single_station[0] bad_group = 'Sample %s could not be cleaned because %s.' 
%(bug_sample_id,warn_msg[0]) errorLog(bad_station) errorLog(bad_group) msgs.append('CSCI Error:\n') msgs.append(bad_station) msgs.append(bad_group) else: ''' try: errorLog("data is clean process csci") errorLog(station) errorLog(group) report = csci(group, station) # assign csci elements to proper tables errorLog("assign elements to specific tables") core = pandas2ri.ri2py(report[0]) s1mmi = pandas2ri.ri2py(report[1]) s1grps = pandas2ri.ri2py(report[2]) s1oe = pandas2ri.ri2py(report[3]) s2oe = pandas2ri.ri2py(report[4]) s2mmi = pandas2ri.ri2py(report[5]) # fields that need to be filled errorLog("first - csci") errorLog(core) core.columns = [x.lower() for x in core.columns] core['processed_by'] = "checker" core['cleaned'] = "Yes" core['scorenotes'] = "Distinct set to NA" core['rand'] = 2 core['scoredate'] = timestamp_date core[ 'record_origin'] = project # should probably be SMC core['origin_lastupdatedate'] = timestamp_date core['record_publish'] = "False" core = pd.merge(core, group_copy[[ 'sampleid', 'sampledate', 'sampleday', 'samplemonth', 'sampleyear', 'collectionmethodcode', 'fieldreplicate' ]], on=['sampleid'], how='left') core['sampledate'] = pd.to_datetime(core['sampledate'], unit='s').dt.date core = core.drop_duplicates() core_file = "/var/www/smc/logs/%s.core.csv" % TIMESTAMP # only show header once if count == 0: core.to_csv(core_file, sep=',', mode='a', encoding='utf-8', index=False) else: # skip next loop core.to_csv(core_file, sep=',', mode='a', encoding='utf-8', index=False, header=False) errorLog("second - s1mmi") s1mmi.columns = [x.lower() for x in s1mmi.columns] s1mmi['processed_by'] = "checker" s1mmi.rename(columns={ 'coleoptera_percenttaxa_predicted': 'coleoptera_percenttaxa_predict' }, inplace=True) s1mmi[ 'record_origin'] = project # should probably be SMC s1mmi['origin_lastupdatedate'] = timestamp_date s1mmi['record_publish'] = "False" s1mmi = s1mmi.drop_duplicates() s1mmi_file = "/var/www/smc/logs/%s.Suppl1_mmi.csv" % TIMESTAMP # only show header once if count == 0: s1mmi.to_csv(s1mmi_file, sep=',', mode='a', encoding='utf-8', index=False) else: # skip next loop s1mmi.to_csv(s1mmi_file, sep=',', mode='a', encoding='utf-8', index=False, header=False) errorLog("third - s2mmi") s2mmi.columns = [x.lower() for x in s2mmi.columns] s2mmi['processed_by'] = "checker" s2mmi[ 'record_origin'] = project # should probably be SMC s2mmi['origin_lastupdatedate'] = timestamp_date s2mmi['record_publish'] = "False" s2mmi = s2mmi.drop_duplicates() s2mmi_file = "/var/www/smc/logs/%s.Suppl2_mmi.csv" % TIMESTAMP # only show header once if count == 0: s2mmi.to_csv(s2mmi_file, sep=',', mode='a', encoding='utf-8', index=False) else: # skip next loop s2mmi.to_csv(s2mmi_file, sep=',', mode='a', encoding='utf-8', index=False, header=False) errorLog("fourth - s1grps") s1grps.columns = [x.lower() for x in s1grps.columns] s1grps['processed_by'] = "checker" s1grps[ 'record_origin'] = project # should probably be SMC s1grps['origin_lastupdatedate'] = timestamp_date s1grps['record_publish'] = "False" s1grps = s1grps.drop_duplicates() s1grps_file = "/var/www/smc/logs/%s.Suppl1_grps.csv" % TIMESTAMP # only show header once if count == 0: s1grps.to_csv(s1grps_file, sep=',', mode='a', encoding='utf-8', index=False) else: # skip next loop s1grps.to_csv(s1grps_file, sep=',', mode='a', encoding='utf-8', index=False, header=False) errorLog("fifth - s1oe") s1oe.columns = [x.lower() for x in s1oe.columns] #print s1oe #s1oe['objectid'] = s1oe.apply(lambda x: int(x.objectid) + x.index, axis=1) 
s1oe['processed_by'] = "checker" s1oe[ 'record_origin'] = project # should probably be SMC s1oe['origin_lastupdatedate'] = timestamp_date s1oe['record_publish'] = "False" s1oe = s1oe.drop_duplicates() s1oe_file = "/var/www/smc/logs/%s.Suppl1_OE.csv" % TIMESTAMP # only show header once if count == 0: s1oe.to_csv(s1oe_file, sep=',', mode='a', encoding='utf-8', index=False) else: # skip next loop s1oe.to_csv(s1oe_file, sep=',', mode='a', encoding='utf-8', index=False, header=False) errorLog("sixth - s2oe") s2oe.columns = [x.lower() for x in s2oe.columns] # fill na with -88 #s2oe.fillna(-88, inplace=True) s2oe['captureprob'].replace(['NA'], -88, inplace=True) s2oe['processed_by'] = "checker" s2oe[ 'record_origin'] = project # should probably be SMC s2oe['origin_lastupdatedate'] = timestamp_date s2oe['record_publish'] = "False" s2oe = s2oe.drop_duplicates() s2oe_file = "/var/www/smc/logs/%s.Suppl2_OE.csv" % TIMESTAMP # only show header once if count == 0: s2oe.to_csv(s2oe_file, sep=',', mode='a', encoding='utf-8', index=False) else: # skip next loop s2oe.to_csv(s2oe_file, sep=',', mode='a', encoding='utf-8', index=False, header=False) summary_results_link = TIMESTAMP count = count + 1 #file_to_get = "/var/www/smc/logs/%s.core.csv" % TIMESTAMP #errorLog("file to get:") #errorLog(file_to_get) #all_dataframes["2 - core_csv - tmp_cscicore"] = pd.read_csv('/var/www/smc/logs/%s.core.csv' % TIMESTAMP) #all_dataframes["2 - core_csv - tmp_cscicore"].columns = [x.lower() for x in all_dataframes["2 - core_csv - tmp_cscicore"].columns] ## WHAT HAPPENS IF CSCI SCORE IS ALREADY IN DATABASE - MAY WANT TO CHECK ABOVE #errorLog("print core_csv columns:") #errorLog(list(all_dataframes["2 - core_csv - tmp_cscicore"])) #errorLog("remove index:") #all_dataframes["2 - core_csv - tmp_cscicore"].drop(['unnamed: 0'],axis=1, inplace=True) #errorLog(list(all_dataframes["2 - core_csv - tmp_cscicore"])) #errorLog(all_dataframes["2 - core_csv - tmp_cscicore"]) # #summary_results_link = 'http://smcchecker.sccwrp.org/smc/logs/%s.core.csv' % TIMESTAMP #summary_results_link = TIMESTAMP ### IMPORTANT LOAD ONE CSCI FIELD FROM CSV FILE AND MAP IT TO EXISTING BUGS/STATIONS DATAFRAME THEN OUTPUT TO CSV LOAD FILE FOR IMPORT ### AT STAGING INTO DATABASES message = "Success CSCI" errorLog(message) ''' # code below wont work do to sampledate getting changed to number instead of date - fails on submission all_dataframes["2 - CSCI_Core - csci_core"] = core all_dataframes["3 - CSCI_Suppl1_MMI - csci_suppl1_mmi"] = s1mmi all_dataframes["4 - CSCI_Suppl2_MMI - csci_suppl2_mmi"] = s2mmi all_dataframes["5 - CSCI_Suppl1_GRPS - csci_suppl1_grps"] = s1grps all_dataframes["6 - CSCI_Suppl1_OE - csci_suppl1_oe"] = s1oe all_dataframes["7 - CSCI_Suppl2_OE - csci_suppl2_oe"] = s2oe ''' message = str(msgs) state = 0 except Exception as e: # here is where we email sccwrp to let them know we couldnt get csci score for sampleid - we still need load the data and try to load other sampleids bad_station = '\n CSCI Processing Failed on station %s:\n' % single_station[ 0] bad_group = 'Sample %s could not be processed because %s.\n' % ( bug_sample_id, e[0]) msgs.append('CSCI Error:\n') msgs.append(bad_station) msgs.append(bad_group) errorLog("CSCI ran into the following error: %s" % e[0]) msgs.append('Failed to run csci\n') all_dataframes["2 - CSCI_Core - csci_core"] = pd.read_csv( "/var/www/smc/logs/%s.core.csv" % TIMESTAMP) all_dataframes[ "3 - CSCI_Suppl1_MMI - csci_suppl1_mmi"] = pd.read_csv( "/var/www/smc/logs/%s.Suppl1_mmi.csv" % TIMESTAMP) all_dataframes[ 
"4 - CSCI_Suppl2_MMI - csci_suppl2_mmi"] = pd.read_csv( "/var/www/smc/logs/%s.Suppl2_mmi.csv" % TIMESTAMP) all_dataframes[ "5 - CSCI_Suppl1_GRPS - csci_suppl1_grps"] = pd.read_csv( "/var/www/smc/logs/%s.Suppl1_grps.csv" % TIMESTAMP) all_dataframes[ "6 - CSCI_Suppl1_OE - csci_suppl1_oe"] = pd.read_csv( "/var/www/smc/logs/%s.Suppl1_OE.csv" % TIMESTAMP) all_dataframes[ "7 - CSCI_Suppl2_OE - csci_suppl2_oe"] = pd.read_csv( "/var/www/smc/logs/%s.Suppl2_OE.csv" % TIMESTAMP) message = msgs errorLog(message) state = 0 for dataframe in all_dataframes.keys(): if 'custom_errors' in all_dataframes[dataframe]: custom_errors.append( getCustomErrors(all_dataframes[dataframe], dataframe, 'custom_errors')) custom_redundant_errors.append( getCustomRedundantErrors(all_dataframes[dataframe], dataframe, "custom_errors")) if 'custom_warnings' in all_dataframes[dataframe]: errorLog("custom_warnings") custom_errors.append( getCustomErrors(all_dataframes[dataframe], dataframe, 'custom_warnings')) errorLog(custom_warnings) custom_redundant_errors.append( getCustomRedundantErrors(all_dataframes[dataframe], dataframe, "custom_warnings")) custom_checks = json.dumps(custom_errors, ensure_ascii=True) custom_redundant_checks = json.dumps(custom_redundant_errors, ensure_ascii=True) ## END RETRIEVE ERRORS ## # get filenames from fileupload routine errorLog(message) #assignment_table = result.groupby(['stationid','lab','analyteclass']).size().to_frame(name = 'count').reset_index() # lets reassign the analyteclass field name to species so the assignment query will run properly - check StagingUpload.py for details #assignment_table = assignment_table.rename(columns={'analyteclass': 'species'}) return assignment_table, custom_checks, custom_redundant_checks, summary_checks, summary_results_link, message, result_unique_stations except ValueError: message = "Critical Error: Failed to run taxonomy checks" errorLog(message) state = 1 return jsonify(message=message, state=state)
def trt(df1, treated, control):
    # pandas df, treated columns, control columns: moderated t-test via limma's
    # empirical Bayes fit (trt.fit comes from the sourced source.functions.r.txt).
    df = df1.copy()
    combined = treated + control
    df[combined] = df[combined].astype(float)
    df = df.reset_index()
    del df['index']
    # R chokes on arbitrary column names, so map every column to 'columnN' and
    # remember the mapping to restore the original names afterwards.
    names = {}
    ren = {}
    count = 1
    tr = []
    ct = []
    for i in df.columns:
        col = 'column{}'.format(count)
        names[col] = i
        ren[i] = col
        if i in treated:
            tr.append(col)
        elif i in control:
            ct.append(col)
        count += 1
    df.rename(columns=ren, inplace=True)
    ro.r("library(limma)")
    ro.r("library(qvalue)")
    rdf = pandas2ri.py2ri(df)
    ro.globalenv['data'] = rdf
    ro.r("data[ is.na(data) ] <- NA")
    ro.r("tr <- c{}".format(tuple(tr)))
    ro.r("ct <- c{}".format(tuple(ct)))
    #c='''source("http://www.biostat.jhsph.edu/~kkammers/software/eupa/source.functions.r")'''
    ro.r('''source("source.functions.r.txt")''')
    # design matrix: 2 marks treated columns, 1 marks controls
    design = [2] * len(tr) + [1] * len(ct)
    ro.r("design <- model.matrix(~factor(c{}))".format(tuple(design)))
    ro.r('''colnames(design) <- c("Intercept", "Diff")''')
    ro.r('res.eb <- trt.fit(data[, c(tr,ct)], design)')
    dfebi = pandas2ri.ri2py(ro.r['res.eb'])
    assert 'index' not in dfebi.columns
    # R rownames come back as strings; cast them so the join realigns rows
    dfebi.index = dfebi.reset_index()['index'].apply(int)
    df = df.join(dfebi)
    df = df.rename(columns=names)
    df = df[df['adj.P.Val'].notnull()]
    df = df.sort_values('adj.P.Val', ascending=True)
    return df
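# Hypothetical use of trt() above, assuming an R installation with limma and qvalue
# plus the sourced "source.functions.r.txt" providing trt.fit(); the intensity
# matrix and column names here are illustrative. The adj.P.Val column follows
# limma's topTable naming.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
toy = pd.DataFrame(rng.lognormal(size=(100, 6)),
                   columns=['t1', 't2', 't3', 'c1', 'c2', 'c3'])
# res = trt(toy, treated=['t1', 't2', 't3'], control=['c1', 'c2', 'c3'])
# res[['P.Value', 'adj.P.Val']].head()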
def ANOVA_RM(dataframe, val, var, subject):
    # Repeated-measures ANOVA (lme from nlme) with Tukey contrasts and Bonferroni
    # adjustment (glht from multcomp); val is the response column, var the factor,
    # subject the repeated-measures grouping column.
    ro.r("library('FSA')")
    ro.r("library('nlme')")
    ro.r("require(multcomp)")
    rdf = pandas2ri.py2ri(dataframe)
    ro.globalenv['data'] = rdf
    # Homogeneity-of-variance diagnostics; printed to the R console only.
    try:
        ro.r("your.bartlett = bartlett.test(data${}~data${})".format(val, var))
        ro.r("print(your.bartlett)")
        bartlett = robjects.r('your.bartlett$p.value')[0]
    except Exception:
        bartlett = np.nan
    try:
        ro.r("your.fligner = fligner.test(data${}~data${})".format(val, var))
        ro.r("print(your.fligner)")
        fligner = robjects.r('your.fligner$p.value')[0]
    except Exception:
        fligner = np.nan
    try:
        ro.r("your.lme = lme({} ~ {}, data=data, random=~1|{}/{})".format(
            val, var, subject, var))
        ro.r("your.anova = anova(your.lme)")
        an = pandas2ri.ri2py(ro.r['your.anova'])
        an = an.loc[var].to_dict()
        ro.r("your.sum = summary(glht(your.lme, linfct=mcp({} = 'Tukey')), "
             "test = adjusted(type = 'bonferroni'))".format(var))
        ro.r("pvals = your.sum[10]$test$pvalues")
        ro.r('data.frame = data.frame(as.list(pvals))')
        df = pandas2ri.ri2py(ro.r['data.frame'])
        df = df.reset_index()
        del df['index']
        df.loc[0, 'Tukey correction'] = 'Bonferroni'
        for key in an:
            df.loc[0, 'ANOVA ' + key] = an[key]
    except Exception:
        df = pd.DataFrame()
    return df
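# Hypothetical call to ANOVA_RM() above, assuming the R packages FSA, nlme and
# multcomp are installed; the long-format columns below are illustrative.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
long_df = pd.DataFrame({
    'value': rng.normal(size=30),
    'condition': ['a', 'b', 'c'] * 10,          # factor tested by the ANOVA
    'subject': np.repeat(np.arange(10), 3),     # repeated-measures grouping
})
# stats = ANOVA_RM(long_df, 'value', 'condition', 'subject')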
def import_data(out_itr, evalTime, categorical_columns=None, continuous_columns=None):
    """Preprocess the data for the model to train, validate and predict

    Arguments:
        out_itr: index of the 5-fold cross-validation split of the simulated dataset
        evalTime: Evaluation times
        categorical_columns: A list of names of the categorical columns in the dataframe
        continuous_columns: A list of names of the continuous columns in the dataframe

    Returns:
        All the attributes that will be used in the model to train, validate and predict
    """
    ### Loading data from the folder named as the dataset (Synthetic/WIHS/SEER) in the code directory
    train_df = pd.read_csv('Synthetic/train_data_{}.csv'.format(out_itr))
    val_df = pd.read_csv('Synthetic/valid_data_{}.csv'.format(out_itr))
    test_df = pd.read_csv('Synthetic/test_data_{}.csv'.format(out_itr))

    # Create a 'train' column tagging the training, validation and test data and
    # combine them, then convert the categorical variables into dummy variables on
    # the combined data so the number of columns in all three datasets stays equal.
    train_df['train'] = 1
    val_df['train'] = 2
    test_df['train'] = 3
    df = pd.concat([train_df, val_df, test_df])

    # Convert the categorical variables into dummy variables
    if categorical_columns is not None:
        df = to_one_hot(df, categorical_columns)

    train_data = df[df['train'] == 1]
    val_data = df[df['train'] == 2]
    test_data = df[df['train'] == 3]

    # Drop the 'train' column from all three datasets.
    train_data = train_data.drop(columns=['train'])
    val_data = val_data.drop(columns=['train'])
    test_data = test_data.drop(columns=['train'])

    # Standardize the continuous columns (the scaling is fit on the training data)
    if continuous_columns is not None:
        train_data = standardized(train_data, train_data, continuous_columns)
        val_data = standardized(train_data, val_data, continuous_columns)
        test_data = standardized(train_data, test_data, continuous_columns)

    # Full dataset
    dataset = df.drop(columns=['train'])
    label = np.asarray(dataset[['status']])
    time = np.asarray(dataset[['time']])
    data = np.asarray(dataset.drop(columns=['status', 'time']))

    num_Category = int(np.max(time) * 1.2)  # to have enough time-horizon
    num_Event = int(len(np.unique(label)) - 1)  # number of events (excluding censoring as an event)
    num_evalTime = len(evalTime)  # No. of evaluation times

    # Preprocess the training data
    tr_time = np.asarray(train_data[['time']])
    tr_label = np.asarray(train_data[['status']])
    eval_time = FloatVector(evalTime)

    # Convert the Python dataframe to R
    with localconverter(default_converter + pandas2ri.converter) as cv:
        train_data_pseudo = pandas2ri.py2ri(train_data)
    train_pseudo_data = get_conditional_pseudo_data(train_data_pseudo, eval_time)
    train_pseudo = pandas2ri.ri2py(train_pseudo_data)
    tr_data = train_pseudo.drop(['y'], axis=1)
    tr_data = np.asarray(tr_data)
    x_dim = np.shape(tr_data)[1]
    y_train = np.asarray(train_pseudo.loc[:, 'y'])

    # Preprocess the validation data
    va_time = np.asarray(val_data[['time']])
    va_label = np.asarray(val_data[['status']])
    with localconverter(default_converter + pandas2ri.converter) as cv:
        val_data_pseudo = pandas2ri.py2ri(val_data)
    va_data = get_conditional_test_data(val_data_pseudo, eval_time)
    va_data = pandas2ri.ri2py(va_data)
    va_data = np.asarray(va_data)

    # Preprocess the test data
    te_time = np.asarray(test_data[['time']])
    te_label = np.asarray(test_data[['status']])
    with localconverter(default_converter + pandas2ri.converter) as cv:
        test_data_pseudo = pandas2ri.py2ri(test_data)
    te_data = get_conditional_test_data(test_data_pseudo, eval_time)
    te_data = pandas2ri.ri2py(te_data)
    te_data = np.asarray(te_data)

    return tr_data, tr_time, tr_label, y_train, va_data, va_time, va_label, te_data, te_time, te_label, num_Category, num_Event, num_evalTime, x_dim
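# Portability note (a sketch, not from the source): pandas2ri.py2ri()/ri2py() are
# the rpy2 2.x API. Under rpy2 3.x the same round trip goes through the conversion
# protocol; a minimal shim, assuming rpy2 >= 3.0:
def py2ri_compat(py_df):
    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri as p2r
    from rpy2.robjects.conversion import localconverter
    with localconverter(ro.default_converter + p2r.converter):
        return ro.conversion.py2rpy(py_df)  # pandas DataFrame -> R data.frame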
# Fit auto-ARIMA to find the optimal ARMA order for the mean model
model = auto_arima(snp_returns_rolling)
model.fit(snp_returns_rolling)

# Extract p and q, required for feeding into the GARCH mean model;
# model.order is (p, d, q), and the differencing order d is not used by
# ugarchspec's ARMA mean model.
p_ = model.order[0]
d_ = model.order[1]
q_ = model.order[2]
arma_order = str(tuple([p_, q_]))

# Fit the GARCH(1,1) model with a Student-t error distribution
garch_spec = rugarch.ugarchspec(
    mean_model=robjects.r(
        "list(armaOrder = c{arma_order})".format(arma_order=arma_order)),
    variance_model=robjects.r('list(garchOrder=c(1,1))'),
    distribution_model='std')

pandas2ri.activate()
r_dataframe = pandas2ri.py2ri(snp_returns_rolling)

# Train the R GARCH model on the returns
garch_fitted = rugarch.ugarchfit(garch_spec, r_dataframe, solver='hybrid')
pandas2ri.deactivate()

# Forecast the next point
fore = rugarch.ugarchforecast(garch_fitted, n_ahead=1)
forecast = np.array(fore.slots['forecast'].rx2('seriesFor')).flatten()[0]

# Store the signal: the sign of the forecast return
forecasted_returns.append({
    'date': gspc_returns.index[window_length + d].date(),
    'signal': np.sign(forecast)
})
print(gspc_returns.index[window_length + d].date())
print(arma_order, forecast)
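# A small, self-contained helper (not from the source) for scoring the collected
# signals afterwards: the fraction of days where the forecast sign matched the
# realized return sign. Assumes the dates in both inputs line up.
import numpy as np
import pandas as pd

def sign_hit_rate(signals_df, realized_returns):
    # signals_df: DataFrame with 'date' and 'signal' columns as built above;
    # realized_returns: Series of returns indexed by date.
    merged = signals_df.set_index('date').join(
        realized_returns.rename('ret'), how='inner')
    return float((np.sign(merged['ret']) == merged['signal']).mean())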
def MComBat(X, batch, ref_batch=None, covariate=None, num_covs=None, save_dir=None): # Check X if not isinstance(X, (pd.DataFrame, pd.Series)): if isinstance(X, (list, tuple, np.ndarray, Mapping)): df = pd.DataFrame(X) else: raise TypeError('X must be an array-like object, dictionary or pandas Dataframe/Series') else: df = X row_names = df.index r_df = pandas2ri.py2ri(df) # Check covariate if covariate is None: covariate = np.ones((len(batch), 1)) else: if not isinstance(covariate, (list, tuple, np.ndarray)): if isinstance(covariate, pd.DataFrame) or isinstance(covariate, pd.Series): covariate = covariate.to_numpy() else: raise TypeError('covariate array must be an array like or pandas Dataframe/Series') else: covariate = np.array(covariate) if len(covariate.shape) == 1: covariate = covariate.reshape(-1, 1) elif len(covariate.shape) > 2: raise ValueError('covariate array must be 1D or 2D') nr, nc = covariate.shape r_covariate = r.matrix(covariate, nrow=nr, ncol=nc) # Check batch if not isinstance(batch, (list, tuple, np.ndarray)): if isinstance(batch, pd.DataFrame) or isinstance(batch, pd.Series): batch = batch.to_numpy() else: raise TypeError('batch array must be an array like or pandas Dataframe/Series') else: batch = np.array(batch) if len(batch.shape) != 1: if len(batch.shape) == 2 and batch.shape[1] == 1: batch.reshape(-1) else: raise ValueError('batch array must be 1D or 2D with second dimension equal to 1') if len(np.unique(batch)) <= 1: raise ValueError('batch array must have at least 2 classes') r_batch = Vector(batch) # Check ref batch if ref_batch is None: ref_batch = np.unique(batch)[0] else: if ref_batch not in np.unique(batch): raise ValueError('ref_batch must be one of np.unique(batch) values') # Check numCovs if num_covs is None: r_numCovs = NULL else: if isinstance(num_covs, int): num_covs = [num_covs] if not isinstance(num_covs, (list, tuple, np.ndarray)): raise TypeError('num_covs must be an int or array like of int equal to the index of numerical covariates') r_numCovs = Vector(num_covs) # cwd = os.path.dirname(sys.argv[0]) cwd = os.path.dirname(os.path.abspath(__file__)) r.setwd(cwd) # r.source('./Statistical_analysis/R_scripts/MComBat.R') r.source('./R_scripts/MComBat.R') r_dr_results = r.MComBat_harmonization(r_df, r_covariate, r_batch, ref_batch, r_numCovs) R_object_dict = {} keys = r_dr_results.names for i in range(len(keys)): R_object_dict[keys[i]] = np.array(r_dr_results[i]) results = pd.DataFrame(R_object_dict) results.index = row_names if save_dir is not None: results.to_excel(os.path.join(save_dir, 'Feature_MComBat.xlsx')) return results
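# Hypothetical call to MComBat() above; assumes ./R_scripts/MComBat.R and its R
# dependencies are available. The feature-matrix orientation follows whatever the
# R script expects; the names below are illustrative.
import numpy as np
import pandas as pd

rng = np.random.RandomState(2)
features = pd.DataFrame(rng.normal(size=(20, 5)),
                        columns=['f%d' % i for i in range(5)])
scanner = ['site_a'] * 10 + ['site_b'] * 10   # one batch label per sample
# harmonized = MComBat(features, scanner, ref_batch='site_a')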
def r_cal_b(df):
    robjects.r('''
    # create a function `f` that computes quadrat statistics (variance-to-mean
    # ratio) and the F/G nearest-neighbour functions for a point pattern
    f <- function(df, verbose=FALSE) {
        if (verbose) {
            cat("I am calling f().\n")
        }
        library(spatstat)  # ppp(), owin(), quadratcount(), Fest(), Gest()
        xMin<-min(df$x)
        xMax<-max(df$x)
        yMin<-min(df$y)
        yMax<-max(df$y)
        #xy_PPP <- with(df, ppp(x, y, c(-25,25), c(-25,25)))
        xy_PPP <- with(df, ppp(x, y, c(xMin,xMax), c(yMin,yMax)))
        plot(xy_PPP)
        xy=df
        summary(xy)
        xy <- unique(xy)
        xy<-data.matrix(xy)
        # mean centre and standard distance of the pattern
        mc <- apply(xy, 2, mean)
        sd <- sqrt(sum((xy[,1] - mc[1])^2 + (xy[,2] - mc[2])^2) / nrow(xy))
        buffer_area=25*25
        dens <- nrow(xy) / buffer_area
        win<-owin(c(-25,25), c(-25,25))
        quadrat_C<-quadratcount(xy_PPP,nx=4,ny=4)
        #plot(quadrat_C)
        f<-table(quadrat_C)
        f<-data.frame(f)
        # number of quadrats (sum of the frequency table), not the number of points
        quadrats <- sum(f$Freq)
        # number of cases; as.character() is needed because as.integer() on a
        # factor would return level codes instead of the tabulated count values
        cases <- sum(as.integer(as.character(f$quadrat_C)) * f$Freq)
        mu <- cases / quadrats
        ff <- data.frame(as.integer(as.character(f$quadrat_C)),f$Freq)
        colnames(ff) <- c('K', 'X')
        ff$Kmu <- ff$K - mu
        ff$Kmu2 <- ff$Kmu^2
        ff$XKmu2 <- ff$Kmu2 * ff$X
        # variance-to-mean ratio of the quadrat counts
        s2 <- sum(ff$XKmu2) / (sum(ff$X)-1)
        VMR <- s2 / mu
        Fs<-Fest(xy_PPP)
        #plot(Fs)
        Gs<-Gest(xy_PPP)
        #plot(Gs)
        km<-Fs$km[10]
        newlist<-list(VMR,km)
        print(VMR)
        return(newlist)
        #return(VMR)
    }
    ''')
    r_f = robjects.r['f']
    pandas2ri.activate()
    r_DF = pandas2ri.py2ri(df[["x", "y"]])
    res = r_f(r_DF)
    print("+" * 50)
    print(res)
    return res
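# Hypothetical call to r_cal_b(), assuming R's spatstat package is installed:
# a random point pattern inside the same (-25, 25) square the R window uses.
import numpy as np
import pandas as pd

rng = np.random.RandomState(4)
pts = pd.DataFrame(rng.uniform(-25, 25, size=(100, 2)), columns=['x', 'y'])
# vmr_and_km = r_cal_b(pts)   # returns an R list: [VMR, F-function value]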
def VST(dataset):
    # Variance-stabilizing transformation: run R's vst() on the raw counts and
    # return the transformed table as a pandas object.
    return pandas2ri.ri2py(r.vst(pandas2ri.py2ri(dataset['rawdata'])))
def fit(self, X, y): from lnpy import linear as linear_models n_channels = self.n_channels # estimation a GAM using all dimensions is not working well; # thus we try to estimate the linear weights using a linear model # and then fit a GAM to the linear predictions for each channel from rpy2.robjects.packages import importr from rpy2.robjects import pandas2ri import rpy2.robjects as ro pandas2ri.activate() import pandas as pd try: import pandas.rpy.common as com com_available = True except BaseException: com_available = False mgcv = importr('mgcv') if self.linear_model is None: lin_model = linear_models.ARD(verbose=False) lin_model.fit(X, y) elif isinstance(self.linear_model, string_types): if self.linear_model.upper() == 'ARD': lin_model = linear_models.ARD(verbose=False) elif self.linear_model.upper() == 'RIDGE': lin_model = linear_models.Ridge(verbose=False) lin_model.fit(X, y) N = X.shape[0] w = np.copy(lin_model.get_weights()) m = w.shape[0] / n_channels chan_ind = np.reshape(np.arange(w.shape[0]), (m, n_channels), order='F') Yw_pred = np.zeros((N, n_channels)) for j in range(n_channels): # predictions for channel j Yw_pred[:, j] = np.dot(X[:, chan_ind[:, j]], w[chan_ind[:, j]]) # fit GAM YX = np.hstack((np.atleast_2d(y).T, Yw_pred)) df = pd.DataFrame(YX, columns=self.columns) if com_available: df_r = com.convert_to_r_dataframe(df) else: try: df_r = pandas2ri.py2ri(df) except BaseException: df_r = pandas2ri.pandas2ri(df) mod = self.model_string m = mgcv.gam(ro.r(mod), data=df_r, family='gaussian()', optimizer='perf') self._model = m self._linear_model = lin_model
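# A sketch (not from the source) of the matching predict step for the GAM wrapper
# above: rebuild the per-channel linear predictions with the stored linear model,
# then evaluate the fitted mgcv GAM on them. This would live as a method on the
# same class; the names mirror fit() above.
def predict(self, X):
    import numpy as np
    import pandas as pd
    import rpy2.robjects as ro
    from rpy2.robjects import pandas2ri

    w = np.copy(self._linear_model.get_weights())
    m = w.shape[0] // self.n_channels
    chan_ind = np.reshape(np.arange(w.shape[0]), (m, self.n_channels), order='F')
    Yw_pred = np.column_stack([
        np.dot(X[:, chan_ind[:, j]], w[chan_ind[:, j]])
        for j in range(self.n_channels)
    ])
    # self.columns[0] is the response name; pass zeros for it so the data.frame
    # has the same layout the GAM was fit on (R's predict() ignores the response).
    df = pd.DataFrame(np.hstack((np.zeros((X.shape[0], 1)), Yw_pred)),
                      columns=self.columns)
    pred = ro.r['predict'](self._model, newdata=pandas2ri.py2ri(df))
    return np.asarray(pred)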
nfly=0 for i,gtypeData in enumerate([dC, dEx1, dEx2]): gType = gTypes[i] for j, gtData in enumerate(gtypeData): if j>0: for k,data in enumerate(gtData): if data!='': prmValDict[dfLabels[0]].append(float(data)) else: prmValDict[dfLabels[0]].append(np.nan) prmValDict[dfLabels[1]].append(lbls[k]) prmValDict[dfLabels[2]].append(nfly) prmValDict[dfLabels[3]].append(gType) nfly+=1 df = pd.DataFrame(prmValDict, columns=dfLabels) descStats = pd.DataFrame(pandas2ri.ri2py(fsa.Summarize(statsFormula, data = pandas2ri.py2ri(df)))) descStatsLabels = list(descStats.columns) if 'nvalid' in descStatsLabels: descStatsLabels.pop(descStatsLabels.index('nvalid')) descStatsLabels.append('nvalid') descStats = descStats[descStatsLabels] print descStats ll = nparLD.f1_ld_f1(pandas2ri.py2ri(df['result']), pandas2ri.py2ri(df['label']),\ pandas2ri.py2ri(df['genotype']),\ pandas2ri.py2ri(df['flyNumber']), **{'description': 'FALSE',\ 'plot_RTE':'FALSE', 'order.warning':'FALSE', }) multiAnova = r_matrix_to_data_frame(ll.rx2('ANOVA.test'), True).round(5)
# Save as an R data frame
from rpy2.robjects import pandas2ri
from rpy2.robjects import r
#import pandas.rpy.common as com
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

df_ts_index = pd.read_pickle('DataFrameOutput_pull_ts_comments_comp.pkl')

# http://stackoverflow.com/questions/11586582/save-2d-numpy-array-to-r-file-format-using-rpy2
R_df = pandas2ri.py2ri(df_ts_index)
r.assign("GH_data_df_pull_comp", R_df)
r("save(GH_data_df_pull_comp, file='RObject_pull_comp_comments.gzip', compress=TRUE)")
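# Round-trip check (illustrative; reuses the imports above): load the saved object
# back into R and convert it to pandas again.
r("load('RObject_pull_comp_comments.gzip')")
df_back = pandas2ri.ri2py(r['GH_data_df_pull_comp'])
print(df_back.shape)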
def RunWDModel(recent):
    pandas2ri.activate()
    # R package names (note the trailing comma: a bare ('forecast') is a string
    # and would be iterated character by character)
    packnames = ('forecast',)
    have_tutorial_packages = all(rpackages.isinstalled(x) for x in packnames)
    if not have_tutorial_packages:
        # import R's utility package
        utils = rpackages.importr('utils')
        # select the first CRAN mirror in the list
        utils.chooseCRANmirror(ind=1)
        # R vector of strings
        from rpy2.robjects.vectors import StrVector
        packnames_to_install = [
            x for x in packnames if not rpackages.isinstalled(x)
        ]
        if len(packnames_to_install) > 0:
            utils.install_packages(StrVector(packnames_to_install))
    # Import R packages
    forecast = importr('forecast')
    base = importr('base')
    # Model subset of data for particular station
    jump = 3  # skip weekend
    output = pd.DataFrame(columns=['count_diff', 'DateTime', 'Type', 'ID'])
    absent = []
    errors = []
    done = False
    for x in station_range:
        SepModel = Model(recent, x)
        if SepModel.valid is False:
            absent.append(x)
            continue
        SepModel.PreProcess(separate=True)
        # Cap the weekday series at 20 days (48 half-hour slots per day)
        SepModel.WD = SepModel.WD[:20 * 48]
        if len(SepModel.WD) > 0:
            if done is False:
                # Only needs to be run once: build the forecast date index
                done = True
                WD_dates = SepModel.WD.index
                y = np.asarray(WD_dates[-1].year, dtype='datetime64[Y]') - 1970
                doy = np.asarray((WD_dates[-1].dayofyear + jump),
                                 dtype='timedelta64[D]') - 1
                new = pd.to_datetime(y + doy)
                new_dates = pd.DatetimeIndex(start=new, freq='30Min',
                                             periods=48 * 4)
            SepModel.WD.reset_index(inplace=True, drop=True)
            gc.collect()
            # Seasonal ARIMA(2,0,1)(1,1,2)[48], fit by conditional sum of squares
            robjects.r('o = c(2,0,1)')
            robjects.r('sorder = c(1,1,2)')
            robjects.r('s = list(order=sorder, period=48)')
            DF = pandas2ri.py2ri(SepModel.WD)
            robjects.r.assign('df', DF)
            try:
                robjects.r('fit = Arima(df,order=o, seasonal=s, method="CSS")')
            except Exception:
                errors.append(x)
                continue
            f_cast = robjects.r('f_cast = forecast(fit, h=4*48)')
            arima_mean = np.array(f_cast.rx('mean'))
            robjects.r('rm(list = ls(all = TRUE))')
            robjects.r('gc()')
            results = pd.DataFrame({
                'count_diff': arima_mean.flatten()
            }).round()
            results.count_diff = results.count_diff.astype(int)
            results['DateTime'] = new_dates
            results['Type'] = 'Forecast'
            results['ID'] = x
            SepModel.WD['DateTime'] = WD_dates
            SepModel.WD['Type'] = 'Historic'
            SepModel.WD['ID'] = x
            out = SepModel.WD.append(results)
            output = output.append(out)
            del f_cast
            del DF
            del SepModel
            gc.collect()
    output.ID = output.ID.astype(int)
    output.count_diff = output.count_diff.astype(int)
    output.reset_index(inplace=True, drop=True)
    path = os.path.join(wd, 'Model')
    if not os.path.exists(path):
        os.mkdir(path)
    output.to_csv(os.path.join(path, 'WDModelOutput.csv'))
    return output, absent, errors
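# A compact, reusable version of the package guard at the top of RunWDModel()
# (illustrative, not from the source):
def ensure_r_packages(names):
    import rpy2.robjects.packages as rpackages
    from rpy2.robjects.vectors import StrVector
    missing = [x for x in names if not rpackages.isinstalled(x)]
    if missing:
        utils = rpackages.importr('utils')
        utils.chooseCRANmirror(ind=1)  # first CRAN mirror in the list
        utils.install_packages(StrVector(missing))

# ensure_r_packages(['forecast'])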
def main():
    ''' main '''
    # Command Line Stuff...
    myCommandLine = CommandLine()
    outdir = myCommandLine.args['outDir']
    group1 = myCommandLine.args['group1']
    group2 = myCommandLine.args['group2']
    batch = myCommandLine.args['batch']
    matrix = myCommandLine.args['matrix']
    prefix = myCommandLine.args['prefix']
    formula = myCommandLine.args['formula']

    # make the quant DF
    quantDF = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)
    #print(df.head())

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t", index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    if "batch" in list(formulaDF):
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")
    #print(sampleTable)

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods = importr('methods')
    deseq = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman = importr('qqman')

    dds = deseq.DESeqDataSetFromMatrix(countData=df, colData=sampleTable,
                                       design=design)
    dds = deseq.DESeq(dds)
    cont = robjects.r["grep"]("condition", robjects.r['resultsNames'](dds),
                              value=True)
    #print(cont)

    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf = robjects.r['as.data.frame'](res)
    R.assign('res', res)
    reslfc = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA = robjects.r['plotMA']
    plotDisp = robjects.r['plotDispEsts']
    plotPCA = robjects.r['plotPCA']
    plotQQ = robjects.r['qq']
    vsd = robjects.r['vst'](dds, blind=robjects.r['F'])

    # get pca data
    if "batch" in list(formulaDF):
        pcaData = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")),
                          returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    else:
        print(vsd)
        pcaData = plotPCA(vsd, intgroup="condition", returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")

    # arrange
    grdevices.pdf(file="./%s/%s_QCplots_%s_v_%s.pdf" % (outdir, prefix, group1, group2))

    x = "PC1: %d%% variance" % int(percentVar[0] * 100)
    y = "PC2: %d%% variance" % int(percentVar[1] * 100)

    if "batch" in list(formulaDF):
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()
    else:
        pp = ggplot2.ggplot(pcaData) + \
            ggplot2.aes_string(x="PC1", y="PC2", color="condition") + \
            ggplot2.geom_point(size=3) + \
            robjects.r['xlab'](x) + \
            robjects.r['ylab'](y) + \
            ggplot2.theme_classic() + \
            ggplot2.coord_fixed()
        pp.plot()

    plotMA(res, ylim=robjects.IntVector((-3, 3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3, 3)), main="MA-plot LFC shrinkage")
    plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
    plotQQ(reslfc.rx2('pvalue'), main="LFC shrinkage pvalue QQ")

    hh = ggplot2.ggplot(resdf) + \
        ggplot2.aes_string(x="pvalue") + \
        ggplot2.geom_histogram() + \
        ggplot2.theme_classic()
    hh.plot()
    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()

    lfcOut = "./%s/%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (outdir, prefix, group1, group2)
    resOut = "./%s/%s_%s_v_%s_deseq2_results.tsv" % (outdir, prefix, group1, group2)
    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
def fit_model_death_rates(df): '''Fit model for the death rates and extract coefficients. Parameters ---------- df: dataframe Contains columns of the log of death rates, sdi, age_group_id, region_id, year_id Returns ------- feff_df: dataframe Contains the draws of coefficients of fixed effects. reff_df: dataframe Contains the draws of coefficients of random effects. ''' rpy2.robjects.globalenv['data'] = pandas2ri.py2ri(df) # Fit linear mixed-effects model in R, fixed effects on sdi, random effects # on age_group_id and region_id. rpy2.robjects.r(''' model = lmer(log_death_rate ~ 1 + sdi + (1|age_group_id) + (1|region_id), data, REML=F) capture.output(summary(model), file = paste("/ihme/forecasting/data/disaster/outputs/disaser_model.txt", sep="")) feff_means = fixef(model) reff_means = ranef(model) feff_var = attr(vcov(model), "x") reff_var = VarCorr(model) ''') # Extract the coefficients of fixed effects of the model. feffects = ['sdi'] param_list = ['intercept'] + feffects fixed_means = pandas2ri.ri2py(rpy2.robjects.globalenv['feff_means']) fixed_var = np.reshape(np.array(rpy2.robjects.globalenv['feff_var']), (len(param_list), len(param_list))) # Draw random samples from a multivariate normal distribution. feff_df = pd.DataFrame(np.random.multivariate_normal( \ fixed_means, fixed_var, 1000), \ index=np.arange(1000), \ columns=[param_list]) # Extract the coefficients of random effects of the model reffects = ['age_group_id', 'region_id'] reff_means = {} reff_vars = {} reff_df = {} for r in reffects: rpy2.robjects.r(''' reff_means_df = data.frame(reff_means${r}) reff_means_df['{r}_label'] = row.names(reff_means_df) '''.format(r=r)) reff_means[r] = pandas2ri.ri2py(rpy2.robjects.r['reff_means_df']) reff_means[r] = reff_means[r].rename(columns={'X.Intercept.': \ 'mean_{}'.format(r)}) reff_vars[r] = np.array(rpy2.robjects \ .r('attr(reff_var${r}, "stddev")'.format(r=r)))[0] reff_means[r]['se_{}'.format(r)] = reff_vars[r] # Draw random samples from a normal distribution. reff_df[r] = pd.DataFrame(np.random.normal( reff_means[r]['mean_{}'.format(r)], reff_means[r]['se_{}'.format(r)], size=(1000, len(reff_means[r]))), columns=reff_means[r]['{}_label'.format(r)]) \ .transpose() \ .reset_index() return feff_df, reff_df
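# How the coefficient draws are typically combined downstream (a sketch, not from
# the source): the model above is fit on log rates, so one draw of the predicted
# rate is exp(intercept + sdi coefficient * sdi + age intercept + region intercept).
import numpy as np

def predicted_death_rate(intercept, sdi_coef, sdi, age_intercept, region_intercept):
    return np.exp(intercept + sdi_coef * sdi + age_intercept + region_intercept)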
def applymem(df): rdf = pandas2ri.py2ri(df) seasons = sort(list(df.columns.drop(['UF', 'isoweek'])))[:-1] # Discard 2009 season if present: seasons = sorted(set(seasons).difference(['SRAG2009'])) rseasons = ro.StrVector(seasons) ro.globalenv['df'] = rdf ro.globalenv['seasons'] = rseasons # # Method for obtaining typical time series evolution (default 2) # ro.globalenv['par.type.curve'] = 2 # # Method for obtaining pre/post-epidemic threshold (default 4) # ro.globalenv['par.type.threshold'] = 2 # # Method for obtaining intensity thresholds (default 4) # ro.globalenv['par.type.intensity'] = 2 # # Method for obtaining outbreak start and length (default 6) # ro.globalenv['par.type.other'] = 2 # # Total number of points to obtain pre/post-threshold (will take n/seasons from each) # ro.globalenv['par.n.max'] = 30 # # Confidence interval for modelled curve # ro.globalenv['par.level.curve'] = 0.90 # # Confidence interval for pre/post-thresold # ro.globalenv['par.level.threshold'] = 0.95 # # Quantiles for intensity thresholds # ro.globalenv['par.level.intensity'] = ro.FloatVector([0.40, 0.90, 0.975]) # # epimemrslt = ro.r('memmodel(i.data=subset(df, select=seasons), i.type.curve=par.type.curve,' + # 'i.type.threshold=par.type.threshold, i.type.intensity=par.type.intensity,' + # 'i.type.other=par.type.other, i.n.max=par.n.max, i.level.curve=par.level.curve,' + # 'i.level.threshold=par.level.threshold, i.level.intensity=par.level.intensity)') ro.globalenv['df'] = rdf ro.globalenv['seasons'] = rseasons ro.globalenv['par.type.curve'] = 2 ro.globalenv['par.n.max'] = 20 ro.globalenv['par.level.curve'] = 0.90 ro.globalenv['par.level.threshold'] = 0.90 epimemrslt = ro.r( 'memmodel(i.data=subset(df, select=seasons), i.type.curve=par.type.curve,' + 'i.n.max=par.n.max, i.level.curve=par.level.curve, i.level.threshold=par.level.threshold)' ) # Pre-epidemic threshold: epithreshold = pandas2ri.ri2py_dataframe( epimemrslt.rx2('pre.post.intervals')).loc[0, 2] typrealcurve = pandas2ri.ri2py_dataframe(epimemrslt.rx2('typ.real.curve')) # Check for seasons below threshold: dropseasons = set() for s in seasons: if df[s].max() < epithreshold: dropseasons.add(s) # Drop seasons below threshold and rerun algorithm: episeasons = list(seasons) if len(dropseasons) > 0 and len(dropseasons) < len(seasons): episeasons = sorted(list(set(seasons).difference(dropseasons))) ro.globalenv['episeasons'] = ro.StrVector(episeasons) # epimemrslt = ro.r('memmodel(i.data=subset(df, select=episeasons), i.type.curve=par.type.curve,' + # 'i.type.threshold=par.type.threshold, i.type.intensity=par.type.intensity,' + # 'i.type.other=par.type.other, i.n.max=par.n.max, i.level.curve=par.level.curve,' + # 'i.level.threshold=par.level.threshold, i.level.intensity=par.level.intensity)') epimemrslt = ro.r( 'memmodel(i.data=subset(df, select=episeasons), i.type.curve=par.type.curve,' + 'i.n.max=par.n.max, i.level.curve=par.level.curve, i.level.threshold=par.level.threshold)' ) # Store results in python dictionary of objects pyepimemrslt = {} rovector = [ ro.vectors.StrVector, ro.vectors.IntVector, ro.vectors.FloatVector, ro.vectors.Vector ] for name in epimemrslt.names: rdata = epimemrslt.rx2(name) if name == 'call': pyepimemrslt.update({name: str(rdata)}) elif type(rdata) in rovector: pyepimemrslt.update({name: pandas2ri.ri2py_vector(rdata)}) else: pyepimemrslt.update({name: pandas2ri.ri2py_dataframe(rdata)}) # typ.curve is the typical curve obtained from averaging over epimemic seasons with time rescaled # so that the start of the epidemic period 
coincides with mean.start pyepimemrslt['typ.curve'].rename(columns={ 0: 'baixo', 1: 'mediano', 2: 'alto' }, inplace=True) pyepimemrslt['typ.curve']['mediano'].fillna(0, inplace=True) pyepimemrslt['typ.curve']['baixo'] = pyepimemrslt['typ.curve'][ 'baixo'].where(pyepimemrslt['typ.curve']['baixo'] >= 0, other=0) pyepimemrslt['typ.curve']['baixo'] = pyepimemrslt['typ.curve']['baixo'].\ where( (-pyepimemrslt['typ.curve']['baixo'].isnull()), other=pyepimemrslt['typ.curve']['mediano']) pyepimemrslt['typ.curve']['alto'] = pyepimemrslt['typ.curve']['alto'].\ where((-pyepimemrslt['typ.curve']['alto'].isnull()), other=pyepimemrslt['typ.curve']['mediano']) pyepimemrslt['pre.post.intervals'].rename(index={ 0: 'pre', 1: 'post' }, inplace=True) # typ.real.curve is the typical curve without time shift, that is, respecting the original weeks from data # this curve is better to keep all seasons, not only the epidemic ones. pyepimemrslt['typ.real.curve'] = typrealcurve.copy() pyepimemrslt['typ.real.curve'].rename(columns={ 0: 'baixo', 1: 'mediano', 2: 'alto' }, inplace=True) pyepimemrslt['typ.real.curve']['mediano'].fillna(0, inplace=True) pyepimemrslt['typ.real.curve']['baixo'] = pyepimemrslt['typ.real.curve']['baixo'].\ where(pyepimemrslt['typ.real.curve']['baixo']>=0, other=0) pyepimemrslt['typ.real.curve']['baixo'] = pyepimemrslt['typ.real.curve']['baixo'].\ where( (-pyepimemrslt['typ.real.curve']['baixo'].isnull()), other=pyepimemrslt['typ.real.curve']['mediano']) pyepimemrslt['typ.real.curve']['alto'] = pyepimemrslt['typ.real.curve']['alto'].\ where((-pyepimemrslt['typ.real.curve']['alto'].isnull()), other=pyepimemrslt['typ.real.curve']['mediano']) newcols = {} for k, v in enumerate(episeasons): newcols[k] = str(v) + ' transladado' pyepimemrslt['moving.epidemics'].rename(columns=newcols, inplace=True) return pyepimemrslt
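# Hypothetical input for applymem() above: one column per season plus the 'UF' and
# 'isoweek' identifier columns (the last season in sorted order is dropped inside
# applymem, and 'SRAG2009' is discarded if present). Values are illustrative.
import numpy as np
import pandas as pd

rng = np.random.RandomState(3)
weekly = pd.DataFrame({'UF': ['SP'] * 52, 'isoweek': range(1, 53)})
for season in ['SRAG2013', 'SRAG2014', 'SRAG2015', 'SRAG2016']:
    weekly[season] = rng.poisson(10, size=52)
# mem = applymem(weekly)
# mem['pre.post.intervals'].loc['pre', 2]   # pre-epidemic threshold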
def quantile(dataset):
    # Quantile normalization: run the R quantile routine on the raw data and index
    # the converted result by gene symbol.
    return pandas2ri.ri2py(r.quantile(pandas2ri.py2ri(
        dataset['rawdata']))).set_index('gene_symbol')
counts_mat /= 1000000 counts_mat = counts_mat.round() counts_mat = counts_mat.astype(int) print(counts_mat.shape) print("breaking up counts mat...") n_chunks = 10 step_size = counts_mat.shape[0] // n_chunks for i in range(n_chunks): start_idx = i * step_size end_idx = (i + 1) * step_size if i == n_chunks - 1: end_idx = counts_mat.shape[0] chunk = counts_mat[start_idx:end_idx] print("chunk {}, {}".format(i, chunk.shape)) print("converting to r objects") chunk = pandas2ri.py2ri(chunk) print("converted") r_object_name = "chunk_{}.mat".format(i) r.assign(r_object_name, chunk) print("assigned") r("save({}, file='counts_{}_mat.gzip', compress=TRUE)".format( r_object_name, i)) print("saved") print() # new_store = pd.HDFStore('selected_counts.h5') # new_store['counts'] = counts_mat # new_store['accessions'] = accessions # new_store['gene_symbols'] = gene_symbols # new_store['labels'] = labels # new_store.close()
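# Reassembling the chunks later (illustrative): each .gzip file holds one R object
# named 'chunk_<i>.mat', so load all ten and rbind them back together.
from rpy2.robjects import r
r("chunks <- lapply(0:9, function(i) {"
  " load(sprintf('counts_%d_mat.gzip', i));"
  " get(sprintf('chunk_%d.mat', i)) })")
r("counts <- do.call(rbind, chunks)")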
def gc_coverage_plot(contigs_file, contig_depth_table=False, samtool_depth_file=False, blast_file=False, column1=1, column2=2, main=False, highlight=False, taxonomy_file=False, output_prefix=False): if output_prefix[-1] != '/': output_prefix += '/' print("output_prefix", output_prefix) import os import shell_command import rpy2.robjects as robjects import rpy2.robjects.numpy2ri from pandas import DataFrame import pandas import rpy2 from rpy2.robjects import r from rpy2.robjects import pandas2ri pandas2ri.activate() if not main: main = os.path.basename(contigs_file) out, err, code = shell_command.shell_command( "infoseq -auto -only -Name -length -pgc %s > /tmp/gc.tab" % contigs_file) #print (out) #print (err) #print (code) if contig_depth_table: contig_depth = pandas.read_csv(contig_depth_table, sep='\t', names=["contig", "depth"]) #contig_depth = DataFrame(contig_depth, columns=['contig', 'depth']) #print (type(contig_depth["contig"])) #print (type(contig_depth)) robjects.r.assign('contigs_depth', pandas2ri.py2ri(contig_depth)) if taxonomy_file: with open(taxonomy_file, 'r') as f: contigs2taxon2count = {} for row in f: data = row.rstrip().split() contig = data[0] taxon = data[1] if contig not in contigs2taxon2count: contigs2taxon2count[contig] = {} contigs2taxon2count[contig][taxon] = 1 else: if taxon in contigs2taxon2count[contig]: contigs2taxon2count[contig][taxon] += 1 else: contigs2taxon2count[contig][taxon] = 1 contig2label = [] for contig in contigs2taxon2count: if len(contigs2taxon2count[contig]) > 1: # more than one taxon label = '' for taxon in contigs2taxon2count[contig]: label += '%s (%s) /' % (taxon, contigs2taxon2count[contig][taxon]) label = label[0:-2] else: label = list(contigs2taxon2count[contig].keys())[0] contig2label.append([contig, label]) label2freq = {} for contig in contig2label: if contig[1] not in label2freq: label2freq[contig[1]] = 1 else: label2freq[contig[1]] += 1 for contig in contig2label: if label2freq[contig[1]] <= 2: contig[1] = 'rare_taxon' df = DataFrame(contig2label, columns=['contig', 'label']) print(type(df["contig"])) print(type(df)) #m = m.astype(float) robjects.r.assign('contig_labels', pandas2ri.py2ri(df)) else: robjects.r.assign('contig_labels', False) if highlight: highlight_code = """ gc_coverage_table$color <- rep(rgb(1, 0, 0,0.5), length(gc_coverage_table[,1])) highlight_table <- read.table("%s", header=FALSE) m <- match(highlight_table[,1], gc_coverage_table$Name) gc_coverage_subset <- gc_coverage_table[m,] print("subset") print(m) gc_coverage_table[m,]$color<-rgb(0, 0, 1,0.5) """ % highlight highlight_code2 = """ m <- match(highlight_table[,1], gc_coverage_table_2m$Name) #print("subset m2") #print(m) gc_coverage_subset2 <- gc_coverage_table_2m[m,] """ else: highlight_code = '' highlight_code2 = '' if not blast_file: robjects.r(""" #library(Cairo) library(R.utils) library(ggplot2) if (exists("contigs_depth")==FALSE){ if (isGzipped("%s")){ #print('Gzipped file') all_depth <- read.table(gzfile('%s'), header=FALSE) }else{ #print('Not Gzipped') all_depth <- read.table('%s', header=FALSE) } contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median) colnames(contigs_depth) <- c('contig', 'depth') } #print(contigs_depth) #print(contig_labels) contigs_gc <- read.table("/tmp/gc.tab", header=TRUE) gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$contig),2]) #w<-which(gc_coverage_table$Length >=1000) #gc_coverage_table <- gc_coverage_table[w,] cov_biggest <- 
gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4] #print('cov biggest:') #print(cov_biggest) w <- which(gc_coverage_table[,4]< (4*cov_biggest)) gc_coverage_table_2m <- gc_coverage_table[w,] if (contig_labels != FALSE) { library(RColorBrewer) color_palette <- c('red', 'blue','green', brewer.pal(12,"Paired"), brewer.pal(12,"Set3")) m <- match(contig_labels$contig, gc_coverage_table$Name) gc_coverage_table$color <- rep("Unclassified", length(gc_coverage_table[,1])) gc_coverage_table$contig_alpha <- rep(0.5, length(gc_coverage_table[,1])) gc_coverage_table$color[m] <- as.character(contig_labels$label) gc_coverage_table$contig_alpha[m] <- rep(1,length(contig_labels$label)) w<-which(gc_coverage_table$Length >=1000) gc_coverage_table <- gc_coverage_table[w,] #w2 <- which(gc_coverage_table$color != "Chlamydiae") #gc_coverage_table$contig_alpha[w2] <- 0.7 svg("%sgc_cov_buble_test.svg", width = 12, height = 12) p6 <- ggplot(gc_coverage_table, aes(x = X.GC, y = coverage, size = Length, fill = color, colour = color, alpha = contig_alpha)) + geom_point(shape = 21) + ggtitle("Scaffold GC vs Depth") + labs(x = "GC (%%)", y = "Sequencing depth") + scale_size(range = c(1, 10)) p6 <- p6 + scale_fill_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])+ guides(color = guide_legend(override.aes = list(size=5))) p6 <- p6 + scale_colour_manual(values=color_palette[0:length(unique(gc_coverage_table$color))]) #print (max(gc_coverage_table$Length)) p6 <- p6 + scale_alpha_continuous(range=c(0.1, 1), limits=c(0.1,1)) #+ scale_alpha_continuous(range=c(0, max(gc_coverage_table$Length)), limits=c(0,max(gc_coverage_table$Length))) print(p6 + theme_bw()) dev.off() gc_coverage_table_2m$color <- rep("Unclassified", length(gc_coverage_table_2m[,1])) gc_coverage_table_2m$contig_alpha <- rep(0.5, length(gc_coverage_table_2m[,1])) gc_coverage_table_2m$color[m] <- as.character(contig_labels$label) gc_coverage_table_2m$contig_alpha[m] <- rep(1,length(contig_labels$label)) svg("%sgc_cov_buble_test_2m.svg", width = 12, height = 12) p6 <- ggplot(gc_coverage_table_2m, aes(x = X.GC, y = coverage, size = Length, fill = color, colour = color, alpha = contig_alpha)) + geom_point(shape = 21) + ggtitle("Scaffold GC vs Depth") + labs(x = "GC (%%)", y = "Sequencing depth") + scale_size(range = c(1, 10)) p6 <- p6 + scale_fill_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])+ guides(color = guide_legend(override.aes = list(size=5))) p6 <- p6 + scale_colour_manual(values=color_palette[0:length(unique(gc_coverage_table$color))]) #print (max(gc_coverage_table$Length)) p6 <- p6 + scale_alpha_continuous(range=c(0.1, 1), limits=c(0.1,1)) #+ scale_alpha_continuous(range=c(0, max(gc_coverage_table$Length)), limits=c(0,max(gc_coverage_table$Length))) print(p6 + theme_bw()) dev.off() }else{ #print('NO contig_labels') } write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F) %s svg("%sgc_cov_buble.svg", width = 12, height = 12) symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth") if (any("gc_coverage_subset" %%in%% ls())) { symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T, bg=gc_coverage_table$color, fg=gc_coverage_table$color, add = TRUE) l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset$Name) text(x=gc_coverage_subset[,3], 
y=gc_coverage_subset[,4], labels = l) }else{ print ('a') } dev.off() %s svg("%sgc_cov_buble_2m.svg", width = 12, height = 12) symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth") if (any("gc_coverage_subset" %%in%% ls())) { symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=T, bg=gc_coverage_table_2m$color, fg=gc_coverage_table_2m$color, add = TRUE) l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset2$Name) text(x=gc_coverage_subset2[,3], y=gc_coverage_subset2[,4], labels = l) }else{ print ('a') } dev.off() """ % (samtool_depth_file, samtool_depth_file, samtool_depth_file, output_prefix, output_prefix, highlight_code, output_prefix, main, highlight_code2, output_prefix, main)) else: robjects.r(""" #library(Cairo) library(R.utils) if (isGzipped("%s")){ #print('Gzipped file') all_depth <- read.table(gzfile('%s'), header=FALSE) }else{ #print('Not Gzipped') all_depth <- read.table('%s', header=FALSE) } blast_file <- read.table("%s", header=FALSE, sep="\t")[,c(2,6)] contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median) contigs_gc <- read.table("/tmp/gc.tab", header=TRUE) gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$V1),2]) #w<-which(gc_coverage_table$Length >=1000) #gc_coverage_table <- gc_coverage_table[w,] gc_coverage_table$taxon <- blast_file[,2][match(gc_coverage_table$Name, blast_file[,1])] #print (is.na(gc_coverage_table$taxon)) gc_coverage_table$taxon <- as.character(gc_coverage_table$taxon) gc_coverage_table$taxon[is.na(gc_coverage_table$taxon)] <- 'undefined' gc_coverage_table$taxon <- as.factor(gc_coverage_table$taxon) write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F) svg("gc_cov_buble.svg", width = 12, height = 12,) symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=F, bg=gc_coverage_table$taxon, fg=gc_coverage_table$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth") dev.off() cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4] #print('cov biggest:') #print(cov_biggest) w <- which(gc_coverage_table[,4]< (4*cov_biggest)) gc_coverage_table_2m <- gc_coverage_table[w,] svg("gc_cov_buble_2m.svg", width = 12, height = 12,) symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2], inches=1/3, ann=F, bg=gc_coverage_table$taxon, fg=gc_coverage_table$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth") dev.off() """ % (samtool_depth_file, samtool_depth_file, samtool_depth_file, blast_file, main, main))
def main(group1=None, group2=None, outDir=None, inDir=None, formula=None): ''' main ''' R.assign('inDir', inDir) R.assign('outdir', outDir) R.assign('group1', group1) R.assign('group2', group2) print("Running DeSeq2....") print(group1 + " vs " + group2) # import from rpy2.robjects.packages import importr #kallisto processing libraries tximportData = importr('tximportData') tximport = importr('tximport') ensembldb = importr('ensembldb') EnsDb_Hsapiens_v86 = importr('EnsDb.Hsapiens.v86') #deseq methods = importr('methods') deseq = importr('DESeq2') #transcripts to gene, used in tximport R('edb <- EnsDb.Hsapiens.v86') R('tx2gene = transcripts(edb , columns=c("tx_id", "gene_name"),return.type="DataFrame")' ) # import formula formulaDF = pd.read_csv(formula, header=0, sep="\t") samples = formulaDF.samples.tolist() R.assign('samples', samples) sampleTable = pandas2ri.py2ri(formulaDF) R.assign('sampleTable', sampleTable) #locate kallisto files #would be faster to use kallito abundance.h5 files R('files <- file.path(inDir, samples, "abundance.tsv")') R('all(file.exists(files))') #tximport conversion to gene R('txi.kallisto <- tximport(files, type = "kallisto",tx2gene = tx2gene, txOut = FALSE,ignoreTxVersion=TRUE)' ) R('rownames(sampleTable) <- samples') #DESeq R('dds <- DESeqDataSetFromTximport(txi.kallisto, sampleTable, ~condition)') # R('colData(dds)$condition<-factor(colData(dds)$condition, levels=c(group1,group2))') R('dds_<-DESeq(dds)') R('res<-results(dds_)') R('res<-res[order(res$padj),]') # writing deseq2 results to a file Out = os.path.join(outDir, "%s_v_%s_deseq2_results.csv" % (group1, group2)) R.assign('Out', Out) R('write.csv(as.data.frame(res),file=Out)')
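# The formula/design file read by both DESeq2 drivers above is tab-separated with
# a 'samples' column (the kallisto output folder names) and a 'condition' column;
# illustrative contents:
import pandas as pd

design = pd.DataFrame({'samples': ['s1', 's2', 's3', 's4'],
                       'condition': ['tumor', 'tumor', 'normal', 'normal']})
design.to_csv('design.tsv', sep='\t', index=False)
# main(group1='tumor', group2='normal', outDir='results',
#      inDir='kallisto_out', formula='design.tsv')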
split = 95

# Show the outlier
fig, ax1 = plt.subplots()
plt.plot(data['Adj Close'], 'k', linewidth=0.5)
plt.title(f'{Ticker} Stock Price')
ax1.axvspan(data.index[-split] - timedelta(days=+5), data.index[-1],
            alpha=0.2, color='red')
plt.show()

# This is where the break is
print(data.index[-84])
#plt.plot(data[:-75]);
plt.plot(data)

# ModelR0 is fit on the data before the outlier
datatoR0 = data['Adj Close'][:-84]
r_dataframe = pandas2ri.py2ri(datatoR0)
modelR0 = autoarima(r_dataframe)
print(coef(modelR0))

# ModelR1 is fit on the full series, including the outlier period
datatoR1 = data['Adj Close']
r_dataframe = pandas2ri.py2ri(datatoR1)
modelR1 = autoarima(r_dataframe)
print(coef(modelR1))

SIMLEN = 30
SIMTIM = 4000
# Generate one set of simulations with normal returns and one with out-of-sample
# returns; the total is the series before the split.
montecarlo0 = np.transpose([
    np.array(asnumeric(simulate(modelR0, nsim=SIMLEN)))
    for i in range(SIMTIM)
])
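# Summarizing the simulated paths (a sketch, not from the source): montecarlo0 has
# shape (SIMLEN, SIMTIM), one column per simulated path of returns.
import numpy as np

paths = np.asarray(montecarlo0)
terminal = paths[-1]                          # simulated return at the horizon
print(np.percentile(terminal, [5, 50, 95]))   # fan-chart style summary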