Example #1
    def fit(self, indep_vars, dep_var):
        ro.globalenv['train'] = pandas2ri.py2ri(indep_vars)
        ro.globalenv[dep_var.name] = pandas2ri.py2ri(dep_var)

        # Builds the parameters string
        param = utils.build_R_parameters(self.param)

        # In order to support neural networks from different packages it
        # was necessary to wrap their respective methods for the "fit" concept
        if self.param.get("algorithm") == "rprop+":
            formula = dep_var.name+"~"+"+".join(indep_vars.columns.tolist())
            ro.r("formula <- as.formula(%s)" % formula)
            return self.RReturn(
                ro.r("neuralnet(formula, data=train,%s)" % param),
                self.param.get("algorithm")
            )
        elif self.param.get("method") == "ADAPTgdwm":
            ro.r("fit <- newff(%s)" % param)
            return self.RReturn(
                ro.r(
                    "fit <- train(fit, train, %s, error.criterium='LMS',\
                    report=TRUE, show.step=1000, n.shows=100)" % dep_var.name
                ),
                self.param.get("method")
            )
    def predict(self,newX):
        if self.model is None:
            print('model must first be fitted')
            return None
        if not isinstance(newX, pandas.DataFrame):
            newX=pandas.DataFrame(newX,columns=['V%d'%i for i in range(newX.shape[1])])

        if self.modeltype=='poisson':
            robjects.globalenv['newX']=pandas2ri.py2ri(newX)
            robjects.r('newX=data.matrix(newX)')
            if self.lambda_preset is not None:
                # heuristic for whether we are using zipath()
                robjects.r('pred=predict(fit,newX)')
                pred=robjects.r('pred').squeeze()
            else:
                pred=mpath.predict_glmreg(self.model[self.model.names.index('fit')],
                                base.as_symbol('newX'),
                                which=self.lambda_which)
        elif self.modeltype=='ZINB' or self.modeltype=='ZIpoisson' :
            robjects.globalenv['newX']=pandas2ri.py2ri(newX)
            #robjects.r('newX=data.matrix(newX)')
            if self.lambda_preset is not None:
                # heuristic for whether we are using zipath()
                robjects.r('pred=predict(fit,newX)')
            else:
                robjects.r('pred=predict(fit$fit,newX,which=fit$lambda.which)')
            pred=robjects.r('pred').squeeze()


        return numpy.array(pred)
Example #3
    def plotPairwiseCorrelations(self, outfile, subset=False):
        ''' use the R base pairs function to plot all pairwise
        correlations between the samples

        subset will randomly subset n rows to speed up plotting'''

        plotGGpairs = R('''
        function(df){

        write.table(df, file="%(outfile)s.tsv", sep="\t")

        colnames(df) <- gsub("-", "_", colnames(df))

        width <- height <-  length(colnames(df)) * 100

        png("%(outfile)s", width=width, height=height, units = "px")

        panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
          usr <- par("usr"); on.exit(par(usr))
          par(usr = c(0, 1, 0, 1))
          r <- abs(cor(x, y))
          txt <- format(c(r, 0.123456789), digits = digits)[1]
          txt <- paste0(prefix, txt)
          if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
          text(0.5, 0.5, txt, cex = cex.cor * r * 50)}

        panel.hist = function (x, ...) {
          par(new = TRUE)
          hist(x,
               breaks=30,
               col = "light blue",
               probability = TRUE,
               axes = FALSE,
               main = "")
          rug(x)}

        pairs(df, pch=20, cex=0.1,
              lower.panel = panel.smooth, upper.panel = panel.cor,
              diag.panel=panel.hist)

        dev.off()
        }''' % locals())

        if subset:
            if len(self.table.index) > subset:
                rows = random.sample(self.table.index, subset)
                r_counts = pandas2ri.py2ri(self.table.loc[rows])
            else:
                r_counts = pandas2ri.py2ri(self.table)
        else:
            r_counts = pandas2ri.py2ri(self.table)

        plotGGpairs(r_counts)
def SCCA_r(X,Y, n_components, pen):
	df_X = pd.DataFrame(X)
	df_Y = pd.DataFrame(Y)
	rmat_X = pandas2ri.py2ri(df_X)
	rmat_Y = pandas2ri.py2ri(df_Y)
	ri.globalenv['X'] = rmat_X
	ri.globalenv['Y'] = rmat_Y

	out = PMA.CCA(x=rmat_X, z=rmat_Y, K=n_components, niter=100, standardize=False, penaltyx=pen[0], penaltyz=pen[1])
	df_u = pandas2ri.ri2py(out[1])
	df_v = pandas2ri.ri2py(out[2])
	cors = pandas2ri.ri2py(out[15])
	
	loadings = (np.asmatrix(df_u), np.asmatrix(df_v))
	return loadings, cors
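
A minimal usage sketch for SCCA_r above. It assumes the PMA R package is installed, that `PMA` has been imported via rpy2's `importr` (the snippet above omits its imports), and that pandas2ri conversion is active; the input matrices below are made up.

import numpy as np
from rpy2.robjects.packages import importr

PMA = importr('PMA')  # assumed import; SCCA_r above refers to this name

X = np.random.rand(50, 20)   # 50 samples x 20 features, view 1
Y = np.random.rand(50, 10)   # 50 samples x 10 features, view 2
loadings, cors = SCCA_r(X, Y, n_components=2, pen=(0.3, 0.3))
print(loadings[0].shape, loadings[1].shape, cors)
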
Example #5
def RDCC(data):
    cols = data.columns
    data = pandas2ri.py2ri(data)
    rpy2.robjects.globalenv["data"] = data
    armaspec = (1, 1, 1, 1, 1, 1)

    rscript = """
                suppressMessages(library(rugarch))
                suppressMessages(library(rmgarch))
                ###data <- matrix(rnorm(2200),200,11)
                spec <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(%s, %s),submodel = NULL,
                                                         external.regressors = NULL, variance.targeting = FALSE),
                                   mean.model = list(armaOrder = c(%s, %s), external.regressors = NULL,
                                                     distribution.model = "norm", start.pars = list(), fixed.pars = list()))
                dccspec<-dccspec(uspec=multispec(replicate(11,spec)),dccOrder = c(%s,%s),distribution="mvnorm")
                dccgarch<-dccfit(dccspec,data = data)
                dccsimdata<-dccsim(dccgarch,n.sim=1000)
                dccgarch
                fitted(dccsimdata)
                 """ % (
        armaspec[0],
        armaspec[1],
        armaspec[2],
        armaspec[3],
        armaspec[4],
        armaspec[5],
    )

    print(rscript)
    b = r(rscript)
    b = pd.DataFrame(b)
    b.plot()
    # plt.show()
    return b
Example #6
def computeMnnBatchCorrection(counts):
    """Computes batch correction to a list of batches (data frames)
    where each data frame represents a batch (animal for instance).
    The batch correction is computed using Scran::mnnCorrect()
    from Marioni et al.
    :param counts: a list of matrices of counts
    :return returns a list of batch corrected matrices of counts
    """
    pandas2ri.activate()
    as_matrix = r["as.matrix"]
    meta = [(x.index,x.columns) for x in counts]
    r_counts = [as_matrix(pandas2ri.py2ri(x)) for x in counts]
    RimportLibrary("scran")
    r_call = """
        function(counts) {
           norm_counts = do.call(mnnCorrect, c(counts, cos.norm.out=FALSE));
           return(lapply(norm_counts$corrected, as.data.frame))
        }
    """
    r_func = r(r_call)
    norm_counts = list()
    for i,x in enumerate(r_func(r_counts)):
        norm_c = pandas2ri.ri2py(x)
        norm_c.index = meta[i][0]
        norm_c.columns = meta[i][1]
        norm_counts.append(norm_c)
    pandas2ri.deactivate()
    return norm_counts
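
A small usage sketch for computeMnnBatchCorrection above, assuming scran is installed and that the RimportLibrary helper and rpy2's r object used in the snippet are already in scope; the gene names and batch sizes are invented.

import numpy as np
import pandas as pd

genes = ['gene_%d' % i for i in range(100)]
batch_a = pd.DataFrame(np.random.poisson(5, (100, 30)), index=genes)
batch_b = pd.DataFrame(np.random.poisson(5, (100, 25)), index=genes)

corrected = computeMnnBatchCorrection([batch_a, batch_b])
print([df.shape for df in corrected])  # one corrected frame per input batch
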
Example #7
def computeSumFactors(counts, scran_clusters=True):
    """ Compute normalization factors
    using the deconvolution method
    described in Marioni et al.
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors as a vector
    """
    n_cells = len(counts.columns)
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    if scran_clusters and n_cells >= 50:
        r_clusters = scran.quickCluster(as_matrix(r_counts),
                                        min(n_cells/10, 10),
                                        method="igraph")
        min_cluster_size = min(Counter(r_clusters).values())
        sizes = list(range(min(int(min_cluster_size/4), 10), 
                           min(int(min_cluster_size/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts), 
                                      clusters=r_clusters, sizes=sizes)
    else:
        sizes = list(range(min(int(n_cells/4), 10), 
                           min(int(n_cells/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts), sizes=sizes)        
    pandas_sf = pandas2ri.ri2py(dds)
    pandas2ri.deactivate()
    return pandas_sf
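
A usage sketch for computeSumFactors above (scran and the helpers used in the snippet are assumed to be available); the toy matrix has genes as rows and cells as columns, matching the docstring.

import numpy as np
import pandas as pd

counts = pd.DataFrame(np.random.negative_binomial(5, 0.3, size=(200, 60)),
                      index=['gene_%d' % i for i in range(200)],
                      columns=['cell_%d' % i for i in range(60)])

size_factors = computeSumFactors(counts, scran_clusters=False)
print(len(size_factors))  # one size factor per cell
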
Example #8
def RCopula(data, sim):

    cols = data.columns
    data2 = pandas2ri.py2ri(data)
    data = np.array(data)
    rpy2.robjects.globalenv["data"] = data2
    rpy2.robjects.globalenv["N"] = N

    rscript = """ suppressMessages(library(copula))

                    nAssets <- ncol(data)
                    u <- pobs(data,N)
                    clayton.cop <- claytonCopula(2,dim=nAssets)
                    a <- fitCopula(clayton.cop,u,method="mpl")
                    y <- (rCopula(copula=claytonCopula(a@estimate,nAssets),n=N))
                    y"""

    print(rscript)

    b = r(rscript)

    ###########????????????
    for j in range(b.shape[1]):
        mean = np.mean(data[:, j])
        std = np.std(data[:, j])
        for i in range(b.shape[0]):
            b[i, j] = scipy.stats.norm.ppf(b[i, j], loc=mean, scale=std)
    pd.DataFrame(b).iloc[:, 1].plot()
    plt.show()
    exit()

    b = pd.DataFrame(b)
    return b
Example #9
    def heatmap(self, plotfile):
        ''' plots a heatmap '''
        # to do: add option to parse design file and add coloured row for
        # variable specified in design file.

        plotHeatmap = R('''
        function(df){

        library("Biobase")
        library("RColorBrewer")
        library("gplots")

        hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
        png("%(plotfile)s", width=1000, height=1000, units="px")
        write.table(df, file="%(plotfile)s.tsv", sep="\t")
        heatmap.2(as.matrix(df),
                  col = hmcol, scale="none", trace="none", margin=c(18, 10),
                  dendrogram="column", cexCol=2,
                  labRow = "",
                  hclustfun = function(x) hclust(x, method = 'average'),
                  distfun = function(x) as.dist(1 - cor(t(x), method="spearman")))
        dev.off()
        }''' % locals())

        r_counts = pandas2ri.py2ri(self.table)

        plotHeatmap(r_counts)
Example #10
def RCopulaGarch(data, sim):
    data = pandas2ri.py2ri(data)
    rpy2.robjects.globalenv["data"] = data
    rpy2.robjects.globalenv["simulations"] = sim

    rscript = """
                suppressMessages(library(rugarch))
                suppressMessages(library(rmgarch))

                data <- matrix(rnorm(2200),200,11)
                nassets <- ncol(data)
                nperiods <- 390
                simulations <- 5000

                spec <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 1),submodel = NULL,external.regressors = NULL, variance.targeting = FALSE),mean.model = list(armaOrder = c(1, 1), external.regressors = NULL,distribution.model = "norm", start.pars = list(), fixed.pars = list()))
                dccspec<-dccspec(uspec=multispec(replicate(ncol(data),spec)),dccOrder = c(1,1),distribution="mvnorm")
                mspec <-multispec(replicate(ncol(data),spec))
                cspec<-cgarchspec(mspec, VAR = FALSE, robust = FALSE, lag = 1, lag.max = NULL,lag.criterion = "AIC", external.regressors = NULL,robust.control = list(gamma = 0.25, delta = 0.01, nc = 10, ns = 500),dccOrder = c(1, 1), asymmetric = FALSE,distribution.model = list(copula = "mvt",method = "Kendall", time.varying = FALSE,transformation = "parametric"),start.pars = list(), fixed.pars = list())
                copgarch <- cgarchfit(cspec, data, spd.control = list(lower = 0.1, upper = 0.9, type = "pwm",kernel = "epanech"), fit.control = list(eval.se = TRUE, stationarity = TRUE,scale = FALSE), solver = "solnp", solver.control = list(), out.sample = 0,cluster = NULL, fit = NULL, VAR.fit = NULL, realizedVol = NULL)
                simfit <- cgarchsim(copgarch, n.sim = nperiods, n.start = 0, m.sim = simulations,startMethod = "sample", presigma = NULL, preresiduals = NULL,prereturns = NULL, preR = NULL, preQ = NULL, preZ = NULL, rseed = NULL,mexsimdata = NULL, vexsimdata = NULL, cluster = NULL, only.density = FALSE,prerealized = NULL)
                simdata <- fitted(simfit)

                t <- array(rep(nperiods*nassets*simulations),c(nperiods,nassets,simulations))
                for (i in 1:simulations) {
                  t[,,i] <- fitted(simfit,i)
                }
                t

                 """

    print(rscript)
    b = r(rscript)
    return b
Example #11
def mca( distance_matrix, dim = 2 ):
    """ calculate MCA matrix using R's FactorMineR """

    # build up haplotype dataframe

    from fatools.lib.utils import acquire_R, release_R
    from rpy2 import robjects
    from rpy2.robjects import pandas2ri

    acquire_R()

    r_df = pandas2ri.py2ri(distance_matrix.H)
    robjects.globalenv['haplo_data'] = r_df
    marker_len = len(distance_matrix.H.columns)
    arguments = ','.join('as.factor(haplo_data[,%d])' % x
                    for x in range(1, marker_len + 1))
    robjects.r('haplo_df <- data.frame(%s)' % arguments)
    robjects.r('library(FactoMineR)')
    mca_res = robjects.r('MCA(haplo_df, graph=FALSE)')

    # get the individual coordinate
    coord = pandas2ri.ri2py(mca_res.rx('ind')[0].rx('coord')[0])

    release_R()

    return (coord, None)
def extract_typ_real_curve(df, discarded_seasons=None, wdw_method=2, lower_bound=5.0):

    seasons = sorted(list(df.columns.drop(['UF', 'epiweek'])))[:-1]
    seasons = sorted(set(seasons).difference(discarded_seasons or []))

    rdf = pandas2ri.py2ri(df)
    rseasons = ro.StrVector(seasons)

    ro.globalenv['df'] = rdf
    ro.globalenv['seasons'] = rseasons
    ro.globalenv['par.method'] = wdw_method
    ro.globalenv['par.type.curve'] = 2
    ro.globalenv['par.level.curve'] = 0.95
    epimemrslt = ro.r('t(apply(subset(df, select=seasons), 1, memci, i.type.curve=par.type.curve, ' +
                      'i.level.curve=par.level.curve))')

    # Pre-epidemic threshold:
    typrealcurve = pd.DataFrame(epimemrslt)

    # Store results in python dictionary of objects
    pyepimemrslt = {}
    # typ.real.curve is the typical curve without time shift, that is, respecting the original weeks from data
    # this curve is better to keep all seasons, not only the epidemic ones.
    pyepimemrslt['typ.real.curve'] = typrealcurve.copy()
    pyepimemrslt['typ.real.curve'].rename(columns={0: 'baixo', 1: 'mediano', 2: 'alto'}, inplace=True)
    pyepimemrslt['typ.real.curve']['mediano'].fillna(0, inplace=True)
    pyepimemrslt['typ.real.curve'].loc[pyepimemrslt['typ.real.curve']['baixo'] < 0, 'baixo'] = 0
    pyepimemrslt['typ.real.curve']['baixo'] = pyepimemrslt['typ.real.curve']['baixo']. \
        where((~pyepimemrslt['typ.real.curve']['baixo'].isnull()), other=pyepimemrslt['typ.real.curve']['mediano'])
    pyepimemrslt['typ.real.curve']['alto'] = pyepimemrslt['typ.real.curve']['alto']. \
        where((~pyepimemrslt['typ.real.curve']['alto'].isnull()), other=pyepimemrslt['typ.real.curve']['mediano'])

    return pyepimemrslt
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Makes a call to DESeq2 with SCRAN to
    perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Returns a list of DESeq2 results for each comparison
    """
    results = list()
    n_cells = len(counts.columns)
    try:
        pandas2ri.activate()
        deseq2 = RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
                sce = SingleCellExperiment(assays=list(counts=r_counts))
                return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list(set([round((min_cluster_size/2) / i) for i in [5,4,3,2,1]]))
            sce = scran.computeSumFactors(sce, clusters=r_clusters, sizes=sizes, positive=True)
        else:
            sizes = list(set([round((n_cells/2) * i) for i in [0.1,0.2,0.3,0.4,0.5]]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)   
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
                colData(dds)$conditions = as.factor(conditions)
                design(dds) = formula(~ conditions)
                return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A,B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
        pandas2ri.deactivate()
    except Exception as e:
        raise e
    return results
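
A call sketch for deaScranDESeq2 above with invented condition labels and a single comparison; it assumes DESeq2, scran, SingleCellExperiment and BiocParallel are installed and that RimportLibrary, r and robjects are in scope as in the snippet.

import numpy as np
import pandas as pd

counts = pd.DataFrame(np.random.negative_binomial(5, 0.3, size=(500, 40)),
                      index=['gene_%d' % i for i in range(500)],
                      columns=['cell_%d' % i for i in range(40)])
conds = ['A'] * 20 + ['B'] * 20

results = deaScranDESeq2(counts, conds, comparisons=[('A', 'B')], alpha=0.05)
print(results[0].head())  # DESeq2 result table for the A vs B contrast
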
Example #14
def logCountsWithFactors(counts, size_factors):
    """ Uses the R package scater to log a matrix of counts (genes as rows)
    and a vector of size factors using the method normalize().
    :param counts: a matrix of counts (genes as rows)
    :param size_factors: a vector of size factors
    :return the normalized log counts (genes as rows)
    """
    columns = counts.columns
    indexes = counts.index
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scater = RimportLibrary("scran")
    r_call = """
        function(counts, size_factors){
          sce = SingleCellExperiment(assays=list(counts=as.matrix(counts)))
          sizeFactors(sce) = size_factors
          sce = normalize(sce)
          norm_counts = logcounts(sce)
          return(as.data.frame(norm_counts))
        }
    """
    r_func = r(r_call)
    r_norm_counts = r_func(r_counts, size_factors)
    pandas_norm_counts = pandas2ri.ri2py(r_norm_counts)
    pandas_norm_counts.index = indexes
    pandas_norm_counts.columns = columns
    pandas2ri.deactivate()
    return pandas_norm_counts
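
A sketch chaining computeSumFactors (Example #7) with logCountsWithFactors above; wrapping the size factors in a FloatVector before handing them to R is an assumption, since the snippet does not show how that vector is passed.

import numpy as np
import pandas as pd
import rpy2.robjects as robjects

counts = pd.DataFrame(np.random.negative_binomial(5, 0.3, size=(200, 60)),
                      index=['gene_%d' % i for i in range(200)],
                      columns=['cell_%d' % i for i in range(60)])

size_factors = computeSumFactors(counts, scran_clusters=False)
log_counts = logCountsWithFactors(counts, robjects.FloatVector(size_factors))
print(log_counts.shape)  # same genes x cells shape, now log-normalized
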
    def predict(self, xtest):
        """Predicts class via majority vote.

        Parameters
        ----------
        xtest : pd.DataFrame
            features for test set
        """
        if new_pandas_flag:
            r_xtest = pandas2ri.py2ri(xtest)
        else:
            r_xtest = com.convert_to_r_dataframe(xtest)
        #r_xtest = pandas2ri.py2ri(xtest)
        pred = self.rf_pred(self.rf, r_xtest)
        if new_pandas_flag:
            #py_pred = pandas2ri.ri2py(pred)
            tmp_genes = pred[1]
            tmp_pred_class = pred[0]
            genes = pandas2ri.ri2py(tmp_genes)
            pred_class = pandas2ri.ri2py(tmp_pred_class)
        else:
            py_pred = com.convert_robj(pred)
            genes, pred_class = zip(*py_pred.items())
            #genes = com.convert_robj(tmp_genes)
            #pred_class = com.convert_robj(tmp_pred_class)
        tmp_df = pd.DataFrame({'pred_class': pred_class},
                              index=genes)
        tmp_df = tmp_df.reindex(xtest.index)
        tmp_df -= 1  # for some reason the class numbers start at 1
        return tmp_df['pred_class']
 def set_cv_fold(self, df):
     """Send which genes are valid test sets for each CV fold."""
     if new_pandas_flag:
         r_df = pandas2ri.py2ri(df)
     else:
         r_df = com.convert_to_r_dataframe(df)
     ro.globalenv['cvFoldDf'] = r_df
Example #17
def conditionDESeq2(data_frame, header, alpha, res_dir):
    '''
    Perform DESeq2-based analysis of condition:time interaction
    dependent differential expression
    '''

    E.info("Differential expression testing for %s" % header)
    cols = data_frame.columns

    # py2ri requires activation
    pandas2ri.activate()
    counts = pandas2ri.py2ri(data_frame)

    des_times = ro.IntVector([x.split(".")[1] for x in cols])
    des_reps = ro.StrVector([x.split(".")[2] for x in cols])
    des_cond = ro.StrVector([x.split(".")[0] for x in cols])
    genes = ro.StrVector([x for x in data_frame.index])

    # setup counts table and design frame

    R('''suppressPackageStartupMessages(library("DESeq2"))''')
    R('''sink(file="/dev/null")''')
    R('''times <- as.factor(%s)''' % des_times.r_repr())
    R('''reps <- c(%s)''' % des_reps.r_repr())
    R('''condition <- c(%s)''' % des_cond.r_repr())
    R('''design <- data.frame(times, reps, condition)''')
    R('''counts <- data.frame(%s)''' % counts.r_repr())
    R('''genes <- c(%s)''' % genes.r_repr())
    R('''rownames(counts) <- genes''')
    R('''rownames(design) <- colnames(counts)''')

    # use DESeq() with LRT and reduced formula.  Use effect
    # size moderation

    R('''dds <- DESeqDataSetFromMatrix(countData=counts, '''
      '''colData=design, '''
      '''design=~reps + times + condition + times:condition)''')
    R('''dds <- DESeq(dds, test="LRT", '''
      '''reduced=~reps + times + condition, betaPrior=T)''')
    R('''res <- results(dds)[order(results(dds)$padj, na.last=T), ]''')
    R('''res.df <- data.frame(res)''')

    # generate dispersion and MA plots
    R('''png("%s/%s-dispersions.png")''' % (res_dir,
                                            header))
    R('''plotDispEsts(dds)''')
    R('''dev.off()''')

    R('''png("%s/%s-MAplot.png")''' % (res_dir,
                                       header))
    R('''plotMA(res, alpha=%0.3f, ylim=c(-5,5))''' % alpha)
    R('''dev.off()''')
    R('''sink(file=NULL)''')

    df = pandas2ri.ri2py(R['res.df'])

    return df
Example #18
def treeCutting(infile,
                expression_file,
                cluster_file,
                cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t",
                       header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])

    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(df)

    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')
    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())
    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
Example #19
 def predict(self, indep_vars):
     ro.globalenv['test'] = pandas2ri.py2ri(indep_vars)
     ro.globalenv['fit'] = self.fitted_model
     if self.algorithm == "rprop+":
         return pandas2ri.ri2py(
             ro.r("compute(fit,test)$net.result")
         )
     elif self.algorithm == "ADAPTgdwm":
         return pandas2ri.ri2py(
             ro.r("sim(fit$net, test)")
         )
Example #20
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters
    from the data using Scran::quickCluster"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    clusters = scran.quickCluster(as_matrix(r_counts), min_size, method="igraph")
    n_clust = len(set(clusters))
    pandas2ri.deactivate()
    return n_clust
Example #21
 def fit(self, relationship, df):
     """
     relationship: string of the form: a~b+c
     df: Pandas Dataframe
     """
     # Get R dataframe
     r_df = pandas2ri.py2ri(df)
     # Create linear fit
     fit = stats.lm(relationship, data=df)
     self.fit = fit
     python_fit = self.convert_fit_to_python(fit)
     return python_fit
Example #22
def draw_survival_curves_mpl(fit, ax=None, title=None, colors=None, ms=80, alpha=1):
    """
    Takes an R survfit.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(4, 3))
    s = base.summary(fit)
    tab = pd.DataFrame({v: s.rx2(v) for v in s.names 
                                    if len(s.rx2(v)) == len(s.rx2('time'))},
                       index=s.rx2('time'))
    call = pandas2ri.ri2py(fit.rx2('call')[2])
    
    groups = robjects.r.sort(robjects.r.c(*call.feature.unique()))
    
    if 'strata' not in tab:
        groups = [0]
        tab['strata'] = 1
    elif len(tab.strata.unique()) != len(groups):
        gg = list(call[call.event > 0].feature.unique())
        gg = [g for g in groups if g in gg]
        bg = [g for g in groups if g not in gg]
        groups = gg + bg
           
    for i, group in enumerate(groups):
        censoring = call[(call.event == 0) & (call.feature == group)].days
        surv = tab[tab.strata == (i + 1)].surv
        surv = surv.copy()
        surv.loc[0.] = 1.
        surv = surv.sort_index()
        if surv.index[-1] < censoring.max():
            surv.loc[censoring.max()] = surv.iloc[-1]
            surv = surv.sort_index()

        censoring_pos = get_markers(censoring, surv)
        ax.step(surv.index, surv, lw=3, where='post', alpha=alpha, label=group)
        if colors is not None:
            try:
                """fix for R-Python str-to-int conversion"""
                color = colors[group]
            except:
                color = colors[i]
            ax.lines[-1].set_color(color)
        if len(censoring_pos) > 0:
            ax.scatter(*zip(*censoring_pos), marker='|', s=ms,
                       color=ax.lines[-1].get_color())
        
    ax.set_ylim(0, 1.05)
    # ax.set_xlim(0, max(surv.index)*1.05)
    ax.set_xlim(0, max(call.days) * 1.05)
    ax.legend(loc='best')
    ax.set_ylabel('Survival')
    ax.set_xlabel('Years')
    if title:
        ax.set_title(title)
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters
    from the data using Scran::quickCluster"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    multicore = RimportLibrary("BiocParallel")
    multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))  
    as_matrix = r["as.matrix"]
    clusters = scran.quickCluster(as_matrix(r_counts), min_size)
    n_clust = len(set(clusters))
    pandas2ri.deactivate()
    return n_clust
Example #24
    def transform(self, method="vst", inplace=True):
        '''
        perform transformation on counts table
        current methods are:
         - deseq2 variance stabilising transformation
         - deseq2 rlog transformation
        '''

        assert method in ["vst", "rlog"], ("method must be one of"
                                           "[vst, rlog]")

        method2function = {"vst": "varianceStabilizingTransformation",
                           "rlog": "rlog"}

        t_function = method2function[method]

        transform = R('''
        function(df){

        suppressMessages(library('DESeq2'))

        design = data.frame(row.names = colnames(df),
                            condition = seq(1, length(colnames(df))))

        dds <- suppressMessages(DESeqDataSetFromMatrix(
                 countData= df, colData = design, design = ~condition))

        transformed <- suppressMessages(%(t_function)s(dds))
        transformed_df <- as.data.frame(assay(transformed))

        return(transformed_df)
        }''' % locals())

        r_counts = pandas2ri.py2ri(self.table)
        df = pandas2ri.ri2py(transform(r_counts))
        # losing rownames for some reason during the conversion?!
        df.index = self.table.index

        if inplace:
            self.table = df
            # R replaces "-" in column names with ".". Revert back!
            self.table.columns = [x.replace(".", "-")
                                  for x in self.table.columns]
        else:
            tmp_counts = self.clone()
            tmp_counts.table = df
            tmp_counts.table.columns = [x.replace(".", "-")
                                        for x in tmp_counts.table.columns]
            return tmp_counts
Example #25
def computeRLEFactors(counts):
    """ Compute normalization size factors
    using the RLE method described in edgeR and returns them as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors as a vector
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    edger = RimportLibrary("edgeR")
    as_matrix = r["as.matrix"]
    dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE")
    pandas_sf = pandas2ri.ri2py(dds)
    pandas_cm = pandas2ri.ri2py(r.colSums(counts))
    pandas2ri.deactivate()
    return pandas_sf * pandas_cm
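
A usage sketch for computeRLEFactors above, assuming edgeR is installed and RimportLibrary/r are in scope; note that the returned values are the edgeR normalization factors multiplied by the column sums, as in the last line of the function.

import numpy as np
import pandas as pd

counts = pd.DataFrame(np.random.poisson(10, size=(300, 24)),
                      index=['gene_%d' % i for i in range(300)],
                      columns=['sample_%d' % i for i in range(24)])

rle_factors = computeRLEFactors(counts)
print(rle_factors[:5])  # one effective library-size factor per sample
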
Example #26
def computeSizeFactors(counts):
    """ Computes size factors using DESeq
    for the counts matrix given as input (Genes as rows
    and spots as columns).
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors as a vector
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    deseq2 = RimportLibrary("DESeq2")
    dds = deseq2.estimateSizeFactorsForMatrix(r_counts)
    pandas_sf = pandas2ri.ri2py(dds)
    pandas2ri.deactivate()
    return pandas_sf
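
A corresponding usage sketch for computeSizeFactors above (DESeq2 must be installed and the RimportLibrary helper in scope); the toy matrix again has genes as rows and samples as columns.

import numpy as np
import pandas as pd

counts = pd.DataFrame(np.random.poisson(10, size=(300, 24)),
                      index=['gene_%d' % i for i in range(300)],
                      columns=['sample_%d' % i for i in range(24)])

deseq_factors = computeSizeFactors(counts)
print(np.round(deseq_factors, 2))  # one median-of-ratios size factor per sample
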
Example #27
    def plotDendogram(self, plot_filename=None,
                      distance_method="euclidean",
                      clustering_method="ward.D2"):

        r_counts = pandas2ri.py2ri(self.table)

        makeDendogram = R('''
        function(counts){
          png("%(plot_filename)s")
          par(mar = c(1,4,1,1))
          plot(hclust(dist(t(counts), method = "%(distance_method)s"),
                      method = "%(clustering_method)s"), main="")
          dev.off()
        }''' % locals())

        makeDendogram(r_counts)
    def fit(self, xtrain, ytrain):
        """The fit method trains R's random forest classifier.

        NOTE: the method name ("fit") and method signature were choosen
        to be consistent with scikit learn's fit method.

        Parameters
        ----------
        xtrain : pd.DataFrame
            features for training set
        ytrain : pd.DataFrame
            true class labels (as integers) for training set
        """
        label_counts = ytrain.value_counts()
        if self.is_onco_pred and self.is_tsg_pred:
            sampsize = [label_counts[self.other_num],
                        label_counts[self.onco_num],
                        label_counts[self.tsg_num]]
        elif self.is_onco_pred:
            sampsize = [label_counts[self.other_num],
                        label_counts[self.onco_num]]
        elif self.is_tsg_pred:
            sampsize = [label_counts[self.other_num],
                        label_counts[self.tsg_num]]

        self.set_sample_size(sampsize)
        ytrain.index = xtrain.index  # ensure indexes match
        xtrain['true_class'] = ytrain

        # convert
        if new_pandas_flag:
            r_xtrain = pandas2ri.py2ri(xtrain)
        else:
            r_xtrain = com.convert_to_r_dataframe(xtrain)
        #ro.globalenv['trainData'] = r_xtrain
        self.rf = self.rf_fit(r_xtrain, self.ntrees, self.sample_size)
        r_imp = self.rf_imp(self.rf)  # importance dataframe in R
        if new_pandas_flag:
            self.feature_importances_ = pandas2ri.ri2py(r_imp)
        else:
            self.feature_importances_ = com.convert_robj(r_imp)
Example #29
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Makes a call to DESeq2 to
    perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Can be given size factors. 
    Returns a list of DESeq2 results for each comparison
    """
    results = list()
    try:
        pandas2ri.activate()
        deseq2 = RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts, colData=cond, design=design)
        if size_factors is None:
            dds = r.DESeq(dds, parallel=True, useT=True, 
                          minmu=1e-6, minReplicatesForReplace=np.inf)
        else:
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds, value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)
        # Perform the comparisons and store results in list
        for A,B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), 
                               alpha=alpha, parallel=True)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
        pandas2ri.deactivate()
    except Exception as e:
        raise e
    return results
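
A call sketch for deaDESeq2 above with a single treat-vs-ctrl comparison; the column and condition names are invented, and DESeq2/BiocParallel are assumed to be installed with the helper functions in scope.

import numpy as np
import pandas as pd

counts = pd.DataFrame(np.random.negative_binomial(10, 0.4, size=(800, 12)),
                      index=['gene_%d' % i for i in range(800)],
                      columns=['s%d' % i for i in range(12)])
conds = ['ctrl'] * 6 + ['treat'] * 6

res = deaDESeq2(counts, conds, comparisons=[('treat', 'ctrl')], alpha=0.05)
print(res[0].sort_values('padj').head())  # top genes for the treat vs ctrl contrast
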
Example #30
def Rtsne(counts, dimensions, theta=0.5, dims=50, perplexity=30, max_iter=1000):
    """Performs dimensionality reduction
    using the R package Rtsne"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    tsne = RimportLibrary("Rtsne")
    multicore = RimportLibrary("BiocParallel")
    multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
    as_matrix = r["as.matrix"]
    tsne_out = tsne.Rtsne(as_matrix(r_counts),
                          dims=dimensions, 
                          theta=theta, 
                          check_duplicates=False, 
                          pca=True, 
                          initial_dims=dims, 
                          perplexity=perplexity, 
                          max_iter=max_iter, 
                          verbose=False)
    pandas_tsne_out = pandas2ri.ri2py(tsne_out.rx2('Y'))
    pandas2ri.deactivate()
    return pandas_tsne_out
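
A usage sketch for the Rtsne wrapper above, assuming the Rtsne and BiocParallel R packages are installed; the perplexity is chosen small enough for the 300 toy rows.

import numpy as np
import pandas as pd

counts = pd.DataFrame(np.random.rand(300, 50))
embedding = Rtsne(counts, dimensions=2, perplexity=20)
print(embedding.shape)  # (300, 2)
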
Example #31
# Extract required arguments.
data = pd.read_table(snakemake.input.data, index_col=0)  # Input Gene-by-Sample raw count data.
condition = pd.read_table(snakemake.input.condition, index_col=0, names=['condition'])  # Input condition file which indicates to which condition each sample belongs.
logger.info('%d(genes) x %d(samples) data matrix and %d sample conditions are given.' % (data.shape[0], data.shape[1], len(condition.index)))
logger.debug('Headers: %s...' % ' '.join(data.columns[:3]))
logger.debug('Gene identifiers: %s...' % ' '.join(data.index[:3]))

intersecting_samples = [sample for sample in data.columns if sample in condition.index]
data = data[intersecting_samples]

condition = list(condition.loc[intersecting_samples].condition.values)
logger.info('%d samples will be used for DEG discovery.' % len(intersecting_samples))


r_data_matrix = r['data.matrix'](pandas2ri.py2ri(data))
r_samples = r.colnames(r_data_matrix)
r_conditions = ro.FactorVector(condition)

logger.debug('Computing size factors.')
r_size_factors = ebseq.MedianNorm(r_data_matrix)

logger.info('Discovering DEGs.')
logger.info('Running EBTest.')

num_iteration = 0
while True:
    # Increase the iteration number if the conditions are not met.
    # Hopefully most of the time, 10 iterations will be enough for convergence.
    num_iteration += 10
    r_eb_out = ebseq.EBTest(Data=r_data_matrix, Conditions=r_conditions, sizeFactors=r_size_factors, maxround=num_iteration)
# %%
# %load_ext rpy2.ipython

# %%
# Convert data to R format
from rpy2.robjects import pandas2ri

R_data = no_repeat_data >> select(
    X["Subject ID"],
    X["Condition"],
    X["Stimulus Type"],
    X["Switch Rate"],
    X["Rating"],
)
R_data = pandas2ri.py2ri(R_data)
R_data.head()

# %% {"magic_args": "-i R_data -o anova_model,anova_model_summary,ref_poly,poly_contrasts", "language": "R"}
# library(afex)
# library(lsmeans)
# # afex_options(emmeans_model = "multivariate")
# # afex_options("emmeans_mode")
# # model = afex_options("emmeans_model")
#
# # Convert to numeric, due to pandas converting everything to strings
# R_data <- transform(R_data, Subject.ID = as.numeric(Subject.ID))
# R_data <- transform(R_data, Rating = as.numeric(Rating))
# R_data <- transform(R_data, Switch.Rate = as.numeric(Switch.Rate))
# anova_model <- aov_ez(
#     "Subject.ID",
Example #33
def main():
    '''
    maine
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir     = myCommandLine.args['outDir']
    group1     = myCommandLine.args['group1']
    group2     = myCommandLine.args['group2']
    batch      = myCommandLine.args['batch']  
    matrix     = myCommandLine.args['matrix']
    prefix     = myCommandLine.args['prefix']
    formula    = myCommandLine.args['formula']
    threads    = myCommandLine.args['threads']

    print("running DRIMSEQ %s" % prefix, file=sys.stderr)

    # import
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods   = importr('methods')
    drim      = importr('DRIMSeq')

    # get quant table and formula table
    quantDF  = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df       = pandas2ri.py2ri(quantDF)

    formulaDF = pd.read_csv(formula,header=0, sep="\t")

    pydf      = pandas2ri.py2ri(formulaDF)

    # Convert pandas to R data frame.
    samples = pydf
    counts  = df

    # DRIMSEQ part.
    # Forumla
    if "batch" in list(formulaDF): R.assign('batch', samples.rx2('batch'))
    R.assign('condition', samples.rx2('condition'))
    R.assign('counts', counts)
    R.assign('samples',samples)
    R.assign('numThread', threads)
    R.assign("cooef", "condition%s" % group2)
    R('data <- dmDSdata(counts = counts, samples = samples)')
    R('filtered <- dmFilter(data, min_samps_gene_expr = 6, min_samps_feature_expr = 3, min_gene_expr = 15, min_feature_expr = 5)')
    if "batch" in list(formulaDF): 
        R('design_full <- model.matrix(~ condition + batch, data = samples(filtered))')
    else: 
        R('design_full <- model.matrix(~ condition, data = samples(filtered))')
    R('set.seed(123)')

    R('d <- dmPrecision(filtered, design = design_full, BPPARAM=BiocParallel::MulticoreParam(numThread))')
    R('d <- dmFit(d, design = design_full, verbose = 1, BPPARAM=BiocParallel::MulticoreParam(numThread))')
    
    R('contrast <- grep("condition",colnames(design_full),value=TRUE)')

    R('d <- dmTest(d, coef = contrast, verbose = 1, BPPARAM=BiocParallel::MulticoreParam(numThread))')
    res = R('merge(proportions(d),results(d,level="feature"), by=c("feature_id","gene_id"))')

    data_folder = os.path.join(os.getcwd(), outdir)
    resOut = os.path.join(data_folder, "%s_%s_v_%s_drimseq2_results.tsv"  % (prefix,group1,group2))
    

    res.to_csv(resOut, sep='\t')
    sys.exit(1)

    R('library(stageR)')
    R('pScreen <- results(d)$pvalue')
    R('names(pScreen) <- results(d)$gene_id')
    ## Assign transcript-level pvalues to the confirmation stage
    R('pConfirmation <- matrix(results(d, level = "feature")$pvalue, ncol = 1)')
    R('rownames(pConfirmation) <- results(d, level = "feature")$feature_id')
    ## Create the gene-transcript mapping
    R('tx2gene <- results(d, level = "feature")[, c("feature_id", "gene_id")]')
    ## Create the stageRTx object and perform the stage-wise analysis
    R('stageRObj <- stageRTx(pScreen = pScreen, pConfirmation = pConfirmation, pScreenAdjusted = FALSE, tx2gene = tx2gene)')
    R('stageRObj <- stageWiseAdjustment(object = stageRObj, method = "dtu", alpha = 0.05)')
    R('getSignificantGenes(stageRObj)')
    R('getSignificantTx(stageRObj)')
    R('padj <- getAdjustedPValues(stageRObj, order = TRUE, onlySignificantGenes = FALSE)')
    R('head(padj)')
 print "++++++++++++++++++++++++ tSeries ",colId, param, "++++++++++++++++++++++++"
 df1 = pd.DataFrame(columns=grpLbls)
 prmValDict = {x:[] for i,x in enumerate(grpLbls)}
 nFly=0
 for g_,gtype in enumerate(genotypes):
     dSets = [[x[colId] for i_,x in enumerate(pooledTotalDataTmSrs[gtype][tmPt])] for tmPt in range(nTmpts)]
     for i,d in enumerate(dSets):
         flyNum = list(np.arange(nFly,nFly+len(d)))
         tPts =  list(np.zeros(len(d))+i)
         gtypeList = [gtype for x in range(len(d))]
         dfData = {'result':d, 'timePoint': tPts,'flyNumber': flyNum, 'genotype': gtypeList}
         for j,l in enumerate(grpLbls):
             prmValDict[l].extend(dfData[l])
     nFly+=len(d)
 df = pd.DataFrame(prmValDict, columns=grpLbls)
 descStats = pd.DataFrame(pandas2ri.ri2py(fsa.Summarize(statsFormula, data = pandas2ri.py2ri(df))))
 ll = nparLD.f1_ld_f1(pandas2ri.py2ri(df['result']),
                      pandas2ri.py2ri(df['timePoint']),\
                      pandas2ri.py2ri(df['genotype']),\
                      pandas2ri.py2ri(df['flyNumber']),
                      **{'description': 'FALSE',\
                          'plot_RTE':'FALSE',
                          'order.warning':'FALSE',
                          })
 pdWald = r_matrix_to_data_frame(ll.rx2('Wald.test'), getLabels = True)
 pdAnova = r_matrix_to_data_frame(ll.rx2('ANOVA.test'), getLabels = True)
 pdPairComp = r_matrix_to_data_frame(ll.rx2('pair.comparison'), getLabels = False)
 #print ('Wald test\n%r'%pdWald)
 #print ('ANOVA test\n%r'%pdAnova)
 #print ('Pariwise Comparison\n%r'%pdPairComp)
 tmPts = [str(i) for i in range(nTmpts)]
Example #35
    last_s2oe_id = 0
    s2oe_sql = "SELECT objectid FROM tmp_csci_suppl2_oe WHERE objectid IN (SELECT MAX(objectid) FROM tmp_csci_suppl2_oe)"
    s2oe_result = smc_eng.execute(s2oe_sql)
    for c in s2oe_result:
        last_s2oe_id = c[0]

    # group stationcode to get just one
    single_station = group.StationCode.unique()
    # to do - check to make sure there is only one but there should only be one
    print "stations_grouped: %s" % single_station[0]

    # find stationcode that matches between the bug record and what is in stations
    station = stations.loc[stations['stationcode'] == single_station[0]]
    # convert station to r dataframe
    station = pandas2ri.py2ri(station)

    # copy of group
    group_copy = group

    # make pandas dataframe to r dataframe
    group = pandas2ri.py2ri(group)

    # Only run cleandata if every FinalID in group is found in master list (metadata.rdata in BMIMetrics/inst). -Jordan 5/6/2019
    finalids = set(pandas2ri.ri2py(loadMetaData()).FinalID.tolist())
    group_ids = set(group_copy.FinalID.tolist())
    unknown_ids = group_ids - finalids
    if unknown_ids:
        errorLog("Sample %s has the following unknown FinalIDs: %s" %
                 (bug_sample_id, list(unknown_ids)))
        msgs.append("Sample % has the following unknown FinalIDs:\n")
Example #36
def cellcyclemodel(data, datarange, bins):
    global g1modelr
    global g2modelr
    histogram = np.histogram(data, range=datarange, bins=bins)
    histogramx = solim._middler(histogram[1])
    histogramy = histogram[0]
    xdis = histogramx[1] - histogramx[0]
    #find G1 max
    g1maxy = np.max(histogramy)
    g1maxset = np.where(histogramy > (0.8 * g1maxy))
    g1maxleft = g1maxset[0][0]
    g1maxright = g1maxset[0][-1]
    g1maxxin = np.argmax(histogramy)
    g1maxx = histogramx[g1maxxin]
    g1mean = (g1maxleft + 1 + (g1maxright - g1maxleft) / 2) * xdis
    robjects.r(f'g1maxy = {g1maxy}')
    robjects.r(f'g1mean = {g1mean}')
    #pick valleydivide
    valleydivide = float(
        input(
            'type the x-coordinate of what looks to be the low point between the two peaks: '
        ))
    valleydividein = np.abs(histogramx - valleydivide).argmin()
    valleydivide = histogramx[valleydividein]
    valleyheight = histogramy[valleydividein]
    robjects.r(f'valleyheight = {valleyheight}')
    robjects.r(f'valleydivide = {valleydivide}')
    #find G2 max
    g2maxy = np.max(histogramy[histogramx > valleydivide])
    g2maxxin = int(2 * (g1maxleft + 1 + (g1maxright - g1maxleft) / 2))
    g2maxx = histogramx[g2maxxin]
    robjects.r(f'g2maxy = {g2maxy}')
    robjects.r(f'g2mean = {g2maxx}')
    #pick bigdivide
    bigdivide = float(
        input(
            'type the x-coordinate of where you think the upper boundary of the G2 peak is: '
        ))
    bigdividein = np.abs(histogramx - bigdivide).argmin()
    bigdivide = histogramx[bigdividein]
    #pick debrisdivide
    debrisdivide = float(
        input(
            'type the x-coordinate of where you think the boundary between the G1 peak and debris is: '
        ))
    debrisdividein = np.abs(histogramx - debrisdivide).argmin()
    debrisdivide = histogramx[debrisdividein]
    #find g1 modeldata
    g1startsd = np.std(data[np.logical_and(
        debrisdivide < data, data <
        (debrisdivide + (g1maxx - debrisdivide) * 2))])
    robjects.r(f'g1startsd = {g1startsd}')
    g1modelx = histogramx[debrisdividein:g1maxright +
                          (g1maxleft - debrisdividein) + 1]
    g1modely = histogramy[debrisdividein:g1maxright + 1]
    g1modelyright = np.array(
        list(reversed(histogramy[debrisdividein:g1maxleft])))
    g1modely = np.concatenate((g1modely, g1modelyright))
    g1model = pd.DataFrame({'x': g1modelx, 'y': g1modely})
    g1modelr = pandas2ri.py2ri(g1model)
    #find g2 modeldata
    g2startsd = np.std(data[np.logical_and(
        valleydivide < data, data <
        (valleydivide + (g2maxx - valleydivide) * 2))])
    robjects.r(f'g2startsd = {g2startsd}')
    g2modelx = histogramx[g2maxxin - (bigdividein - g2maxxin):bigdividein + 1]
    g2modely = histogramy[g2maxxin:bigdividein + 1]
    g2modely = np.concatenate(
        [np.array(list(reversed(g2modely[1:]))), g2modely])
    g2model = pd.DataFrame({'x': g2modelx, 'y': g2modely})
    g2modelr = pandas2ri.py2ri(g2model)
    fulldata = pd.DataFrame({'x': histogramx, 'y': histogramy})
    fulldatar = pandas2ri.py2ri(fulldata)
    modelwindow = pd.DataFrame({
        'x': histogramx[g1maxxin:g2maxxin + 1],
        'y': histogramy[g1maxxin:g2maxxin + 1]
    })
    modelwindowr = pandas2ri.py2ri(modelwindow)
    rinterface.globalenv.do_slot_assign('modelwindowr', modelwindowr)
    robjects.r('modelwindowr = attr(globalenv(), "modelwindowr")')
    rinterface.globalenv.do_slot_assign('fulldatar', fulldatar)
    robjects.r('fulldatar = attr(globalenv(), "fulldatar")')
    rinterface.globalenv.do_slot_assign('g1modelr', g1modelr)
    robjects.r('g1modelr = attr(globalenv(), "g1modelr")')
    rinterface.globalenv.do_slot_assign('g2modelr', g2modelr)
    robjects.r('g2modelr = attr(globalenv(), "g2modelr")')
    robjects.r("""
    library(purrr)
    dnormmodel = function(data, params, mean){
      return(params['height']*dnorm(x=data$x, sd=params['sd'], mean=mean))
    }
    g1model = partial(dnormmodel, mean=g1mean)
    g2model = partial(dnormmodel, mean=g2mean)
    measure_distance = function(params, data, model){
      diff = data$y - abs(model(data, params))
      return(sqrt(mean(diff^2)))
    }
    g1startheight = g1maxy / dnorm(0, sd=g1startsd)
    g2startheight = g2maxy / dnorm(0, sd=g2startsd)
    g1startparams = c(height=g1startheight, sd=g1startsd)
    g2startparams = c(height=g2startheight, sd=g2startsd)
    g1res = optim(g1startparams, measure_distance, data = g1modelr, model=g1model)
    g2res = optim(g2startparams, measure_distance, data = g2modelr, model=g2model)
    g1est = g1model(data=fulldatar, params=g1res$par)
    g2est = g2model(data=fulldatar, params=g2res$par)
    sest = fulldatar$y - (g1est + g2est)
        """)
    return (robjects.r('g1res$par'), robjects.r('g2res$par'),
            robjects.r('g1est'), robjects.r('g2est'), robjects.r('sest'),
            histogramy, histogramx)
motif1 = pd.DataFrame()
motif2 = pd.DataFrame()
motif3 = pd.DataFrame()
motif4 = pd.DataFrame()
for tf in TFs:
	mo=os.popen("sed -n '%ip' %stf_ru_max_top4_rank_largespace/%s "%(int(start1+1),path1,tf))
	m=re.split('\t|\n',mo.read())
	m.pop()
	m=list(map(float,m))
	motif1[tf]=m[:8]
	motif2[tf]=m[8:16]
	motif3[tf]=m[16:24]
	motif4[tf]=m[24:]


motif1_r=pandas2ri.py2ri(motif1)
motif2_r=pandas2ri.py2ri(motif2)
motif3_r=pandas2ri.py2ri(motif3)
motif4_r=pandas2ri.py2ri(motif4)

robjects.r['plot_motif'](pair,motif1_r,pair+'_'+'chr'+str(chro)+'_'+str(start)+'_tf_top1')
robjects.r['plot_motif'](pair,motif2_r,pair+'_'+'chr'+str(chro)+'_'+str(start)+'_tf_top2')
robjects.r['plot_motif'](pair,motif3_r,pair+'_'+'chr'+str(chro)+'_'+str(start)+'_tf_top3')
robjects.r['plot_motif'](pair,motif4_r,pair+'_'+'chr'+str(chro)+'_'+str(start)+'_tf_top4')

Example #38
    def transform(self, method="vst", design=None, inplace=True, blind=True):
        '''
        perform transformation on counts table
        current methods are:
         - deseq2 variance stabilising transformation
         - deseq2 rlog transformation

        Need to supply a design table if not using "blind"
        '''

        assert method in ["vst", "rlog"], ("method must be one of"
                                           "[vst, rlog]")

        method2function = {
            "vst": "varianceStabilizingTransformation",
            "rlog": "rlog"
        }

        t_function = method2function[method]

        r_counts = pandas2ri.py2ri(self.table)

        if not blind:
            assert design, ("if not using blind must supply a design table "
                            "(a CGAT.Expression.ExperimentalDesign object")

            # currently this only accepts "~group" design
            transform = R('''
            function(df, design){

            suppressMessages(library('DESeq2'))

            dds <- suppressMessages(DESeqDataSetFromMatrix(
                     countData= df, colData = design, design = ~group))

            transformed <- suppressMessages(%(t_function)s(dds, blind=FALSE))
            transformed_df <- as.data.frame(assay(transformed))

            return(transformed_df)
            }''' % locals())

            r_design = pandas2ri.py2ri(design.table)
            df = pandas2ri.ri2py(transform(r_counts, r_design))

        else:

            transform = R('''
            function(df){

            suppressMessages(library('DESeq2'))

            design = data.frame(row.names = colnames(df),
                                group = seq(1, length(colnames(df))))

            dds <- suppressMessages(DESeqDataSetFromMatrix(
                     countData= df, colData = design, design = ~group))

            transformed <- suppressMessages(%(t_function)s(dds, blind=TRUE))
            transformed_df <- as.data.frame(assay(transformed))

            return(transformed_df)
            }''' % locals())

            df = pandas2ri.ri2py(transform(r_counts))

        # losing rownames for some reason during the conversion?!
        df.index = self.table.index

        if inplace:
            self.table = df
            # R replaces "-" in column names with ".". Revert back!
            self.table.columns = [
                x.replace(".", "-") for x in self.table.columns
            ]
        else:
            tmp_counts = self.clone()
            tmp_counts.table = df
            tmp_counts.table.columns = [
                x.replace(".", "-") for x in tmp_counts.table.columns
            ]
            return tmp_counts
Example #39
    def plotPCA(self,
                design,
                variance_plot_filename=None,
                pca_plot_filename=None,
                x_axis="PC1",
                y_axis="PC2",
                colour="group",
                shape="group"):
        ''' use the prcomp function in base R to perform principal components
        analysis.

        Can specify colour and shape as either variables from design table
        or sample names (separated into id_1, id_2, id_3 based on samples
        having names formatted e.g. Tissue-Treatment-Replicate)'''

        # TS: swap this for regexes
        assert (x_axis[0:2] == "PC" and y_axis[0:2] == "PC"),\
            "x_axis and y_axis names must start with 'PC'"

        r_counts = pandas2ri.py2ri(self.table)
        r_design = pandas2ri.py2ri(design.table)

        pc_number_1 = int(x_axis.replace("PC", ""))
        pc_number_2 = int(y_axis.replace("PC", ""))

        makePCA = R('''
        function(counts, design){

          suppressMessages(library(ggplot2))
          suppressMessages(library(grid))

          gene_pca <- prcomp(t(counts), center = TRUE)

          m_text = element_text(size=12)
          s_text = element_text(size=8)


          variance = gene_pca$sdev^2
          variance_explained = round(variance/sum(variance), 5)

          variance_df = data.frame("Variance_explained" = variance_explained,
                                 "PC" = seq(1, length(variance)))
          p_variance = ggplot(variance_df, aes(x=PC, y=Variance_explained))+
          geom_point()+
          geom_line()+
          theme_bw()+
          ylab("Variance explained (%%)")+
          theme(axis.text.x = m_text,
                axis.title.y = m_text,
                axis.title.x = m_text,
                axis.text.y = m_text)

          ggsave("%(variance_plot_filename)s", width=10, height=10, unit="cm")

          PCs_df = data.frame(gene_pca$x)
          PCs_df['sample'] <- rownames(PCs_df)
          design['sample'] <- gsub("-", ".", rownames(design))

          PCs_df = merge(PCs_df, design)

          PCs_df$id_1 = sapply(strsplit(PCs_df$sample, "\\\."), "[", 1)
          PCs_df$id_2 = sapply(strsplit(PCs_df$sample, "\\\."), "[", 2)
          PCs_df$id_3 = sapply(strsplit(PCs_df$sample, "\\\."), "[", 3)

          p_pca = ggplot(PCs_df, aes(x=%(x_axis)s, y=%(y_axis)s)) +
          geom_point(size=3,
                     aes(shape=as.factor(%(shape)s),
                         colour=as.factor(%(colour)s))) +
          scale_colour_discrete(name=guide_legend(title='%(colour)s')) +
          scale_shape_discrete(name=guide_legend(title='%(shape)s')) +
          xlab(paste0('PC%(pc_number_1)i (Variance explained = ' ,
                       round(100 * variance_explained[%(pc_number_1)i], 1),
                       '%%)')) +
          ylab(paste0('PC%(pc_number_2)i (Variance explained = ' ,
                       round(100 * variance_explained[%(pc_number_2)i], 1),
                       '%%)')) +
          theme_bw() +
          theme(axis.text.x = s_text, axis.text.y = s_text,
                title = m_text, legend.text = m_text,
                legend.title = m_text)

          ggsave("%(pca_plot_filename)s", width=10, height=10, unit="cm")

        }''' % locals())

        makePCA(r_counts, r_design)
Example #40
def main():
    '''
    maine
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir     = myCommandLine.args['outDir']
    group1     = myCommandLine.args['group1']
    group2     = myCommandLine.args['group2']
    batch      = myCommandLine.args['batch']  
    matrix     = myCommandLine.args['matrix']
    prefix     = myCommandLine.args['prefix']
    formula    = myCommandLine.args['formula']

    print("running DESEQ2 %s" % prefix, file=sys.stderr)

    # make the quant DF
    quantDF  = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)

    # import formula
    formulaDF     = pd.read_csv(formula,header=0, sep="\t",index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)


    if "batch" in list(formulaDF):
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")
   

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods   = importr('methods')
    deseq     = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman     = importr('qqman')



    ### RUN DESEQ2 ###
    R.assign('df', df)
    R.assign('sampleTable', sampleTable)
    R.assign('design',design)
    R('dds <- DESeqDataSetFromMatrix(countData = df, colData = sampleTable, design = design)')
    R('dds <- DESeq(dds)')
    R('name <- grep("condition", resultsNames(dds), value=TRUE)')

    ###
    ###
    # Get Results and shrinkage values
    res    = R('results(dds, name=name)')
    resLFC = R('lfcShrink(dds, coef=name)')
    vsd    = R('vst(dds,blind=FALSE)')
    resdf  = robjects.r['as.data.frame'](res) 
    reslfc = robjects.r['as.data.frame'](resLFC)
    dds    = R('dds')

    
    ### Plotting section ###
    # plot MA and PC stats for the user
    plotMA    = robjects.r['plotMA']
    plotDisp  = robjects.r['plotDispEsts']
    plotPCA   = robjects.r['plotPCA']
    plotQQ    = robjects.r['qq']
    
    # get pca data
    if "batch" in list(formulaDF):
        pcaData    = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")), returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    else:
        print(vsd)
        pcaData    = plotPCA(vsd, intgroup="condition", returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    # arrange 


    data_folder = os.path.join(os.getcwd(), outdir)
    qcOut = os.path.join(data_folder, "%s_QCplots_%s_v_%s.pdf"  % (prefix,group1,group2))
    
    grdevices.pdf(file=qcOut)

    x = "PC1: %s%% variance" % int(percentVar[0]*100)
    y = "PC2: %s%% variance" % int(percentVar[1]*100)

    if "batch" in list(formulaDF):
        pp = ggplot2.ggplot(pcaData) + \
                ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()

    else:
        pp = ggplot2.ggplot(pcaData) + \
                ggplot2.aes_string(x="PC1", y="PC2", color="condition") + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()
    pp.plot()
    plotMA(res, ylim=robjects.IntVector((-3,3)), main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3,3)), main="MA-plot LFC shrinkage")
    plotQQ(reslfc.rx2('pvalue'), main="LFC shrinkage pvalue QQ")
    hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic() + \
            ggplot2.ggtitle("pvalue distribution")
    hh.plot()
    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()


    data_folder = os.path.join(os.getcwd(), outdir)
    lfcOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results_shrinkage.tsv"  % (prefix,group1,group2))
    resOut = os.path.join(data_folder, "%s_%s_v_%s_deseq2_results.tsv"  % (prefix,group1,group2))
   
    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
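A sketch of the sample table this script appears to expect: one row per sample, a condition column, and an optional batch column whose presence switches the design to ~ batch + condition. Sample and level names below are hypothetical.

import pandas as pd

# hypothetical colData / formula table; the index should match the count matrix columns
sampleTable = pd.DataFrame(
    {"condition": ["group1", "group1", "group2", "group2"],
     "batch":     ["A", "B", "A", "B"]},
    index=["s1", "s2", "s3", "s4"])

# a "batch" column selects the batch-corrected design, mirroring the logic above
design = "~ batch + condition" if "batch" in list(sampleTable) else "~ condition"
print(design)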
Example #41
0
def taxonomy(all_dataframes, sql_match_tables, errors_dict, project_code,
             login_info):
    errorLog("Function - taxonomy")
    message = "Custom Taxonomy: Start checks."
    statusLog("Starting Taxonomy Checks")
    errorLog(message)
    errorLog("project code: %s" % project_code)
    login_info = login_info.strip().split("-")
    login = str(login_info[0])
    agency = str(login_info[1])
    owner = str(login_info[2])
    year = str(login_info[3])
    project = str(login_info[4])

    assignment_table = ""
    custom_checks = ""
    summary_checks = ""
    summary_results_link = ""
    custom_redundant_checks = ""
    custom_errors = []
    custom_warnings = []
    custom_redundant_errors = []
    custom_redundant_warnings = []

    TIMESTAMP = str(session.get('key'))

    ### get date and time
    gettime = int(time.time())
    timestamp_date = datetime.datetime.fromtimestamp(gettime)

    # add submitted table names to list
    tables = []
    # match tablenames to tabs
    errorLog(all_dataframes.keys())
    for dataframe in all_dataframes.keys():
        df_sheet_and_table_name = dataframe.strip().split(" - ")
        errorLog(df_sheet_and_table_name)
        table_name = str(df_sheet_and_table_name[2])
        errorLog(table_name)
        if table_name == "tbl_taxonomysampleinfo":
            tables.append("sampleinfo")
            sampleinfo = all_dataframes[dataframe]
            sampleinfo['tmp_row'] = sampleinfo.index
        if table_name == "tbl_taxonomyresults":
            tables.append("result")
            result = all_dataframes[dataframe]
            result['tmp_row'] = result.index

    try:
        #####################
        ## CHECK FUNCTIONS ##
        #####################
        def checkData(statement, column, warn_or_error, error_label,
                      human_error, dataframe):
            errorLog("checkData warn_or_error: %s" % error_label)
            for item_number in statement:
                unique_error = '{"column": "%s", "error_type": "%s", "error": "%s"}' % (
                    column, warn_or_error, human_error)
                if error_label == 'error':
                    addErrorToList("custom_errors", item_number, unique_error,
                                   dataframe)
                    errorsCount(errors_dict, 'custom')
                if error_label == 'warning':
                    addErrorToList("custom_errors", item_number, unique_error,
                                   dataframe)
                    # do not count warnings as errors - submission allowed - errorsCount('custom')
        def checkLogic(statement, column, warn_or_error, error_label,
                       human_error, dataframe):
            for item_number in statement:
                unique_error = '{"column": "%s", "error_type": "%s", "error": "%s"}' % (
                    column, warn_or_error, human_error)
                addErrorToList("custom_errors", item_number, unique_error,
                               dataframe)
                errorsCount(errors_dict, 'custom')

        ##################
        ## LOGIC CHECKS ##
        ##################
        errorLog("Starting Taxonomy Logic Checks")
        statusLog("Starting Taxonomy Logic Checks")
        # each sampleinfo information record must have a corresponding result record. records are matched on stationcode, sampledate, fieldreplicate.
        errorLog(
            "## EACH SAMPLEINFO INFORMATION RECORD MUST HAVE A CORRESPONDING RESULT RECORD. RECORDS ARE MATCHED ON STATIONCODE, SAMPLEDATE, FIELDREPLICATE ##"
        )
        errorLog(sampleinfo[
            ~sampleinfo[['stationcode', 'sampledate', 'fieldreplicate']].
            isin(result[['stationcode', 'sampledate', 'fieldreplicate']].
                 to_dict(orient='list')).all(axis=1)])
        checkLogic(
            sampleinfo[
                ~sampleinfo[['stationcode', 'sampledate', 'fieldreplicate']].
                isin(result[['stationcode', 'sampledate', 'fieldreplicate']].
                     to_dict(orient='list')).all(axis=1)].index.tolist(),
            'StationCode/SampleDate/FieldReplicate', 'Logic Error', 'error',
            'Each Taxonomy SampleInfo record must have a corresponding Taxonomy Result record. Records are matched on StationCode,SampleDate, and FieldReplicate.',
            sampleinfo)
        errorLog(result[
            ~result[['stationcode', 'sampledate', 'fieldreplicate']].
            isin(sampleinfo[['stationcode', 'sampledate', 'fieldreplicate']].
                 to_dict(orient='list')).all(axis=1)])
        checkLogic(
            result[~result[['stationcode', 'sampledate', 'fieldreplicate']].
                   isin(sampleinfo[
                       ['stationcode', 'sampledate', 'fieldreplicate']].
                        to_dict(orient='list')).all(axis=1)].index.tolist(),
            'StationCode/SampleDate/FieldReplicate', 'Logic Error', 'error',
            'Each Taxonomy Result record must have a corresponding Taxonomy SampleInfo record. Records are matched on StationCode,SampleDate, and FieldReplicate.',
            result)

        ###################
        ## CUSTOM CHECKS ##
        ###################
        message = "Starting Custom Taxonomy Checks"
        errorLog(message)
        statusLog(message)
        ## Jordan - Taxonomicqualifier Multi Value Lookup List: check to make sure taxonomicqualifier field data is valid (multiple values may be accepted).
        errorLog(result['taxonomicqualifier'])
        errorLog(
            "Taxonomicqualifier Multi Value Lookup List: check to make sure taxonomicqualifier field data is valid (multiple values may be accepted)."
        )
        nan_rows, invalid_codes, subcodes = dcValueAgainstMultipleValues(
            current_app.eng, 'lu_taxonomicqualifier', 'taxonomicqualifiercode',
            result, 'taxonomicqualifier')
        errorLog("Check submitted data for at least one code:")
        checkData(
            nan_rows, 'TaxonomicQualifier', 'Custom Error', 'error',
            'At least one TaxonomicQualifier code required please check the list: <a href=http://smcchecker.sccwrp.org/smc/scraper?action=help&layer=lu_taxonomicqualifier target=_blank>lu_taxonomicqualifier</a>.',
            result)
        errorLog(
            "Check submitted data for invalid code (or code combination):")
        checkData(
            invalid_codes, 'TaxonomicQualifier', 'Custom Error', 'error',
            'At least one TaxonomicQualifier code is invalid please check the list: <a href=http://smcchecker.sccwrp.org/smc/scraper?action=help&layer=lu_taxonomicqualifier target=_blank>lu_taxonomicqualifier</a>',
            result)

        ## Jordan -  Sample/Result SampleDate field - make sure user did not accidentally drag down date
        errorLog(
            'Sample/Result SampleDate field - make sure user did not accidentally drag down date'
        )
        # If every date submitted is consecutive from the first, it will error out every row. Otherwise, no error is thrown.
        if sampleinfo.sampledate.diff()[1:].sum() == pd.Timedelta(
                '%s day' % (len(sampleinfo) - 1)):
            checkData(
                sampleinfo.loc[sampleinfo.sampledate.diff() == pd.Timedelta(
                    '1 day')].tmp_row.tolist(), 'SampleDate', 'Custom Error',
                'error',
                'Consecutive Dates. Make sure you did not accidentally drag down the date',
                sampleinfo)
        if result.sampledate.diff()[1:].sum() == pd.Timedelta(
                '%s day' % (len(result) - 1)):
            checkData(
                result.loc[result.sampledate.diff() == pd.Timedelta(
                    '1 day')].tmp_row.tolist(), 'SampleDate', 'Custom Error',
                'error',
                'Consecutive Dates. Make sure you did not accidentally drag down the date',
                result)

        ## Jordan - FinalID / LifeStageCode combination must match combination found in vw_organism_lifestage_lookup
        errorLog(
            'FinalID / LifeStageCode combination must match combination found in vw_organism_lifestage_lookup'
        )
        # build list of FinalID/LifeStageCode combinations from lookup lists
        eng = create_engine('postgresql://*****:*****@192.168.1.17:5432/smc')
        lu_organisms = "SELECT organismcode, finalid, lifestagecode FROM vw_organism_lifestage_lookup;"
        #lu_organismdetaillookup = "SELECT organismcode, lifestagecode FROM lu_organismdetaillookup;"
        organisms = pd.read_sql_query(lu_organisms, eng)
        #organismdetaillookup = pd.read_sql_query(lu_organismdetaillookup,eng)
        #valid_pairs = organisms.merge(organismdetaillookup, on = ['organismcode'], how = 'inner')
        valid_pairs_list = list(organisms['finalid'] + '_' +
                                organisms['lifestagecode'])

        # compare pairs of submitted FinalID / LifeStageCode to valid_pairings from lookup lists
        errorLog(
            "result where FinalID/LifeStageCode does not match pair from lookup list:"
        )
        errorLog(
            result[pd.Series(result.finalid + '_' +
                             result.lifestagecode).isin(valid_pairs_list)])

        # perform check on data
        checkData(
            result[~pd.Series(result.finalid + '_' + result.lifestagecode).
                   isin(valid_pairs_list)].tmp_row.tolist(),
            'FinalID/LifeStageCode', 'Undefined Error', 'error',
            'FinalID/LifeStageCode pair is not valid. Refer to <a href=http://smcchecker.sccwrp.org/smc/scraper?action=help&layer=vw_organism_lifestage_lookup target=_blank>vw_organism_lifestage_lookup</a> for valid pairings',
            result)
        #####################
        ## START MAP CHECK ##
        #####################
        # get a unique list of stations from results file
        rlist_of_stations = pd.unique(result['stationcode'])
        result_unique_stations = ','.join("'" + s + "'"
                                          for s in rlist_of_stations)

        ################
        ## NEW FIELDS ##
        ################
        sampleinfo['project_code'] = project_code
        result['project_code'] = project_code

        ############################  Note: failure to run csci should not result in a failure
        ## BUILD and Process CSCI ##        to submit data - csci status should always = 0
        ############################

        # Dont run csci code if there are custom errors - data must be clean
        total_count = errors_dict['total']
        errorLog("total error count: %s" % total_count)
        errorLog("project code: %s" % project_code)
        if total_count == 0:

            message = "Starting CSCI Processing..."
            errorLog(message)
            statusLog(message)
            msgs = []
            # combine results and sampleinfo on stationcode we want to get collectionmethod field from sampleinfo
            bugs = pd.merge(
                result,
                sampleinfo[[
                    'stationcode', 'fieldsampleid', 'fieldreplicate',
                    'collectionmethodcode'
                ]],
                on=['stationcode', 'fieldsampleid', 'fieldreplicate'],
                how='left')

            # original submitted stations
            list_of_original_unique_stations = pd.unique(bugs['stationcode'])
            errorLog("list_of_original_unique_stations:")
            errorLog(list_of_original_unique_stations)
            unique_original_stations = ','.join(
                "'" + s + "'" for s in list_of_original_unique_stations)

            # concatenate stationcode, sampledate, collectionmethod, fieldreplicate into one field called sampleid
            errorLog("create sampleid:")
            # first get adjusted date
            bugs["samplerealdate"] = bugs["sampledate"].dt.strftime(
                '%m%d%Y').map(str)
            bugs["samplemonth"] = bugs["sampledate"].dt.strftime('%m').map(str)
            bugs["sampleday"] = bugs["sampledate"].dt.strftime('%d').map(str)
            bugs["sampleyear"] = bugs["sampledate"].dt.strftime('%Y').map(str)
            # merge two
            bugs["codeanddate"] = bugs.stationcode.astype(str).str.cat(
                bugs['samplerealdate'], sep='_')
            # merge two
            bugs["collectionandreplicate"] = bugs.collectionmethodcode.astype(
                str).str.cat(bugs['fieldreplicate'].astype(str), sep='_')
            # merge both
            bugs["sampleid"] = bugs.codeanddate.str.cat(
                bugs.collectionandreplicate, sep='_')
            # drop temp columns
            bugs.drop(
                ['samplerealdate', 'codeanddate', 'collectionandreplicate'],
                axis=1,
                inplace=True)

            # BUGS IS BUILT OFF THE MERGENCE OF BUG FILE AND GISSTATIONCODEXWALK
            # BUT STATIONCODE SHOULD ACTUALLY BE GISCODE NOT STATIONCODE
            # ResultsTable:StationCode links to Crosswalk:StationCode, which links to GISMetrics:GISCode
            # call gisxwalk table using unique stationcodes and get databasecode and giscode
            errorLog("building xwalk...")
            eng = create_engine(
                'postgresql://*****:*****@192.168.1.17:5432/smc')
            sqlwalk = 'select stationcode,databasecode,giscode from lu_newgisstationcodexwalk where stationcode in (%s)' % unique_original_stations
            gisxwalk = pd.read_sql_query(sqlwalk, eng)

            #bugs = pd.merge(bugs,gisxwalk[['stationcode','giscode','databasecode']], on = ['stationcode'], how='inner')
            bugs = pd.merge(
                bugs,
                gisxwalk[['stationcode', 'giscode', 'databasecode']],
                on=['stationcode'],
                how='inner')

            # only process stations that have associated gismetric data
            missing_bugs_xwalk = set(list_of_original_unique_stations) - set(
                bugs.stationcode.tolist())

            # send email if stations missing GIS Metric data.
            if missing_bugs_xwalk:
                bad_stations = '\n'.join(str(x) for x in missing_bugs_xwalk)
                msgs.append('CSCI Error:\n')
                msgs.append(
                    'The following stations are missing GISXWalk data:\n')
                msgs.append(bad_stations)
                print(msgs)

            # original stations translated to smc stations using giscode
            list_of_unique_stations = pd.unique(bugs['giscode'])
            errorLog("list_of_unique_stations:")
            errorLog(list_of_unique_stations)
            unique_stations = ','.join("'" + s + "'"
                                       for s in list_of_unique_stations)

            #### STATIONS IS BUILT OFF THE MERGENCE OF BUG FILE AND GISMETRICS
            errorLog("building gismetrics...")
            sqlmetrics = 'select * from tbl_newgismetrics'
            gismetrics = pd.read_sql_query(sqlmetrics, eng)
            # merge gismetrics and gisxwalk to get giscode into dataframe
            # merge bugs/stationcode and gismetrics/giscode
            # check stations
            test_stations = pd.unique(bugs['stationcode'])
            # problem - gismetrics stationcode is replacing bugs-originalsubmission stationcode thats a problem
            errorLog(test_stations)
            # copy bugs.stationcode to retain in stations below
            bugs['original_stationcode'] = bugs['stationcode']
            stations = pd.merge(gismetrics,
                                bugs[['giscode', 'original_stationcode']],
                                left_on=['stationcode'],
                                right_on=['giscode'],
                                how='inner')
            # drop gismetrics stationcode
            stations.drop(['stationcode'], axis=1, inplace=True)
            stations.rename(columns={'original_stationcode': 'stationcode'},
                            inplace=True)
            eng.dispose()
            # check stations
            test2_stations = pd.unique(stations['stationcode'])
            errorLog(test2_stations)

            # only process stations that have associated gismetric data
            missing_bugs_stations = set(list_of_unique_stations) - set(
                bugs.giscode.tolist())
            missing_stations_stations = set(list_of_unique_stations) - set(
                stations.giscode.tolist())

            # send email if stations missing GIS Metric data.
            if missing_bugs_stations | missing_stations_stations:
                bad_stations = '\n'.join(
                    str(x) for x in missing_bugs_stations.union(
                        missing_stations_stations))
                msgs.append('CSCI Error:\n')
                msgs.append(
                    'The following stations are missing GISMetric data:\n')
                msgs.append(bad_stations)
                print(msgs)

            # drop unnecessary columns
            bugs.drop(bugs[[
                'fieldsampleid', 'unit', 'excludedtaxa',
                'personnelcode_labeffort', 'personnelcode_results',
                'enterdate', 'taxonomicqualifier', 'qacode', 'resqualcode',
                'labsampleid', 'benthicresultscomments',
                'agencycode_labeffort', 'tmp_row', 'result'
            ]],
                      axis=1,
                      inplace=True)

            # if row exists drop row, errors, and lookup_error
            if 'row' in bugs.columns:
                bugs.drop(bugs[['row', 'errors']], axis=1, inplace=True)
            if 'lookup_error' in bugs.columns:
                bugs.drop(bugs[['lookup_error']], axis=1, inplace=True)
            stations.drop(stations[['objectid', 'gdb_geomattr_data', 'shape']],
                          axis=1,
                          inplace=True)

            # rename field
            bugs = bugs.rename(
                columns={
                    'stationcode': 'StationCode',
                    'sampledate': 'SampleDate',
                    'fieldreplicate': 'FieldReplicate',
                    'collectionmethodcode': 'CollectionMethodCode',
                    'finalid': 'FinalID',
                    'lifestagecode': 'LifeStageCode',
                    'baresult': 'BAResult',
                    'databasecode': 'DatabaseCode',
                    'sampleid': 'SampleID',
                    'distinctcode': 'Distinct'
                })
            errorLog(bugs)

            # drop all duplicates
            stations.drop_duplicates(inplace=True)
            errorLog(stations)
            bugs_count = len(bugs.index)
            stations_count = len(stations.index)
            errorLog("bugs_count:")
            errorLog(bugs_count)
            errorLog("stations_count:")
            errorLog(stations_count)

            # UPDATE: If bugs or stations are empty, CSCI cannot be processed. -Jordan 4/23/2019
            if bugs_count == 0 or stations_count == 0:
                errorLog(
                    "bugs and stations could not be built. Do not process CSCI."
                )
                checkData(
                    sampleinfo.tmp_row.tolist(), 'stationcode',
                    'Undefined Warning', 'warning',
                    'The data you submitted does not meet the minimum requirements to process CSCI. You may continue submitting, but CSCI Reports cannot be generated at this time.',
                    sampleinfo)
            else:
                # Import and Execute cleanData and CSCI functions
                import rpy2
                import rpy2.robjects as robjects
                from rpy2.robjects import pandas2ri
                import rpy2.robjects.packages as rpackages
                from rpy2.robjects.packages import importr
                import rpy2.rinterface as rinterface

                # shortens notation for accessing robjects
                r = robjects.r

                # imports R package: CSCI
                CSCI = importr('CSCI')

                # convert cleanData() and CSCI() functions from CSCI package to python
                cd = CSCI.cleanData
                csci = CSCI.CSCI

                # collect errors and error counts for each group
                error_count = {'clean data': 0, 'CSCI': 0}
                cd_group_errors = []
                csci_group_errors = []

                # process cleanData and CSCI for each Sample
                bugs_grouped = bugs.groupby(['SampleID'])

                # open log file for printing status
                TIMESTAMP = str(int(round(time.time() * 1000)))
                logfile = '/var/www/smc/testfiles/' + TIMESTAMP + '.log'

                # Activate R to Python DataFrame conversions
                pandas2ri.activate()

                start_time = int(time.time())
                count = 0
                for name, group in bugs_grouped:
                    # print current group
                    print("group name: %s" % name)
                    bug_sample_id = name

                    # group stationcode to get just one
                    single_station = group.StationCode.unique()

                    # check to make sure there is only one
                    print("stations_grouped: %s" % single_station[0])

                    # find stationcode that matches between the bugs record and what is in stations
                    station = stations.loc[stations['stationcode'] ==
                                           single_station[0]]

                    # convert group, station to R dataframe
                    errorLog("convert group, station to R dataframe")
                    group = pandas2ri.py2ri(group)
                    station = pandas2ri.py2ri(station)

                    # copy of group
                    errorLog(
                        "make a copy of group and adjust sampledate fields")
                    group_copy = group
                    #errorLog("group_copy:")
                    #errorLog(group_copy)
                    group_copy = pandas2ri.ri2py(group_copy)
                    #errorLog("list group copy:")
                    #errorLog(list(group_copy))
                    group_copy.columns = [
                        x.lower() for x in group_copy.columns
                    ]
                    # get samplemonth, sampleday, sampleyear for later use
                    #group_copy["sampledate"] = pd.datetime.strptime(group_copy['sampledate'], '%Y-%m-%d')
                    #group_copy["samplemonth"] = group_copy.sampledate.dt.month
                    #group_copy["sampleday"] = group_copy.sampledate.dt.day
                    #group_copy["sampleyear"] = group_copy.sampledate.dt.year
                    '''
                                # clean group with cleanData()
                                cd_list = cd(group,msgs=True)
                                group = cd_list[0]
                                warn_msg = cd_list[1]

                                # if data cannot be cleaned, prepare email message
                                if warn_msg[0] != 'Data already clean':
                                    errorLog('cleanData Failed:\n')
                                    bad_station = 'cleanData failed on station %s:\n' %single_station[0]
                                    bad_group = 'Sample %s could not be cleaned because %s.' %(bug_sample_id,warn_msg[0])
                                    errorLog(bad_station)
                                    errorLog(bad_group)
                                    msgs.append('CSCI Error:\n')
                                    msgs.append(bad_station)
                                    msgs.append(bad_group)
                                    
                                else:
                                '''
                    try:
                        errorLog("data is clean process csci")
                        errorLog(station)
                        errorLog(group)
                        report = csci(group, station)

                        # assign csci elements to proper tables
                        errorLog("assign elements to specific tables")
                        core = pandas2ri.ri2py(report[0])
                        s1mmi = pandas2ri.ri2py(report[1])
                        s1grps = pandas2ri.ri2py(report[2])
                        s1oe = pandas2ri.ri2py(report[3])
                        s2oe = pandas2ri.ri2py(report[4])
                        s2mmi = pandas2ri.ri2py(report[5])

                        # fields that need to be filled
                        errorLog("first - csci")
                        errorLog(core)
                        core.columns = [x.lower() for x in core.columns]
                        core['processed_by'] = "checker"
                        core['cleaned'] = "Yes"
                        core['scorenotes'] = "Distinct set to NA"
                        core['rand'] = 2
                        core['scoredate'] = timestamp_date
                        core[
                            'record_origin'] = project  # should probably be SMC
                        core['origin_lastupdatedate'] = timestamp_date
                        core['record_publish'] = "False"
                        core = pd.merge(core,
                                        group_copy[[
                                            'sampleid', 'sampledate',
                                            'sampleday', 'samplemonth',
                                            'sampleyear',
                                            'collectionmethodcode',
                                            'fieldreplicate'
                                        ]],
                                        on=['sampleid'],
                                        how='left')
                        core['sampledate'] = pd.to_datetime(core['sampledate'],
                                                            unit='s').dt.date
                        core = core.drop_duplicates()
                        core_file = "/var/www/smc/logs/%s.core.csv" % TIMESTAMP
                        # only show header once

                        if count == 0:
                            core.to_csv(core_file,
                                        sep=',',
                                        mode='a',
                                        encoding='utf-8',
                                        index=False)
                        else:
                            # skip next loop
                            core.to_csv(core_file,
                                        sep=',',
                                        mode='a',
                                        encoding='utf-8',
                                        index=False,
                                        header=False)

                        errorLog("second - s1mmi")
                        s1mmi.columns = [x.lower() for x in s1mmi.columns]
                        s1mmi['processed_by'] = "checker"
                        s1mmi.rename(columns={
                            'coleoptera_percenttaxa_predicted':
                            'coleoptera_percenttaxa_predict'
                        },
                                     inplace=True)
                        s1mmi[
                            'record_origin'] = project  # should probably be SMC
                        s1mmi['origin_lastupdatedate'] = timestamp_date
                        s1mmi['record_publish'] = "False"
                        s1mmi = s1mmi.drop_duplicates()
                        s1mmi_file = "/var/www/smc/logs/%s.Suppl1_mmi.csv" % TIMESTAMP
                        # only show header once
                        if count == 0:
                            s1mmi.to_csv(s1mmi_file,
                                         sep=',',
                                         mode='a',
                                         encoding='utf-8',
                                         index=False)
                        else:
                            # skip next loop
                            s1mmi.to_csv(s1mmi_file,
                                         sep=',',
                                         mode='a',
                                         encoding='utf-8',
                                         index=False,
                                         header=False)

                        errorLog("third - s2mmi")
                        s2mmi.columns = [x.lower() for x in s2mmi.columns]
                        s2mmi['processed_by'] = "checker"
                        s2mmi[
                            'record_origin'] = project  # should probably be SMC
                        s2mmi['origin_lastupdatedate'] = timestamp_date
                        s2mmi['record_publish'] = "False"
                        s2mmi = s2mmi.drop_duplicates()
                        s2mmi_file = "/var/www/smc/logs/%s.Suppl2_mmi.csv" % TIMESTAMP
                        # only show header once
                        if count == 0:
                            s2mmi.to_csv(s2mmi_file,
                                         sep=',',
                                         mode='a',
                                         encoding='utf-8',
                                         index=False)
                        else:
                            # skip next loop
                            s2mmi.to_csv(s2mmi_file,
                                         sep=',',
                                         mode='a',
                                         encoding='utf-8',
                                         index=False,
                                         header=False)

                        errorLog("fourth - s1grps")
                        s1grps.columns = [x.lower() for x in s1grps.columns]
                        s1grps['processed_by'] = "checker"
                        s1grps[
                            'record_origin'] = project  # should probably be SMC
                        s1grps['origin_lastupdatedate'] = timestamp_date
                        s1grps['record_publish'] = "False"
                        s1grps = s1grps.drop_duplicates()
                        s1grps_file = "/var/www/smc/logs/%s.Suppl1_grps.csv" % TIMESTAMP
                        # only show header once
                        if count == 0:
                            s1grps.to_csv(s1grps_file,
                                          sep=',',
                                          mode='a',
                                          encoding='utf-8',
                                          index=False)
                        else:
                            # skip next loop
                            s1grps.to_csv(s1grps_file,
                                          sep=',',
                                          mode='a',
                                          encoding='utf-8',
                                          index=False,
                                          header=False)

                        errorLog("fifth - s1oe")
                        s1oe.columns = [x.lower() for x in s1oe.columns]
                        #print s1oe
                        #s1oe['objectid'] = s1oe.apply(lambda x: int(x.objectid) + x.index, axis=1)
                        s1oe['processed_by'] = "checker"
                        s1oe[
                            'record_origin'] = project  # should probably be SMC
                        s1oe['origin_lastupdatedate'] = timestamp_date
                        s1oe['record_publish'] = "False"
                        s1oe = s1oe.drop_duplicates()
                        s1oe_file = "/var/www/smc/logs/%s.Suppl1_OE.csv" % TIMESTAMP
                        # only show header once
                        if count == 0:
                            s1oe.to_csv(s1oe_file,
                                        sep=',',
                                        mode='a',
                                        encoding='utf-8',
                                        index=False)
                        else:
                            # skip next loop
                            s1oe.to_csv(s1oe_file,
                                        sep=',',
                                        mode='a',
                                        encoding='utf-8',
                                        index=False,
                                        header=False)

                        errorLog("sixth - s2oe")
                        s2oe.columns = [x.lower() for x in s2oe.columns]
                        # fill na with -88
                        #s2oe.fillna(-88, inplace=True)
                        s2oe['captureprob'].replace(['NA'], -88, inplace=True)
                        s2oe['processed_by'] = "checker"
                        s2oe[
                            'record_origin'] = project  # should probably be SMC
                        s2oe['origin_lastupdatedate'] = timestamp_date
                        s2oe['record_publish'] = "False"
                        s2oe = s2oe.drop_duplicates()
                        s2oe_file = "/var/www/smc/logs/%s.Suppl2_OE.csv" % TIMESTAMP
                        # only show header once
                        if count == 0:
                            s2oe.to_csv(s2oe_file,
                                        sep=',',
                                        mode='a',
                                        encoding='utf-8',
                                        index=False)
                        else:
                            # skip next loop
                            s2oe.to_csv(s2oe_file,
                                        sep=',',
                                        mode='a',
                                        encoding='utf-8',
                                        index=False,
                                        header=False)

                        summary_results_link = TIMESTAMP

                        count = count + 1
                        #file_to_get = "/var/www/smc/logs/%s.core.csv" % TIMESTAMP
                        #errorLog("file to get:")
                        #errorLog(file_to_get)
                        #all_dataframes["2 - core_csv - tmp_cscicore"] = pd.read_csv('/var/www/smc/logs/%s.core.csv' % TIMESTAMP)
                        #all_dataframes["2 - core_csv - tmp_cscicore"].columns = [x.lower() for x in all_dataframes["2 - core_csv - tmp_cscicore"].columns]

                        ## WHAT HAPPENS IF CSCI SCORE IS ALREADY IN DATABASE - MAY WANT TO CHECK ABOVE
                        #errorLog("print core_csv columns:")
                        #errorLog(list(all_dataframes["2 - core_csv - tmp_cscicore"]))
                        #errorLog("remove index:")
                        #all_dataframes["2 - core_csv - tmp_cscicore"].drop(['unnamed: 0'],axis=1, inplace=True)
                        #errorLog(list(all_dataframes["2 - core_csv - tmp_cscicore"]))
                        #errorLog(all_dataframes["2 - core_csv - tmp_cscicore"])
                        #
                        #summary_results_link = 'http://smcchecker.sccwrp.org/smc/logs/%s.core.csv' % TIMESTAMP
                        #summary_results_link = TIMESTAMP

                        ### IMPORTANT LOAD ONE CSCI FIELD FROM CSV FILE AND MAP IT TO EXISTING BUGS/STATIONS DATAFRAME THEN OUTPUT TO CSV LOAD FILE FOR IMPORT
                        ### AT STAGING INTO DATABASES
                        message = "Success CSCI"
                        errorLog(message)
                        '''
                                        # code below wont work do to sampledate getting changed to number instead of date - fails on submission
                                        all_dataframes["2 - CSCI_Core - csci_core"] = core
                                        all_dataframes["3 - CSCI_Suppl1_MMI - csci_suppl1_mmi"] = s1mmi
                                        all_dataframes["4 - CSCI_Suppl2_MMI - csci_suppl2_mmi"] = s2mmi
                                        all_dataframes["5 - CSCI_Suppl1_GRPS - csci_suppl1_grps"] = s1grps
                                        all_dataframes["6 - CSCI_Suppl1_OE - csci_suppl1_oe"] = s1oe
                                        all_dataframes["7 - CSCI_Suppl2_OE - csci_suppl2_oe"] = s2oe
                                        '''

                        message = str(msgs)
                        state = 0
                    except Exception as e:
                        # here is where we email sccwrp to let them know we couldnt get csci score for sampleid - we still need load the data and try to load other sampleids
                        bad_station = '\n CSCI Processing Failed on station %s:\n' % single_station[
                            0]
                        bad_group = 'Sample %s could not be processed because %s.\n' % (
                            bug_sample_id, e)

                        msgs.append('CSCI Error:\n')
                        msgs.append(bad_station)
                        msgs.append(bad_group)

                        errorLog("CSCI ran into the following error: %s" % e)
                        msgs.append('Failed to run csci\n')

                all_dataframes["2 - CSCI_Core - csci_core"] = pd.read_csv(
                    "/var/www/smc/logs/%s.core.csv" % TIMESTAMP)
                all_dataframes[
                    "3 - CSCI_Suppl1_MMI - csci_suppl1_mmi"] = pd.read_csv(
                        "/var/www/smc/logs/%s.Suppl1_mmi.csv" % TIMESTAMP)
                all_dataframes[
                    "4 - CSCI_Suppl2_MMI - csci_suppl2_mmi"] = pd.read_csv(
                        "/var/www/smc/logs/%s.Suppl2_mmi.csv" % TIMESTAMP)
                all_dataframes[
                    "5 - CSCI_Suppl1_GRPS - csci_suppl1_grps"] = pd.read_csv(
                        "/var/www/smc/logs/%s.Suppl1_grps.csv" % TIMESTAMP)
                all_dataframes[
                    "6 - CSCI_Suppl1_OE - csci_suppl1_oe"] = pd.read_csv(
                        "/var/www/smc/logs/%s.Suppl1_OE.csv" % TIMESTAMP)
                all_dataframes[
                    "7 - CSCI_Suppl2_OE - csci_suppl2_oe"] = pd.read_csv(
                        "/var/www/smc/logs/%s.Suppl2_OE.csv" % TIMESTAMP)
                message = msgs
                errorLog(message)
                state = 0

        for dataframe in all_dataframes.keys():
            if 'custom_errors' in all_dataframes[dataframe]:
                custom_errors.append(
                    getCustomErrors(all_dataframes[dataframe], dataframe,
                                    'custom_errors'))
                custom_redundant_errors.append(
                    getCustomRedundantErrors(all_dataframes[dataframe],
                                             dataframe, "custom_errors"))
            if 'custom_warnings' in all_dataframes[dataframe]:
                errorLog("custom_warnings")
                custom_errors.append(
                    getCustomErrors(all_dataframes[dataframe], dataframe,
                                    'custom_warnings'))
                errorLog(custom_warnings)
                custom_redundant_errors.append(
                    getCustomRedundantErrors(all_dataframes[dataframe],
                                             dataframe, "custom_warnings"))
        custom_checks = json.dumps(custom_errors, ensure_ascii=True)
        custom_redundant_checks = json.dumps(custom_redundant_errors,
                                             ensure_ascii=True)
        ## END RETRIEVE ERRORS ##
        # get filenames from fileupload routine
        errorLog(message)
        #assignment_table = result.groupby(['stationid','lab','analyteclass']).size().to_frame(name = 'count').reset_index()
        # lets reassign the analyteclass field name to species so the assignment query will run properly - check StagingUpload.py for details
        #assignment_table = assignment_table.rename(columns={'analyteclass': 'species'})
        return assignment_table, custom_checks, custom_redundant_checks, summary_checks, summary_results_link, message, result_unique_stations
    except ValueError:
        message = "Critical Error: Failed to run taxonomy checks"
        errorLog(message)
        state = 1
        return jsonify(message=message, state=state)
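The 'drag-down date' check above uses a simple heuristic: if every submitted date is exactly one day after the previous one, the day-to-day differences sum to (n - 1) days and every consecutive row gets flagged. A small standalone sketch with toy data:

import pandas as pd

# toy submission in which the date was dragged down in a spreadsheet
sampleinfo = pd.DataFrame({"sampledate": pd.to_datetime(
    ["2020-01-01", "2020-01-02", "2020-01-03", "2020-01-04"])})
sampleinfo["tmp_row"] = sampleinfo.index

diffs = sampleinfo.sampledate.diff()
if diffs[1:].sum() == pd.Timedelta("%s day" % (len(sampleinfo) - 1)):
    flagged = sampleinfo.loc[diffs == pd.Timedelta("1 day")].tmp_row.tolist()
    print("consecutive dates, flagged rows:", flagged)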
Example #42
0
def trt(df1, treated, control):  # pandas df, treated columns, control columns
    df = df1.copy()
    combined = treated + control
    df[combined] = df[combined].astype(float)
    df = df.reset_index()
    del df['index']
    names = {}
    ren = {}
    count = 1
    tr = []
    ct = []
    for i in df.columns:
        col = 'column{}'.format(count)
        names[col] = i
        ren[i] = col
        if i in treated:
            tr.append(col)
        elif i in control:
            ct.append(col)
        count += 1
    df.rename(columns=ren, inplace=True)
    c = "library(limma)"
    ro.r(c)
    c = "library(qvalue)"
    ro.r(c)
    ctot = ct + tr
    rdf = pandas2ri.py2ri(df)
    ro.globalenv['data'] = rdf
    c = "str(data)"
    c = "data[ is.na(data) ] <- NA"
    ro.r(c)
    c = "tr <- c{}".format(tuple(tr))
    ro.r(c)
    #print c
    c = "ct <- c{}".format(tuple(ct))
    ro.r(c)
    #print c
    c = "str(ct)"
    #print ro.r(c)
    #c='''source("http://www.biostat.jhsph.edu/~kkammers/software/eupa/source.functions.r")'''
    c = '''source("source.functions.r.txt")'''
    ro.r(c)
    c = 'data'
    design = []
    [design.append(2) for i in tr]
    [design.append(1) for i in ct]
    c = "design <- model.matrix(~factor(c{}))".format(tuple(design))
    #print c
    ro.r(c)
    #print ro.r('str(data)')
    c = '''colnames(design) <- c("Intercept", "Diff")'''
    ro.r(c)
    c = 'res.eb <- trt.fit(data[, c(tr,ct)], design)'
    ro.r(c)
    c = "res.eb"
    #print ro.r(c)
    dfebi = pandas2ri.ri2py(ro.r[c])
    assert 'index' not in dfebi.columns
    dfebi.index = dfebi.reset_index()['index'].apply(int)
    df = df.join(dfebi)
    df = df.rename(columns=names)
    df = df[df['adj.P.Val'].notnull()]
    df = df.sort_values('adj.P.Val', ascending=True)
    return df
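A hypothetical call to trt(); it assumes an R installation with the limma and qvalue packages and a local source.functions.r.txt that defines trt.fit(), as referenced above.

import numpy as np
import pandas as pd

# toy log-intensity matrix: 100 proteins, two treated and two control columns
data = pd.DataFrame(np.random.randn(100, 4), columns=["t1", "t2", "c1", "c2"])

# moderated t-test of treated vs control; returns the input joined with
# limma's statistics (adj.P.Val etc.), sorted by adjusted p-value
result = trt(data, treated=["t1", "t2"], control=["c1", "c2"])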
Example #43
0
def ANOVA_RM(dataframe, val, var, subject):  # pandas df, col to do anova on
    #c = "sink('{}')".format(outpath); ro.r(c)
    c = "library('FSA')"
    ro.r(c)
    c = "library('nlme')"
    ro.r(c)
    c = "require(multcomp)"
    ro.r(c)
    #var_dct = {}
    #count = 0
    #for v in list(set(dataframe[var])):
    #    var_dct[v] = str(count)
    #    count += 1
    #dataframe['tempvar'] = dataframe[var].apply(lambda x : var_dct[x])
    rdf = pandas2ri.py2ri(dataframe)
    ro.globalenv['data'] = rdf
    #c = "library(plyr)";ro.r(c)
    try:
        c = "your.bartlett = bartlett.test(data${}~data${})".format(val, var)
        ro.r(c)
        c = "print(your.bartlett)"
        ro.r(c)
        bartlett = robjects.r('your.bartlett$p.value')[0]
    except:
        bartlett = np.nan
    try:
        c = "your.fligner = fligner.test(data${}~data${})".format(val, var)
        ro.r(c)
        c = "print(your.fligner)"
        ro.r(c)
        fligner = robjects.r('your.fligner$p.value')[0]
    except:
        fligner = np.nan

    #print bartlett, fligner
    try:
        c = "your.lme =  lme({} ~ {}, data=data,random=~1|{}/{})".format(
            val, var, subject, var)
        ro.r(c)
        c = "your.anova = anova(your.lme)"
        ro.r(c)
        c = "your.anova"
        an = pandas2ri.ri2py(ro.r[c])
        an = an.loc[var].to_dict()
        c = "your.sum = summary(glht(your.lme, linfct=mcp({} = 'Tukey')), test = adjusted(type = 'bonferroni'))".format(
            var)
        ro.r(c)
        c = "pvals = your.sum[10]$test$pvalues"
        ro.r(c)
        c = 'data.frame = data.frame(as.list(pvals))'
        ro.r(c)
        c = 'data.frame'
        df = pandas2ri.ri2py(ro.r[c])
        df = df.reset_index()
        del df['index']
        df.loc[0, 'Tukey correction'] = 'Bonferroni'
        for key in an:
            df.loc[0, 'ANOVA ' + key] = an[key]
    except:
        df = pd.DataFrame()

    return df
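A hypothetical call to ANOVA_RM(); the data are assumed to be in long format with one measurement per row, a value column, a grouping variable and a subject identifier for the random effect. Requires R with the FSA, nlme and multcomp packages.

import numpy as np
import pandas as pd

# toy repeated-measures data: 3 conditions measured on each of 8 subjects
long_df = pd.DataFrame({
    "value":   np.random.randn(24),
    "group":   ["a", "b", "c"] * 8,
    "subject": np.repeat(["s%d" % i for i in range(8)], 3),
})

# one-row DataFrame of pairwise (Tukey, Bonferroni-adjusted) p-values plus ANOVA stats
stats_df = ANOVA_RM(long_df, "value", "group", "subject")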
Example #44
0
def import_data(out_itr,
                evalTime,
                categorical_columns=None,
                continuous_columns=None):
    """Preprocess the data used by the model for training, validation and prediction.
     Arguments:
        out_itr: index of the 5-fold cross-validation split (out of 5 simulated datasets)
        evalTime: evaluation times
        categorical_columns: a list of names of the categorical columns in the dataframe
        continuous_columns: a list of names of the continuous columns in the dataframe
    Returns:
        All the arrays and dimensions used by the model to train, validate and predict
    """

    ### Loading Data from the folder named as the dataset (Synthetic/WIHS/SEER) in the code directory
    train_df = pd.read_csv('Synthetic/train_data_{}.csv'.format(out_itr))
    val_df = pd.read_csv('Synthetic/valid_data_{}.csv'.format(out_itr))
    test_df = pd.read_csv('Synthetic/test_data_{}.csv'.format(out_itr))

    #Add a 'train' column to the training, validation and test data and combine them.
    #Then convert the categorical variables into dummy variables on the combined data
    #so that the number of columns in all three datasets remains equal.
    train_df['train'] = 1
    val_df['train'] = 2
    test_df['train'] = 3
    df = pd.concat([train_df, val_df, test_df])

    #Convert the categorical variables into dummy variables
    if categorical_columns is not None:
        df = to_one_hot(df, categorical_columns)
    train_data = df[df['train'] == 1]
    val_data = df[df['train'] == 2]
    test_data = df[df['train'] == 3]

    #Drop the 'train' column from all three datasets.
    train_data = train_data.drop(columns=['train'])
    val_data = val_data.drop(columns=['train'])
    test_data = test_data.drop(columns=['train'])

    #Standardize the continuous columns
    if continuous_columns is not None:
        train_data = standardized(train_data, train_data, continuous_columns)
        val_data = standardized(train_data, val_data, continuous_columns)
        test_data = standardized(train_data, test_data, continuous_columns)

    #Full Dataset
    dataset = df.drop(columns=['train'])
    label = np.asarray(dataset[['status']])
    time = np.asarray(dataset[['time']])
    data = np.asarray(dataset.drop(columns=['status', 'time']))

    num_Category = int(np.max(time) * 1.2)  #to have enough time-horizon
    num_Event = int(len(np.unique(label)) -
                    1)  #the number of events (excluding censoring as an event)
    num_evalTime = len(evalTime)  #No. of evaluation times

    #Preprocess the Training Data
    tr_time = np.asarray(train_data[['time']])
    tr_label = np.asarray(train_data[['status']])
    eval_time = FloatVector(evalTime)
    #Convert the 'Python' dataframe to 'R'
    with localconverter(default_converter + pandas2ri.converter) as cv:
        train_data_pseudo = pandas2ri.py2ri(train_data)
    train_pseudo_data = get_conditional_pseudo_data(train_data_pseudo,
                                                    eval_time)
    train_pseudo = pandas2ri.ri2py(train_pseudo_data)
    tr_data = train_pseudo.drop(['y'], axis=1)
    tr_data = np.asarray(tr_data)
    x_dim = np.shape(tr_data)[1]
    y_train = np.asarray(train_pseudo.loc[:, 'y'])

    #Preprocess the Validation Data
    va_time = np.asarray(val_data[['time']])
    va_label = np.asarray(val_data[['status']])
    #Convert the 'Python' dataframe to 'R'
    with localconverter(default_converter + pandas2ri.converter) as cv:
        val_data_pseudo = pandas2ri.py2ri(val_data)
    va_data = get_conditional_test_data(val_data_pseudo, eval_time)
    va_data = pandas2ri.ri2py(va_data)
    va_data = np.asarray(va_data)

    #Preprocess the Test Data
    te_time = np.asarray(test_data[['time']])
    te_label = np.asarray(test_data[['status']])
    #Convert the 'Python' dataframe to 'R'
    with localconverter(default_converter + pandas2ri.converter) as cv:
        test_data_pseudo = pandas2ri.py2ri(test_data)
    te_data = get_conditional_test_data(test_data_pseudo, eval_time)
    te_data = pandas2ri.ri2py(te_data)
    te_data = np.asarray(te_data)

    return tr_data, tr_time, tr_label, y_train, va_data, va_time, va_label, te_data, te_time, te_label, num_Category, num_Event, num_evalTime, x_dim
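import_data() relies on helpers to_one_hot() and standardized() that are not shown here; the following is a plausible sketch of what they might do, not the original code.

import pandas as pd

def to_one_hot(df, categorical_columns):
    # expand each categorical column into 0/1 indicator columns
    return pd.get_dummies(df, columns=categorical_columns)

def standardized(reference, data, continuous_columns):
    # z-score `data` using the mean/std of the reference (training) split
    data = data.copy()
    mean = reference[continuous_columns].mean()
    std = reference[continuous_columns].std()
    data[continuous_columns] = (data[continuous_columns] - mean) / std
    return data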
Example #45
0
 #fitting arima to find optimal params
 model = auto_arima(snp_returns_rolling)
 model.fit(snp_returns_rolling)
 #extracting p and q, required for feeding into garch model
 p_ = model.order[0]
 o_ = model.order[1]
 q_ = model.order[2]
 arma_order = str(tuple([p_, q_]))
 #fitting Garch Model
 garch_spec = rugarch.ugarchspec(
     mean_model=robjects.r(
         "list(armaOrder = c{arma_order})".format(arma_order=arma_order)),
     variance_model=robjects.r('list(garchOrder=c(1,1))'),
     distribution_model='std')
 pandas2ri.activate()
 r_dataframe = pandas2ri.py2ri(snp_returns_rolling)
 # Train R GARCH model on returns as %
 garch_fitted = rugarch.ugarchfit(garch_spec, r_dataframe, solver='hybrid')
 pandas2ri.deactivate()
 #forecasting next point
 fore = rugarch.ugarchforecast(garch_fitted, n_ahead=1)
 forecast = np.array(fore.slots['forecast'].rx2('seriesFor')).flatten()[0]
 #storing signal, signal is basically sign of the forecast of return
 forecasted_returns.append({
     'date':
     gspc_returns.index[window_length + d].date(),
     'signal':
     np.sign(forecast)
 })
 print(gspc_returns.index[window_length + d].date())
 print(arma_order, forecast)
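This fragment reads like the body of a rolling-window loop over daily returns; the following is a hypothetical reconstruction of the surrounding setup (gspc_returns, window_length and d are taken from the fragment, everything else is assumed):

import numpy as np
import pandas as pd

# hypothetical daily S&P 500 return series indexed by date
gspc_returns = pd.Series(
    np.random.randn(300) / 100,
    index=pd.date_range("2020-01-01", periods=300, freq="B"))

window_length = 250          # size of the rolling estimation window
forecasted_returns = []      # collects {'date', 'signal'} records

for d in range(len(gspc_returns) - window_length):
    # window of returns the ARIMA and GARCH models are fitted on
    snp_returns_rolling = gspc_returns.iloc[d:window_length + d]
    # ...the fragment above (auto_arima + rugarch fit and one-step forecast)
    # would run here, appending the sign of the forecast to forecasted_returns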
Example #46
0
def MComBat(X, batch, ref_batch=None, covariate=None, num_covs=None, save_dir=None):
    # Check X
    if not isinstance(X, (pd.DataFrame, pd.Series)):
        if isinstance(X, (list, tuple, np.ndarray, Mapping)):
            df = pd.DataFrame(X)
        else:
            raise TypeError('X must be an array-like object, dictionary or pandas Dataframe/Series')
    else:
        df = X
    row_names = df.index
    r_df = pandas2ri.py2ri(df)
    # Check covariate
    if covariate is None:
        covariate = np.ones((len(batch), 1))
    else:
        if not isinstance(covariate, (list, tuple, np.ndarray)):
            if isinstance(covariate, pd.DataFrame) or isinstance(covariate, pd.Series):
                covariate = covariate.to_numpy()
            else:
                raise TypeError('covariate array must be an array like or pandas Dataframe/Series')
        else:
            covariate = np.array(covariate)
    if len(covariate.shape) == 1:
        covariate = covariate.reshape(-1, 1)
    elif len(covariate.shape) > 2:
        raise ValueError('covariate array must be 1D or 2D')
    nr, nc = covariate.shape
    r_covariate = r.matrix(covariate, nrow=nr, ncol=nc)
    # Check batch
    if not isinstance(batch, (list, tuple, np.ndarray)):
        if isinstance(batch, pd.DataFrame) or isinstance(batch, pd.Series):
            batch = batch.to_numpy()
        else:
            raise TypeError('batch array must be an array like or pandas Dataframe/Series')
    else:
        batch = np.array(batch)
    if len(batch.shape) != 1:
        if len(batch.shape) == 2 and batch.shape[1] == 1:
            batch = batch.reshape(-1)
        else:
            raise ValueError('batch array must be 1D or 2D with second dimension equal to 1')
    if len(np.unique(batch)) <= 1:
        raise ValueError('batch array must have at least 2 classes')
    r_batch = Vector(batch)
    # Check ref batch
    if ref_batch is None:
        ref_batch = np.unique(batch)[0]
    else:
        if ref_batch not in np.unique(batch):
            raise ValueError('ref_batch must be one of np.unique(batch) values')
    # Check numCovs
    if num_covs is None:
        r_numCovs = NULL
    else:
        if isinstance(num_covs, int):
            num_covs = [num_covs]
        if not isinstance(num_covs, (list, tuple, np.ndarray)):
            raise TypeError('num_covs must be an int or array like of int equal to the index of numerical covariates')
        r_numCovs = Vector(num_covs)
    # cwd = os.path.dirname(sys.argv[0])
    cwd = os.path.dirname(os.path.abspath(__file__))
    r.setwd(cwd)
    # r.source('./Statistical_analysis/R_scripts/MComBat.R')
    r.source('./R_scripts/MComBat.R')
    r_dr_results = r.MComBat_harmonization(r_df, r_covariate, r_batch, ref_batch, r_numCovs)
    R_object_dict = {}
    keys = r_dr_results.names
    for i in range(len(keys)):
        R_object_dict[keys[i]] = np.array(r_dr_results[i])
    results = pd.DataFrame(R_object_dict)
    results.index = row_names
    if save_dir is not None:
        results.to_excel(os.path.join(save_dir, 'Feature_MComBat.xlsx'))
    return results
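A minimal usage sketch for MComBat(), assuming the R_scripts/MComBat.R script with its MComBat_harmonization() function is available as referenced above; the feature matrix and batch labels below are purely illustrative.

import numpy as np
import pandas as pd

features = pd.DataFrame(np.random.rand(20, 5),
                        columns=['feature_%d' % i for i in range(5)])
batch_labels = [0] * 10 + [1] * 10  # two hypothetical acquisition batches
harmonized = MComBat(features, batch_labels, ref_batch=0)
print(harmonized.shape)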
Beispiel #47
0
def r_cal_b(df):
    robjects.r('''
        # create a function `f`
        f <- function(df, verbose=FALSE) {
            library(spatstat)  # provides ppp(), owin(), quadratcount(), Fest(), Gest()
            if (verbose) {
                cat("I am calling f().\n")
            }
            xMin<-min(df$x)
            xMax<-max(df$x)
            yMin<-min(df$y)
            yMax<-max(df$y)
            
            
            #xy_PPP <- with(df, ppp(x, y, c(-25,25), c(-25,25)))
            xy_PPP <- with(df, ppp(x, y, c(xMin,xMax), c(yMin,yMax)))
            plot(xy_PPP)
            
            xy=df
            summary(xy)
            xy <- unique(xy)
            xy<-data.matrix(xy)
            mc <- apply(xy, 2, mean)    
            sd <- sqrt(sum((xy[,1] - mc[1])^2 + (xy[,2] - mc[2])^2) / nrow(xy))
            buffer_area=25*25
            dens <- nrow(xy) / buffer_area
            win <- owin(c(-25,25), c(-25,25))

            #library(devtools)
            #if (!require("rspatial")) devtools::install_github('rspatial/rspatial')
            #remotes::install_github("rspatial/rspatial")

            #devtools::install_github("rspatial/rspatial")
            #devtools::install_github("rstudio/sparkapi")
            
            #library(rspatial)
            #r <- raster(win)
            quadrat_C<-quadratcount(xy_PPP,nx=4,ny=4)
            #plot(quadrat_C)
            # number of quadrats
            quadrats <- sum(quadrat_C)
            f<-table(quadrat_C)
            f<-data.frame(f)
            # number of cases: convert the factor levels back to numeric counts
            # (as.integer() on a factor returns level codes, not the count values)
            cases <- sum(as.integer(as.character(f$quadrat_C)) * f$Freq)
            mu <- cases / quadrats
            
            ff <- data.frame(as.integer(as.character(f$quadrat_C)), f$Freq)
            colnames(ff) <- c('K', 'X')
            ff$Kmu <- ff$K - mu
            ff$Kmu2 <- ff$Kmu^2
            ff$XKmu2 <- ff$Kmu2 * ff$X
            s2 <- sum(ff$XKmu2) / (sum(ff$X)-1)
            VMR <- s2 / mu
            
            Fs<-Fest(xy_PPP)
            #plot(Fs)
            Gs<-Gest(xy_PPP)
            #plot(Gs)
            
            km<-Fs$km[10]
            newlist<-list(VMR,km)
            print(VMR)
            return(newlist)
            
            #return(VMR)
        }
        ''')
    r_f = robjects.r['f']
    pandas2ri.activate()
    r_DF = pandas2ri.py2ri(df[["x", "y"]])

    res = r_f(r_DF)
    print("+" * 50)
    print(res)
    return res
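A minimal usage sketch (hypothetical point data; assumes the spatstat package is installed in the R environment that rpy2 uses).

import numpy as np
import pandas as pd

points = pd.DataFrame({'x': np.random.uniform(-25, 25, 200),
                       'y': np.random.uniform(-25, 25, 200)})
vmr_and_km = r_cal_b(points)  # R list holding the VMR and the Fest km value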
Beispiel #48
0
def VST(dataset):

    return pandas2ri.ri2py(r.vst(pandas2ri.py2ri(dataset['rawdata'])))
Beispiel #49
0
    def fit(self, X, y):

        from lnpy import linear as linear_models

        n_channels = self.n_channels

        # estimating a GAM over all dimensions at once does not work well;
        # thus we first estimate the linear weights with a linear model
        # and then fit a GAM to the per-channel linear predictions

        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri
        import rpy2.robjects as ro
        pandas2ri.activate()
        import pandas as pd
        try:
            import pandas.rpy.common as com
            com_available = True
        except BaseException:
            com_available = False

        mgcv = importr('mgcv')

        if self.linear_model is None:

            lin_model = linear_models.ARD(verbose=False)
            lin_model.fit(X, y)

        elif isinstance(self.linear_model, string_types):

            if self.linear_model.upper() == 'ARD':
                lin_model = linear_models.ARD(verbose=False)
            elif self.linear_model.upper() == 'RIDGE':
                lin_model = linear_models.Ridge(verbose=False)
            else:
                raise ValueError('unknown linear model: %s' % self.linear_model)

            lin_model.fit(X, y)

        N = X.shape[0]
        w = np.copy(lin_model.get_weights())
        m = w.shape[0] // n_channels  # features per channel (integer division)
        chan_ind = np.reshape(np.arange(w.shape[0]), (m, n_channels),
                              order='F')

        Yw_pred = np.zeros((N, n_channels))
        for j in range(n_channels):

            # predictions for channel j
            Yw_pred[:, j] = np.dot(X[:, chan_ind[:, j]], w[chan_ind[:, j]])

        # fit GAM
        YX = np.hstack((np.atleast_2d(y).T, Yw_pred))
        df = pd.DataFrame(YX, columns=self.columns)
        if com_available:
            df_r = com.convert_to_r_dataframe(df)
        else:
            try:
                df_r = pandas2ri.py2ri(df)
            except BaseException:
                df_r = pandas2ri.pandas2ri(df)

        mod = self.model_string

        m = mgcv.gam(ro.r(mod),
                     data=df_r,
                     family='gaussian()',
                     optimizer='perf')

        self._model = m
        self._linear_model = lin_model
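The class above only shows fit(); below is a hedged sketch of what a matching predict step could look like (an assumption, not the original implementation): rebuild the per-channel linear predictions, wrap them in a DataFrame with the same column names, and call R's predict() on the stored mgcv model.

import numpy as np
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

def gam_predict(gam_model, lin_model, X, chan_ind, columns):
    # per-channel linear predictions, as in fit()
    w = lin_model.get_weights()
    Yw_pred = np.column_stack([np.dot(X[:, chan_ind[:, j]], w[chan_ind[:, j]])
                               for j in range(chan_ind.shape[1])])
    # the response column is a placeholder; predict() only reads the predictors
    df = pd.DataFrame(np.column_stack((np.zeros(X.shape[0]), Yw_pred)),
                      columns=columns)
    ro.globalenv['newdata'] = pandas2ri.py2ri(df)
    ro.globalenv['gam_model'] = gam_model
    return np.array(ro.r('predict(gam_model, newdata=newdata)'))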
Beispiel #50
0
 nfly=0
 for i,gtypeData in enumerate([dC, dEx1, dEx2]):
     gType = gTypes[i]
     for j, gtData in enumerate(gtypeData):
         if j>0:
             for k,data in enumerate(gtData):
                 if data!='':
                     prmValDict[dfLabels[0]].append(float(data))
                 else:
                     prmValDict[dfLabels[0]].append(np.nan)
                 prmValDict[dfLabels[1]].append(lbls[k])
                 prmValDict[dfLabels[2]].append(nfly)
                 prmValDict[dfLabels[3]].append(gType)
             nfly+=1
 df = pd.DataFrame(prmValDict, columns=dfLabels)
 descStats = pd.DataFrame(pandas2ri.ri2py(fsa.Summarize(statsFormula, data = pandas2ri.py2ri(df))))
 descStatsLabels = list(descStats.columns)
 if 'nvalid' in descStatsLabels:
     descStatsLabels.pop(descStatsLabels.index('nvalid'))
     descStatsLabels.append('nvalid')
 descStats = descStats[descStatsLabels]
 print(descStats)
 ll = nparLD.f1_ld_f1(pandas2ri.py2ri(df['result']),
                      pandas2ri.py2ri(df['label']),\
                      pandas2ri.py2ri(df['genotype']),\
                      pandas2ri.py2ri(df['flyNumber']),
                      **{'description': 'FALSE',\
                          'plot_RTE':'FALSE',
                          'order.warning':'FALSE',
                          })
 multiAnova = r_matrix_to_data_frame(ll.rx2('ANOVA.test'), True).round(5)
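r_matrix_to_data_frame() is used above but not defined in this snippet; a minimal sketch of such a helper (an assumption about its behavior, not the original code):

import numpy as np
import pandas as pd
from rpy2.robjects import r

def r_matrix_to_data_frame(r_matrix, use_rownames=False):
    nrow, ncol = int(r_matrix.nrow), int(r_matrix.ncol)
    values = np.array(r_matrix).reshape((nrow, ncol), order='F')  # R matrices are column-major
    columns = list(r('colnames')(r_matrix))
    index = list(r('rownames')(r_matrix)) if use_rownames else None
    return pd.DataFrame(values, columns=columns, index=index)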
Beispiel #51
0
# Save the DataFrame as an R data frame
from rpy2.robjects import pandas2ri
from rpy2.robjects import r
#import pandas.rpy.common as com
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

df_ts_index = pd.read_pickle('DataFrameOutput_pull_ts_comments_comp.pkl')

#http://stackoverflow.com/questions/11586582/save-2d-numpy-array-to-r-file-format-using-rpy2

R_df = pandas2ri.py2ri(df_ts_index)
r.assign("GH_data_df_pull_comp", R_df)
r("save(GH_data_df_pull_comp, file='RObject_pull_comp_comments.gzip', compress=TRUE)"
  )
def RunWDModel(recent):
    pandas2ri.activate()
    # R package names (note the trailing comma: a one-element tuple, not a string)
    packnames = ('forecast',)
    have_tutorial_packages = all(rpackages.isinstalled(x) for x in packnames)
    if not have_tutorial_packages:
        # import R's utility package
        utils = rpackages.importr('utils')
        # select a mirror for R packages
        utils.chooseCRANmirror(ind=1)  # select the first mirror in the list
    if not have_tutorial_packages:
        # R vector of strings
        from rpy2.robjects.vectors import StrVector
        # file
        packnames_to_install = [
            x for x in packnames if not rpackages.isinstalled(x)
        ]
        if len(packnames_to_install) > 0:
            utils.install_packages(StrVector(packnames_to_install))
    # Import R packages
    forecast = importr('forecast')
    base = importr('base')
    # Model subset of data for particular station
    jump = 3  # skip weekend
    output = pd.DataFrame(columns=['count_diff', 'DateTime', 'Type', 'ID'])
    absent = []
    errors = []
    done = False
    for x in station_range:
        SepModel = Model(recent, x)
        if SepModel.valid is False:
            absent.append(x)
            continue
        SepModel.PreProcess(separate=True)
        # Keep at most the first 20 days of half-hourly observations (48 per day)
        SepModel.WD = SepModel.WD[:20 * 48]
        if (len(SepModel.WD) > 0):
            if done is False:  # Only needs to be run once
                done = True
                WD_dates = SepModel.WD.index
                y = np.asarray(WD_dates[-1].year, dtype='datetime64[Y]') - 1970
                doy = np.asarray((WD_dates[-1].dayofyear + jump),
                                 dtype='timedelta64[D]') - 1
                new = pd.to_datetime(y + doy)
                new_dates = pd.DatetimeIndex(start=new,
                                             freq='30Min',
                                             periods=48 * 4)
            SepModel.WD.reset_index(inplace=True, drop=True)
            gc.collect()
            robjects.r('o = c(2,0,1)')
            robjects.r('sorder = c(1,1,2)')
            robjects.r('s = list(order=sorder, period=48)')
            DF = pandas2ri.py2ri(SepModel.WD)
            robjects.r.assign('df', DF)
            try:
                robjects.r('fit = Arima(df,order=o, seasonal=s, method="CSS")')
            except Exception:
                errors.append(x)
                continue
            f_cast = robjects.r('f_cast = forecast(fit, h=4*48)')
            arima_mean = np.array(f_cast.rx('mean'))
            robjects.r('rm(list = ls(all = TRUE))')
            robjects.r('gc()')
            results = pd.DataFrame({
                'count_diff': arima_mean.flatten()
            }).round()
            results.count_diff = results.count_diff.astype(int)
            results['DateTime'] = new_dates
            results['Type'] = 'Forecast'
            results['ID'] = x
            SepModel.WD['DateTime'] = WD_dates
            SepModel.WD['Type'] = 'Historic'
            SepModel.WD['ID'] = x
            out = SepModel.WD.append(results)
            output = output.append(out)
            del f_cast
            del DF
            del SepModel
            gc.collect()
    output.ID = output.ID.astype(int)
    output.count_diff = output.count_diff.astype(int)
    output.reset_index(inplace=True, drop=True)
    path = os.path.join(wd, 'Model')
    if not os.path.exists(path):
        os.mkdir(path)
    output.to_csv(os.path.join(path, 'WDModelOutput.csv'))
    return output, absent, errors
Beispiel #53
0
def main():
    '''
    main: run DESeq2 on the count matrix and write results plus QC plots
    '''

    # Command Line Stuff...
    myCommandLine = CommandLine()

    outdir     = myCommandLine.args['outDir']
    group1     = myCommandLine.args['group1']
    group2     = myCommandLine.args['group2']
    batch      = myCommandLine.args['batch']  
    matrix     = myCommandLine.args['matrix']
    prefix     = myCommandLine.args['prefix']
    formula    = myCommandLine.args['formula']




    # make the quant DF
    quantDF  = pd.read_table(matrix, header=0, sep='\t', index_col=0)
    df = pandas2ri.py2ri(quantDF)
    #print(df.head())
    # import formula
    formulaDF     = pd.read_csv(formula,header=0, sep="\t",index_col=0)
    sampleTable = pandas2ri.py2ri(formulaDF)

    if "batch" in list(formulaDF):
        design = Formula("~ batch + condition")
    else:
        design = Formula("~ condition")
    #print(sampleTable)

    # import DESeq2
    from rpy2.robjects.packages import importr
    import rpy2.robjects.lib.ggplot2 as ggplot2
    methods   = importr('methods')
    deseq     = importr('DESeq2')
    grdevices = importr('grDevices')
    qqman     = importr('qqman')



    dds = deseq.DESeqDataSetFromMatrix(countData = df,
                                        colData = sampleTable,
                                        design = design)

    dds  = deseq.DESeq(dds)
    cont = robjects.r["grep"]("condition",robjects.r['resultsNames'](dds),value=True)
    #print(cont)
    # get results; orient the results for groupA vs B
    res = deseq.results(dds, name=cont)
    # results with shrinkage
    resLFC = deseq.lfcShrink(dds, coef=cont, type="apeglm")
    resdf  = robjects.r['as.data.frame'](res)
    
    R.assign('res', res)
    
    reslfc  = robjects.r['as.data.frame'](resLFC)

    # plot MA and PC stats for the user
    plotMA    = robjects.r['plotMA']
    plotDisp  = robjects.r['plotDispEsts']
    plotPCA   = robjects.r['plotPCA']
    plotQQ    = robjects.r['qq']
    
    vsd       = robjects.r['vst'](dds, blind=robjects.r['F'])
    # get pca data
    if "batch" in list(formulaDF):
        pcaData    = plotPCA(vsd, intgroup=robjects.StrVector(("condition", "batch")), returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    else:
        print(vsd)
        pcaData    = plotPCA(vsd, intgroup="condition", returnData=robjects.r['T'])
        percentVar = robjects.r['attr'](pcaData, "percentVar")
    # arrange 
    grdevices.pdf(file="./%s/%s_QCplots_%s_v_%s.pdf" % (outdir,prefix,group1,group2))


    x = "PC1: %s" % int(percentVar[0]*100) + "%% variance"
    y = "PC2: %s" % int(percentVar[1]*100) + "%% variance"

    if "batch" in list(formulaDF):
        pp = ggplot2.ggplot(pcaData) + \
                ggplot2.aes_string(x="PC1", y="PC2", color="condition", shape="batch") + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()
        pp.plot()
    else:
        pp = ggplot2.ggplot(pcaData) + \
                ggplot2.aes_string(x="PC1", y="PC2", color="condition") + \
                ggplot2.geom_point(size=3) + \
                robjects.r['xlab'](x) + \
                robjects.r['ylab'](y) + \
                ggplot2.theme_classic() + \
                ggplot2.coord_fixed()
        pp.plot()
    plotMA(res, ylim=robjects.IntVector((-3,3)), main="MA-plot results")
    #plotMA(res, main="MA-plot results")
    plotMA(resLFC, ylim=robjects.IntVector((-3,3)), main="MA-plot LFC shrinkage")
    #plotMA(resLFC, main="MA-plot LFC shrinkage")
    plotQQ(resdf.rx2('pvalue'), main="pvalue QQ")
    plotQQ(reslfc.rx2('pvalue'), main="LFC shrinkage pvalue QQ")
    hh = ggplot2.ggplot(resdf) + \
            ggplot2.aes_string(x="pvalue") + \
            ggplot2.geom_histogram() + \
            ggplot2.theme_classic() 
    hh.plot()
    plotDisp(dds, main="Dispersion Estimates")
    grdevices.dev_off()


    lfcOut =  "./%s/%s_%s_v_%s_deseq2_results_shrinkage.tsv" % (outdir,prefix,group1,group2)
    resOut =  "./%s/%s_%s_v_%s_deseq2_results.tsv" % (outdir,prefix,group1,group2)

    robjects.r['write.table'](reslfc, file=lfcOut, quote=False, sep="\t")
    robjects.r['write.table'](resdf, file=resOut, quote=False, sep="\t")
def fit_model_death_rates(df):
    '''Fit model for the death rates and extract coefficients.

       Parameters
       ----------
       df: dataframe
        Contains columns of the log of death rates, sdi,
        age_group_id, region_id, year_id

       Returns
       -------
       feff_df: dataframe
        Contains the draws of coefficients of fixed effects.
       reff_df: dataframe
        Contains the draws of coefficients of random effects.
    '''

    rpy2.robjects.globalenv['data'] = pandas2ri.py2ri(df)
    # Fit linear mixed-effects model in R, fixed effects on sdi, random effects
    # on age_group_id and region_id.
    rpy2.robjects.r('''
        library(lme4)  # provides lmer(), fixef(), ranef(), VarCorr()
        model = lmer(log_death_rate ~ 1 + sdi + (1|age_group_id) + (1|region_id), data, REML=F)
        capture.output(summary(model),
            file = paste("/ihme/forecasting/data/disaster/outputs/disaser_model.txt", sep=""))

        feff_means = fixef(model)
        reff_means = ranef(model)
        feff_var = attr(vcov(model), "x")
        reff_var = VarCorr(model)
    ''')
    # Extract the coefficients of fixed effects of the model.
    feffects = ['sdi']
    param_list = ['intercept'] + feffects
    fixed_means = pandas2ri.ri2py(rpy2.robjects.globalenv['feff_means'])
    fixed_var = np.reshape(np.array(rpy2.robjects.globalenv['feff_var']),
                           (len(param_list), len(param_list)))
    # Draw random samples from a multivariate normal distribution.
    feff_df = pd.DataFrame(np.random.multivariate_normal(fixed_means, fixed_var, 1000),
                           index=np.arange(1000),
                           columns=param_list)

    # Extract the coefficients of random effects of the model
    reffects = ['age_group_id', 'region_id']
    reff_means = {}
    reff_vars = {}
    reff_df = {}

    for r in reffects:
        rpy2.robjects.r('''
            reff_means_df = data.frame(reff_means${r})
            reff_means_df['{r}_label'] = row.names(reff_means_df)
        '''.format(r=r))
        reff_means[r] = pandas2ri.ri2py(rpy2.robjects.r['reff_means_df'])
        reff_means[r] = reff_means[r].rename(columns={'X.Intercept.': \
                                                    'mean_{}'.format(r)})

        reff_vars[r] = np.array(rpy2.robjects \
                         .r('attr(reff_var${r}, "stddev")'.format(r=r)))[0]
        reff_means[r]['se_{}'.format(r)] = reff_vars[r]
        # Draw random samples from a normal distribution.
        reff_df[r] = pd.DataFrame(np.random.normal(
                                     reff_means[r]['mean_{}'.format(r)],
                                     reff_means[r]['se_{}'.format(r)],
                                     size=(1000, len(reff_means[r]))),
                                     columns=reff_means[r]['{}_label'.format(r)]) \
                        .transpose() \
                        .reset_index()
    return feff_df, reff_df
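A minimal usage sketch (column names follow the docstring; the values are purely illustrative, lme4 must be available in the R session used by rpy2, and note that the function also writes a model summary to a hard-coded path):

import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'log_death_rate': np.random.normal(-8, 1, 200),
    'sdi': np.random.uniform(0, 1, 200),
    'age_group_id': np.random.choice([2, 3, 4], 200),
    'region_id': np.random.choice([10, 20, 30], 200),
    'year_id': np.random.choice(range(1990, 2016), 200),
})
feff_draws, reff_draws = fit_model_death_rates(toy)
print(feff_draws.head())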
Beispiel #55
0
def applymem(df):
    rdf = pandas2ri.py2ri(df)
    seasons = sorted(list(df.columns.drop(['UF', 'isoweek'])))[:-1]
    # Discard 2009 season if present:
    seasons = sorted(set(seasons).difference(['SRAG2009']))
    rseasons = ro.StrVector(seasons)
    ro.globalenv['df'] = rdf
    ro.globalenv['seasons'] = rseasons
    # # Method for obtaining typical time series evolution (default 2)
    # ro.globalenv['par.type.curve'] = 2
    # # Method for obtaining pre/post-epidemic threshold (default 4)
    # ro.globalenv['par.type.threshold'] = 2
    # # Method for obtaining intensity thresholds (default 4)
    # ro.globalenv['par.type.intensity'] = 2
    # # Method for obtaining outbreak start and length (default 6)
    # ro.globalenv['par.type.other'] = 2
    # # Total number of points to obtain pre/post-threshold (will take n/seasons from each)
    # ro.globalenv['par.n.max'] = 30
    # # Confidence interval for modelled curve
    # ro.globalenv['par.level.curve'] = 0.90
    # # Confidence interval for pre/post-thresold
    # ro.globalenv['par.level.threshold'] = 0.95
    # # Quantiles for intensity thresholds
    # ro.globalenv['par.level.intensity'] = ro.FloatVector([0.40, 0.90, 0.975])
    #
    # epimemrslt = ro.r('memmodel(i.data=subset(df, select=seasons), i.type.curve=par.type.curve,' +
    #                   'i.type.threshold=par.type.threshold, i.type.intensity=par.type.intensity,' +
    #                   'i.type.other=par.type.other, i.n.max=par.n.max, i.level.curve=par.level.curve,' +
    #                   'i.level.threshold=par.level.threshold, i.level.intensity=par.level.intensity)')

    ro.globalenv['df'] = rdf
    ro.globalenv['seasons'] = rseasons
    ro.globalenv['par.type.curve'] = 2
    ro.globalenv['par.n.max'] = 20
    ro.globalenv['par.level.curve'] = 0.90
    ro.globalenv['par.level.threshold'] = 0.90

    epimemrslt = ro.r(
        'memmodel(i.data=subset(df, select=seasons), i.type.curve=par.type.curve,'
        +
        'i.n.max=par.n.max, i.level.curve=par.level.curve, i.level.threshold=par.level.threshold)'
    )

    # Pre-epidemic threshold:
    epithreshold = pandas2ri.ri2py_dataframe(
        epimemrslt.rx2('pre.post.intervals')).loc[0, 2]
    typrealcurve = pandas2ri.ri2py_dataframe(epimemrslt.rx2('typ.real.curve'))

    # Check for seasons below threshold:
    dropseasons = set()
    for s in seasons:
        if df[s].max() < epithreshold:
            dropseasons.add(s)
    # Drop seasons below threshold and rerun algorithm:
    episeasons = list(seasons)
    if len(dropseasons) > 0 and len(dropseasons) < len(seasons):
        episeasons = sorted(list(set(seasons).difference(dropseasons)))
        ro.globalenv['episeasons'] = ro.StrVector(episeasons)

        # epimemrslt = ro.r('memmodel(i.data=subset(df, select=episeasons), i.type.curve=par.type.curve,' +
        #                   'i.type.threshold=par.type.threshold, i.type.intensity=par.type.intensity,' +
        #                   'i.type.other=par.type.other, i.n.max=par.n.max, i.level.curve=par.level.curve,' +
        #                   'i.level.threshold=par.level.threshold, i.level.intensity=par.level.intensity)')

        epimemrslt = ro.r(
            'memmodel(i.data=subset(df, select=episeasons), i.type.curve=par.type.curve,'
            +
            'i.n.max=par.n.max, i.level.curve=par.level.curve, i.level.threshold=par.level.threshold)'
        )

    # Store results in python dictionary of objects
    pyepimemrslt = {}
    rovector = [
        ro.vectors.StrVector, ro.vectors.IntVector, ro.vectors.FloatVector,
        ro.vectors.Vector
    ]
    for name in epimemrslt.names:
        rdata = epimemrslt.rx2(name)
        if name == 'call':
            pyepimemrslt.update({name: str(rdata)})
        elif type(rdata) in rovector:
            pyepimemrslt.update({name: pandas2ri.ri2py_vector(rdata)})
        else:
            pyepimemrslt.update({name: pandas2ri.ri2py_dataframe(rdata)})

    # typ.curve is the typical curve obtained by averaging over epidemic seasons, with time rescaled
    # so that the start of the epidemic period coincides with mean.start
    pyepimemrslt['typ.curve'].rename(columns={
        0: 'baixo',
        1: 'mediano',
        2: 'alto'
    },
                                     inplace=True)
    pyepimemrslt['typ.curve']['mediano'].fillna(0, inplace=True)
    pyepimemrslt['typ.curve']['baixo'] = pyepimemrslt['typ.curve'][
        'baixo'].where(pyepimemrslt['typ.curve']['baixo'] >= 0, other=0)
    # use ~ (logical not) rather than unary minus on the boolean mask
    pyepimemrslt['typ.curve']['baixo'] = pyepimemrslt['typ.curve']['baixo'].\
        where(~pyepimemrslt['typ.curve']['baixo'].isnull(), other=pyepimemrslt['typ.curve']['mediano'])
    pyepimemrslt['typ.curve']['alto'] = pyepimemrslt['typ.curve']['alto'].\
        where(~pyepimemrslt['typ.curve']['alto'].isnull(), other=pyepimemrslt['typ.curve']['mediano'])
    pyepimemrslt['pre.post.intervals'].rename(index={
        0: 'pre',
        1: 'post'
    },
                                              inplace=True)

    # typ.real.curve is the typical curve without the time shift, i.e. respecting the original weeks from the data.
    # For this curve it is better to keep all seasons, not only the epidemic ones.
    pyepimemrslt['typ.real.curve'] = typrealcurve.copy()
    pyepimemrslt['typ.real.curve'].rename(columns={
        0: 'baixo',
        1: 'mediano',
        2: 'alto'
    },
                                          inplace=True)
    pyepimemrslt['typ.real.curve']['mediano'].fillna(0, inplace=True)
    pyepimemrslt['typ.real.curve']['baixo'] = pyepimemrslt['typ.real.curve']['baixo'].\
        where(pyepimemrslt['typ.real.curve']['baixo'] >= 0, other=0)
    pyepimemrslt['typ.real.curve']['baixo'] = pyepimemrslt['typ.real.curve']['baixo'].\
        where(~pyepimemrslt['typ.real.curve']['baixo'].isnull(), other=pyepimemrslt['typ.real.curve']['mediano'])
    pyepimemrslt['typ.real.curve']['alto'] = pyepimemrslt['typ.real.curve']['alto'].\
        where(~pyepimemrslt['typ.real.curve']['alto'].isnull(), other=pyepimemrslt['typ.real.curve']['mediano'])
    newcols = {}
    for k, v in enumerate(episeasons):
        newcols[k] = str(v) + ' transladado'
    pyepimemrslt['moving.epidemics'].rename(columns=newcols, inplace=True)

    return pyepimemrslt
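A minimal usage sketch for applymem() (assumes the R 'mem' package is loaded in the rpy2 session; the column layout follows the code above: one column per season plus 'UF' and 'isoweek', with the last season column left out of the fit):

import numpy as np
import pandas as pd

weeks = np.arange(1, 53)
toy = pd.DataFrame({'UF': 'SP',
                    'isoweek': weeks,
                    'SRAG2014': np.random.poisson(5, 52),
                    'SRAG2015': np.random.poisson(6, 52),
                    'SRAG2016': np.random.poisson(7, 52)})
mem_results = applymem(toy)
print(mem_results['pre.post.intervals'])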
Beispiel #56
0
def quantile(dataset):

    return pandas2ri.ri2py(r.quantile(pandas2ri.py2ri(
        dataset['rawdata']))).set_index('gene_symbol')
counts_mat /= 1000000
counts_mat = counts_mat.round()
counts_mat = counts_mat.astype(int)
print(counts_mat.shape)
print("breaking up counts mat...")
n_chunks = 10
step_size = counts_mat.shape[0] // n_chunks
for i in range(n_chunks):
    start_idx = i * step_size
    end_idx = (i + 1) * step_size
    if i == n_chunks - 1:
        end_idx = counts_mat.shape[0]
    chunk = counts_mat[start_idx:end_idx]
    print("chunk {}, {}".format(i, chunk.shape))
    print("converting to r objects")
    chunk = pandas2ri.py2ri(chunk)
    print("converted")
    r_object_name = "chunk_{}.mat".format(i)
    r.assign(r_object_name, chunk)
    print("assigned")
    r("save({}, file='counts_{}_mat.gzip', compress=TRUE)".format(
        r_object_name, i))
    print("saved")
    print()

# new_store = pd.HDFStore('selected_counts.h5')
# new_store['counts'] = counts_mat
# new_store['accessions'] = accessions
# new_store['gene_symbols'] = gene_symbols
# new_store['labels'] = labels
# new_store.close()
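A hedged sketch of reading one of the saved chunks back into the same rpy2 session and recovering it as a pandas DataFrame (file and object names as written in the loop above):

from rpy2.robjects import r, pandas2ri

r("load('counts_0_mat.gzip')")                 # restores the R object 'chunk_0.mat'
restored = pandas2ri.ri2py(r['chunk_0.mat'])   # back to a pandas DataFrame
print(restored.shape)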
Beispiel #58
0
def gc_coverage_plot(contigs_file,
                     contig_depth_table=False,
                     samtool_depth_file=False,
                     blast_file=False,
                     column1=1,
                     column2=2,
                     main=False,
                     highlight=False,
                     taxonomy_file=False,
                     output_prefix=False):

    if output_prefix and output_prefix[-1] != '/':
        output_prefix += '/'
    print("output_prefix", output_prefix)

    import os
    import shell_command
    import rpy2.robjects as robjects
    import rpy2.robjects.numpy2ri
    from pandas import DataFrame
    import pandas
    import rpy2
    from rpy2.robjects import r
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    if not main:
        main = os.path.basename(contigs_file)

    out, err, code = shell_command.shell_command(
        "infoseq -auto -only -Name -length -pgc %s > /tmp/gc.tab" %
        contigs_file)

    #print (out)
    #print (err)
    #print (code)

    if contig_depth_table:

        contig_depth = pandas.read_csv(contig_depth_table,
                                       sep='\t',
                                       names=["contig", "depth"])
        #contig_depth = DataFrame(contig_depth, columns=['contig', 'depth'])
        #print (type(contig_depth["contig"]))
        #print (type(contig_depth))
        robjects.r.assign('contigs_depth', pandas2ri.py2ri(contig_depth))

    if taxonomy_file:
        with open(taxonomy_file, 'r') as f:
            contigs2taxon2count = {}

            for row in f:
                data = row.rstrip().split()
                contig = data[0]
                taxon = data[1]
                if contig not in contigs2taxon2count:
                    contigs2taxon2count[contig] = {}
                    contigs2taxon2count[contig][taxon] = 1
                else:
                    if taxon in contigs2taxon2count[contig]:
                        contigs2taxon2count[contig][taxon] += 1
                    else:
                        contigs2taxon2count[contig][taxon] = 1
        contig2label = []
        for contig in contigs2taxon2count:
            if len(contigs2taxon2count[contig]) > 1:
                # more than one taxon
                label = ''
                for taxon in contigs2taxon2count[contig]:
                    label += '%s (%s) /' % (taxon,
                                            contigs2taxon2count[contig][taxon])
                label = label[0:-2]
            else:
                label = list(contigs2taxon2count[contig].keys())[0]
            contig2label.append([contig, label])
        label2freq = {}
        for contig in contig2label:
            if contig[1] not in label2freq:
                label2freq[contig[1]] = 1
            else:
                label2freq[contig[1]] += 1
        for contig in contig2label:
            if label2freq[contig[1]] <= 2:
                contig[1] = 'rare_taxon'

        df = DataFrame(contig2label, columns=['contig', 'label'])
        print(type(df["contig"]))
        print(type(df))
        #m = m.astype(float)
        robjects.r.assign('contig_labels', pandas2ri.py2ri(df))
    else:
        robjects.r.assign('contig_labels', False)

    if highlight:
        highlight_code = """
        gc_coverage_table$color <- rep(rgb(1, 0, 0,0.5), length(gc_coverage_table[,1]))
        highlight_table <- read.table("%s", header=FALSE)
        m <- match(highlight_table[,1], gc_coverage_table$Name)
        gc_coverage_subset <- gc_coverage_table[m,]
        print("subset")
        print(m)
        gc_coverage_table[m,]$color<-rgb(0, 0, 1,0.5)

        """ % highlight

        highlight_code2 = """

        m <- match(highlight_table[,1], gc_coverage_table_2m$Name)
        #print("subset m2")
        #print(m)
        gc_coverage_subset2 <- gc_coverage_table_2m[m,]

        """

    else:
        highlight_code = ''
        highlight_code2 = ''

    if not blast_file:
        robjects.r("""

        #library(Cairo)
        library(R.utils)
        library(ggplot2)




        if (exists("contigs_depth")==FALSE){

            if (isGzipped("%s")){
                #print('Gzipped file')
                all_depth <- read.table(gzfile('%s'), header=FALSE)
            }else{
                #print('Not Gzipped')
                all_depth <- read.table('%s', header=FALSE)
            }

            contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
            colnames(contigs_depth) <- c('contig', 'depth')
        }
        #print(contigs_depth)
        #print(contig_labels)
        contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)

        gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$contig),2])
        #w<-which(gc_coverage_table$Length >=1000)
        #gc_coverage_table <- gc_coverage_table[w,]

         cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
         #print('cov biggest:')
         #print(cov_biggest)
         w <- which(gc_coverage_table[,4]< (4*cov_biggest))
         gc_coverage_table_2m <- gc_coverage_table[w,]

        if (contig_labels != FALSE) {
            library(RColorBrewer)
            color_palette <- c('red', 'blue','green', brewer.pal(12,"Paired"), brewer.pal(12,"Set3"))
            m <- match(contig_labels$contig, gc_coverage_table$Name)

            gc_coverage_table$color <- rep("Unclassified", length(gc_coverage_table[,1]))
            gc_coverage_table$contig_alpha <- rep(0.5, length(gc_coverage_table[,1]))

            gc_coverage_table$color[m] <- as.character(contig_labels$label)
            gc_coverage_table$contig_alpha[m] <- rep(1,length(contig_labels$label))

            w<-which(gc_coverage_table$Length >=1000)
            gc_coverage_table <- gc_coverage_table[w,]
            #w2 <- which(gc_coverage_table$color != "Chlamydiae")
            #gc_coverage_table$contig_alpha[w2] <- 0.7

            svg("%sgc_cov_buble_test.svg", width = 12, height = 12)
            p6 <- ggplot(gc_coverage_table, aes(x = X.GC, y = coverage, size = Length, fill = color, colour = color, alpha = contig_alpha)) +
                    geom_point(shape = 21) +
                    ggtitle("Scaffold GC vs Depth") +
                    labs(x = "GC (%%)", y = "Sequencing depth") +
                    scale_size(range = c(1, 10))
            p6 <- p6 + scale_fill_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])+ guides(color = guide_legend(override.aes = list(size=5)))
            p6 <- p6 + scale_colour_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])
            #print (max(gc_coverage_table$Length))
            p6 <- p6 + scale_alpha_continuous(range=c(0.1, 1), limits=c(0.1,1)) #+ scale_alpha_continuous(range=c(0, max(gc_coverage_table$Length)), limits=c(0,max(gc_coverage_table$Length)))

            print(p6 + theme_bw())
            dev.off()

            gc_coverage_table_2m$color <- rep("Unclassified", length(gc_coverage_table_2m[,1]))
            gc_coverage_table_2m$contig_alpha <- rep(0.5, length(gc_coverage_table_2m[,1]))

            gc_coverage_table_2m$color[m] <- as.character(contig_labels$label)
            gc_coverage_table_2m$contig_alpha[m] <- rep(1,length(contig_labels$label))

            svg("%sgc_cov_buble_test_2m.svg", width = 12, height = 12)
            p6 <- ggplot(gc_coverage_table_2m, aes(x = X.GC, y = coverage, size = Length, fill = color, colour = color, alpha = contig_alpha)) +
                    geom_point(shape = 21) +
                    ggtitle("Scaffold GC vs Depth") +
                    labs(x = "GC (%%)", y = "Sequencing depth") +
                    scale_size(range = c(1, 10))
            p6 <- p6 + scale_fill_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])+ guides(color = guide_legend(override.aes = list(size=5)))
            p6 <- p6 + scale_colour_manual(values=color_palette[0:length(unique(gc_coverage_table$color))])
            #print (max(gc_coverage_table$Length))
            p6 <- p6 + scale_alpha_continuous(range=c(0.1, 1), limits=c(0.1,1)) #+ scale_alpha_continuous(range=c(0, max(gc_coverage_table$Length)), limits=c(0,max(gc_coverage_table$Length)))

            print(p6 + theme_bw())
            dev.off()



        }else{
            #print('NO contig_labels')

        }

        write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)

    %s

     svg("%sgc_cov_buble.svg", width = 12, height = 12)
         symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3, ann=T,
                 bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         if (any("gc_coverage_subset" %%in%% ls())) {
             symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3,
                     ann=T, bg=gc_coverage_table$color, fg=gc_coverage_table$color, add = TRUE)
             l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset$Name)
             text(x=gc_coverage_subset[,3], y=gc_coverage_subset[,4], labels = l)
         }else{
            print ('a')
         }

         dev.off()




         %s

         svg("%sgc_cov_buble_2m.svg", width = 12, height = 12)
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                    inches=1/3, ann=T, bg=rgb(1, 0, 0,0.5), fg=rgb(1, 0, 0,0.5), main="%s", xlab="GC(%%)", ylab="Sequencing depth")

            if (any("gc_coverage_subset" %%in%% ls())) {

                symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                        inches=1/3, ann=T, bg=gc_coverage_table_2m$color, fg=gc_coverage_table_2m$color, add = TRUE)
                l <- gsub('(^[^_]+_[^_]+)_(.*)$', '\\\\1', gc_coverage_subset2$Name)
                text(x=gc_coverage_subset2[,3], y=gc_coverage_subset2[,4], labels = l)
            }else{
                print ('a')
            }

     dev.off()



                   """ %
                   (samtool_depth_file, samtool_depth_file, samtool_depth_file,
                    output_prefix, output_prefix, highlight_code,
                    output_prefix, main, highlight_code2, output_prefix, main))
    else:

        robjects.r("""

        #library(Cairo)
        library(R.utils)

        if (isGzipped("%s")){
            #print('Gzipped file')
            all_depth <- read.table(gzfile('%s'), header=FALSE)
        }else{
            #print('Not Gzipped')
            all_depth <- read.table('%s', header=FALSE)
        }

        blast_file <- read.table("%s", header=FALSE, sep="\t")[,c(2,6)]
        contigs_depth<- aggregate(all_depth["V3"],b=all_depth[c("V1")],FUN=median)
        contigs_gc <- read.table("/tmp/gc.tab", header=TRUE)

        gc_coverage_table <-cbind(contigs_gc,coverage=contigs_depth[match(contigs_gc$Name, contigs_depth$V1),2])
        #w<-which(gc_coverage_table$Length >=1000)
        #gc_coverage_table <- gc_coverage_table[w,]

        gc_coverage_table$taxon <- blast_file[,2][match(gc_coverage_table$Name, blast_file[,1])]
        #print (is.na(gc_coverage_table$taxon))
        gc_coverage_table$taxon <- as.character(gc_coverage_table$taxon)
        gc_coverage_table$taxon[is.na(gc_coverage_table$taxon)] <- 'undefined'
        gc_coverage_table$taxon <- as.factor(gc_coverage_table$taxon)

        write.table(gc_coverage_table, 'gc_coverage_table.tab', sep="\t", row.names=F)

         svg("gc_cov_buble.svg", width = 12, height = 12,)
            symbols(x=gc_coverage_table[,3], y= gc_coverage_table[,4], circles=gc_coverage_table[,2], inches=1/3,
                    ann=F, bg=gc_coverage_table$taxon, fg=gc_coverage_table$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         dev.off()

         cov_biggest <- gc_coverage_table[which(gc_coverage_table$Length==max(gc_coverage_table$Length)),4]
         #print('cov biggest:')
         #print(cov_biggest)
         w <- which(gc_coverage_table[,4]< (4*cov_biggest))
         gc_coverage_table_2m <- gc_coverage_table[w,]

         svg("gc_cov_buble_2m.svg", width = 12, height = 12,)
            symbols(x=gc_coverage_table_2m[,3], y= gc_coverage_table_2m[,4], circles=gc_coverage_table_2m[,2],
                    inches=1/3, ann=F, bg=gc_coverage_table$taxon, fg=gc_coverage_table$taxon, main="%s", xlab="GC(%%)", ylab="Sequencing depth")
         dev.off()



                   """ % (samtool_depth_file, samtool_depth_file,
                          samtool_depth_file, blast_file, main, main))
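A minimal usage sketch (file paths are hypothetical; the function shells out to the EMBOSS infoseq tool and drives R/ggplot2 through rpy2):

gc_coverage_plot('assembly_contigs.fasta',
                 samtool_depth_file='assembly_depth.tab.gz',
                 main='My assembly',
                 output_prefix='./gc_plots/')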
Beispiel #59
0
def main(group1=None, group2=None, outDir=None, inDir=None, formula=None):
    '''
    main
    '''

    R.assign('inDir', inDir)
    R.assign('outdir', outDir)

    R.assign('group1', group1)
    R.assign('group2', group2)

    print("Running DeSeq2....")
    print(group1 + " vs " + group2)

    # import
    from rpy2.robjects.packages import importr
    #kallisto processing libraries
    tximportData = importr('tximportData')
    tximport = importr('tximport')
    ensembldb = importr('ensembldb')
    EnsDb_Hsapiens_v86 = importr('EnsDb.Hsapiens.v86')
    #deseq
    methods = importr('methods')
    deseq = importr('DESeq2')

    #transcripts to gene, used in tximport
    R('edb <- EnsDb.Hsapiens.v86')
    R('tx2gene = transcripts(edb , columns=c("tx_id", "gene_name"),return.type="DataFrame")'
      )

    # import formula
    formulaDF = pd.read_csv(formula, header=0, sep="\t")

    samples = formulaDF.samples.tolist()
    R.assign('samples', samples)

    sampleTable = pandas2ri.py2ri(formulaDF)
    R.assign('sampleTable', sampleTable)

    #locate kallisto files
    #would be faster to use kallito abundance.h5 files
    R('files <- file.path(inDir, samples, "abundance.tsv")')
    R('all(file.exists(files))')

    #tximport conversion to gene
    R('txi.kallisto <- tximport(files, type = "kallisto",tx2gene = tx2gene, txOut = FALSE,ignoreTxVersion=TRUE)'
      )
    R('rownames(sampleTable) <- samples')

    #DESeq
    R('dds <- DESeqDataSetFromTximport(txi.kallisto, sampleTable, ~condition)')

    # R('colData(dds)$condition<-factor(colData(dds)$condition, levels=c(group1,group2))')

    R('dds_<-DESeq(dds)')
    R('res<-results(dds_)')
    R('res<-res[order(res$padj),]')

    # writing deseq2 results to a file
    Out = os.path.join(outDir, "%s_v_%s_deseq2_results.csv" % (group1, group2))
    R.assign('Out', Out)

    R('write.csv(as.data.frame(res),file=Out)')
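A hedged usage sketch (paths and group labels are hypothetical; the formula file is expected to be tab-separated with at least 'samples' and 'condition' columns, and each sample needs a kallisto abundance.tsv under inDir):

main(group1='treated', group2='control',
     outDir='deseq2_out', inDir='kallisto_quant',
     formula='sample_sheet.tsv')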
Beispiel #60
0
split = 95
# Showing the outlier
fig, ax1 = plt.subplots()
plt.plot(data['Adj Close'], 'k', linewidth=0.5)
plt.title(f'{Ticker} Stock Price')
ax1.axvspan(data.index[-split] - timedelta(days=+5),
            data.index[-1],
            alpha=0.2,
            color='red')
plt.show()
# The structural break is at data.index[-84]
#plt.plot(data[:-75]); plt.plot(data)

# ModelR0 is before the Outlier
datatoR0 = data['Adj Close'][:-84]
r_dataframe = pandas2ri.py2ri(datatoR0)
modelR0 = autoarima(r_dataframe)
print(coef(modelR0))

# ModelR1 is After the Outlier
datatoR1 = data['Adj Close']
r_dataframe = pandas2ri.py2ri(datatoR1)
modelR1 = autoarima(r_dataframe)
print(coef(modelR1))

SIMLEN = 30
SIMTIM = 4000  # total number of simulated paths: one set uses normal returns and one uses out-of-sample returns, so this count is before splitting between them

montecarlo0 = np.transpose([
    np.array(asnumeric(simulate(modelR0, nsim=SIMLEN))) for i in range(SIMTIM)
])
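A hedged follow-up sketch (not from the original script): turn the simulated return paths into cumulative paths and extract pointwise percentile bands.

import numpy as np

cum_paths0 = np.cumsum(montecarlo0, axis=0)             # SIMLEN x SIMTIM cumulative returns
bands = np.percentile(cum_paths0, [5, 50, 95], axis=1)  # lower, median, upper band per horizon step
print(bands.shape)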