Code Example #1
def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2):
    """"
    Invoke a named "correlation" R metric and convert the R dataframe result into
    a Pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param metric_function_name: the name of the set of metrics
    :return:
    """
    R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric')
    
    # NOTE:  Conversion of dataframes only works if pandas2ri is activated, but we
    # NOTE:  don't want conversion to always be automatic, so we deactivate() after
    # NOTE:  we're done converting.
    pandas2ri.activate()
    r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1)
    r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2)
    pandas2ri.deactivate()
    
    # TODO:  Can we just activate/deactivate before/after R_function() without converting
    # TODO:  r_evalresp1/2 ahead of time?
    
    # Calculate the metric
    r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2)
    r_dataframe = _R_metricList2DF(r_metriclist)
    pandas2ri.activate()
    df = pandas2ri.ri2py_dataframe(r_dataframe)
    pandas2ri.deactivate()
    
    # Convert columns from R POSIXct to Python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
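The NOTE/TODO comments above describe the scoping pattern most snippets in this collection rely on: enable global pandas/R conversion only for the duration of a conversion, then restore the defaults. A minimal sketch of that pattern, assuming the rpy2 2.x API used above (rpy2 3.x replaces this idiom with a localconverter context manager):

import pandas as pd
from rpy2.robjects import pandas2ri

df = pd.DataFrame({"x": [1.0, 2.0], "y": [3.0, 4.0]})
pandas2ri.activate()             # enable pandas <-> R conversion globally
try:
    r_df = pandas2ri.py2ri(df)   # pandas.DataFrame -> R data.frame
    df2 = pandas2ri.ri2py(r_df)  # R data.frame -> pandas.DataFrame
finally:
    pandas2ri.deactivate()       # restore the default converters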
Code Example #2
File: limma.py Project: mfiers/rat
def run_simple(A, B):

    import numpy as np
    import pandas as pd
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    r = ro.r

    pandas2ri.activate()
    
    limma = importr('limma')
    edgeR = importr('edgeR')

    counts = pd.concat([A, B], axis=1)
    groups = r.factor(r.c(*([0] * A.shape[1] + [1] * B.shape[1])))
    ro.globalenv['exp'] = groups
                 
    design = r('model.matrix(~exp)')
    dge = r.DGEList(counts=counts)
    dge = r.calcNormFactors(dge)
    v = r.voom(dge, design, plot=False)
    fit = r.lmFit(v, design)
    fit = r.eBayes(fit)
    tt = r.topTable(fit, coef=r.ncol(design), number=1e12)
    ttidx = r['row.names'](tt)
    tt =  pandas2ri.ri2py(tt)
    cols = tt.columns.to_series()
    cols[0] = 'lfc'
    cols[3] = 'pval'
    cols[4] = 'padj'
    tt.columns = cols
    tt['slp'] = np.log10(tt['pval'])
    tt.loc[tt['lfc'] > 0, 'slp'] = -np.log10(tt.loc[tt['lfc'] > 0, 'pval'])
    tt.index = ttidx
    return tt
Code Example #3
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Makes a call to DESeq2 with SCRAN to
    perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Returns a list of DESeq2 results for each comparison
    """
    results = list()
    n_cells = len(counts.columns)
    try:
        pandas2ri.activate()
        deseq2 = RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
                sce = SingleCellExperiment(assays=list(counts=r_counts))
                return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list(set([round((min_cluster_size/2) / i) for i in [5,4,3,2,1]]))
            sce = scran.computeSumFactors(sce, clusters=r_clusters, sizes=sizes, positive=True)
        else:
            sizes = list(set([round((n_cells/2) * i) for i in [0.1,0.2,0.3,0.4,0.5]]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)   
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
                colData(dds)$conditions = as.factor(conditions)
                design(dds) = formula(~ conditions)
                return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A,B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
        pandas2ri.deactivate()
    except Exception as e:
        raise e
    return results
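A hypothetical invocation of deaScranDESeq2 following its docstring; the random counts matrix and condition labels below are made up for illustration, and RimportLibrary plus the R packages must be available:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
counts = pd.DataFrame(rng.poisson(5, size=(100, 20)),
                      index=["gene%d" % i for i in range(100)],
                      columns=["cell%d" % i for i in range(20)])
conds = ["ctrl"] * 10 + ["treated"] * 10   # one condition label per column
comparisons = [("treated", "ctrl")]        # (A, B) contrast pairs
results = deaScranDESeq2(counts, conds, comparisons, alpha=0.05)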
Code Example #4
def computeSumFactors(counts, scran_clusters=True):
    """ Compute normalization factors
    using the deconvolution method
    described in Marioni et al.
    Returns the computed size factors as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return: the normalization factors as a vector
    """
    n_cells = len(counts.columns)
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    if scran_clusters and n_cells >= 50:
        r_clusters = scran.quickCluster(as_matrix(r_counts),
                                        min(n_cells/10, 10),
                                        method="igraph")
        min_cluster_size = min(Counter(r_clusters).values())
        sizes = list(range(min(int(min_cluster_size/4), 10), 
                           min(int(min_cluster_size/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts), 
                                      clusters=r_clusters, sizes=sizes)
    else:
        sizes = list(range(min(int(n_cells/4), 10), 
                           min(int(n_cells/2), 50), 5))
        dds = scran.computeSumFactors(as_matrix(r_counts), sizes=sizes)        
    pandas_sf = pandas2ri.ri2py(dds)
    pandas2ri.deactivate()
    return pandas_sf
Code Example #5
def read_rdata(rdata_fullpath, table_name):
    """
    Loads an .RData file and returns its model_summary table as a pandas DataFrame.
    """
    from rpy2.robjects import pandas2ri, r
    import pandas
    pandas2ri.activate()

    # we want forward slashes for R
    rdata_fullpath_forR = rdata_fullpath.replace("\\", "/")
    print("Loading %s" % rdata_fullpath_forR)
    
    # read in the data from the R session with python
    r['load'](rdata_fullpath_forR)
    # check that it's there
    table_df = pandas2ri.ri2py(r['model_summary'])

    # fillna
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0: print("  Found %5d NA values in column %s" % (nullcount, col))
    table_df = table_df.fillna(0)
    for col in table_df.columns:
        nullcount = sum(pandas.isnull(table_df[col]))
        if nullcount > 0: print("  -> Found %5d NA values in column %s" % (nullcount, col))

    print("Read %d lines from %s" % (len(table_df), rdata_fullpath))
    return table_df
Code Example #6
def logCountsWithFactors(counts, size_factors):
    """ Uses the R package scater to log a matrix of counts (genes as rows)
    and a vector of size factor using the method normalize().
    :param counts: a matrix of counts (genes as rows)
    :param size_factors: a vector of size factors
    :return the normalized log counts (genes as rows)
    """
    columns = counts.columns
    indexes = counts.index
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    scater = RimportLibrary("scran")
    r_call = """
        function(counts, size_factors){
          sce = SingleCellExperiment(assays=list(counts=as.matrix(counts)))
          sizeFactors(sce) = size_factors
          sce = normalize(sce)
          norm_counts = logcounts(sce)
          return(as.data.frame(norm_counts))
        }
    """
    r_func = r(r_call)
    r_norm_counts = r_func(r_counts, size_factors)
    pandas_norm_counts = pandas2ri.ri2py(r_norm_counts)
    pandas_norm_counts.index = indexes
    pandas_norm_counts.columns = columns
    pandas2ri.deactivate()
    return pandas_norm_counts
Code Example #7
 def testActivate(self):
     robjects.conversion.py2ri = robjects.default_py2ri
     self.assertNotEqual(rpyp.pandas2ri, robjects.conversion.py2ri)
     rpyp.activate()
     self.assertEqual(rpyp.pandas2ri, robjects.conversion.py2ri)
     rpyp.deactivate()
     self.assertEqual(robjects.default_py2ri, robjects.conversion.py2ri)
Code Example #8
 def testSeries(self):
     Series = pandas.core.series.Series
     s = Series(numpy.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
     rpyp.activate()
     rp_s = robjects.conversion.py2ri(s)
     rpyp.deactivate()
     self.assertEqual(rinterface.FloatSexpVector, type(rp_s))
Code Example #9
File: __init__.py Project: BioXiao/cgat
def dtwWrapper(data, rows, columns, k):
    '''
    wrapper function for dynamic time warping.
    includes use of exponential adaptive tuning function
    with temporal correlation if k > 0
    '''

    # not explicitly called, but needs to be in R environment
    DTW = importr("dtw")

    # create a data frame of zeros of size number of ids x number of ids
    # fill it with the calculated distance metric for each pair wise comparison

    df_ = pd.DataFrame(index=rows,
                       columns=columns)
    df_ = df_.fillna(0.0).astype(np.float64)

    # fill the array with dtw-distance values
    pandas2ri.activate()

    for i in rows:
        E.info("DTW %s" % i)
        for j in columns:
            series1 = data.loc[i].values.tolist()
            series2 = data.loc[j].values.tolist()
            DTW_value = (R.dtw(series1,
                               series2)).rx('distance')[0][0]
            cort_value = temporalCorrelate(series1, series2)
            tuned_value = adaptiveTune(cort_value, k)
            time_dist = DTW_value * tuned_value
            df_.loc[i, j] = float(time_dist)
            df_.loc[j, i] = float(time_dist)

    return df_
Code Example #10
File: event.py Project: jsignell/rain-gage-tools
    def variogram(self, i=0, plot_v=True, **kwargs):
        """
        Generate a variogram

        Parameters
        ----------
        self : Event object with at least one data column
        i : int data column index number (defaults to 0)
        plot_v : bool generate a plot of the variogram

        **kwargs (target_np, alpha, tol_hor, max_bnd, last_max)

        Returns
        -------
        v : Dataframe containing output from r-variogram function
        """
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()
        rfuncs = import_r_tools()
        
        if 'X' not in self.ll_cols:
            self.set_ll()
            
        df = self.df
        cols = self.data_cols
        
        r_df = df.loc[:,['X', 'Y', cols[i]]].dropna(how='any')
        v = pandas2ri.ri2py(rfuncs.get_iSVG(r_df, 3, **kwargs))
        if plot_v:
            v.plot(x='dist', y='gamma', marker = 'o', figsize=(8,4))
        return v
Code Example #11
def computeMnnBatchCorrection(counts):
    """Computes batch correction to a list of batches (data frames)
    where each data frame represents a batch (animal for instance).
    The batch correction is computed using Scran::mnnCorrect()
    from Marioni et al.
    :param counts: a list of matrices of counts
    :return: a list of batch-corrected matrices of counts
    """
    pandas2ri.activate()
    as_matrix = r["as.matrix"]
    meta = [(x.index,x.columns) for x in counts]
    r_counts = [as_matrix(pandas2ri.py2ri(x)) for x in counts]
    RimportLibrary("scran")
    r_call = """
        function(counts) {
           norm_counts = do.call(mnnCorrect, c(counts, cos.norm.out=FALSE));
           return(lapply(norm_counts$corrected, as.data.frame))
        }
    """
    r_func = r(r_call)
    norm_counts = list()
    for i,x in enumerate(r_func(r_counts)):
        norm_c = pandas2ri.ri2py(x)
        norm_c.index = meta[i][0]
        norm_c.columns = meta[i][1]
        norm_counts.append(norm_c)
    pandas2ri.deactivate()
    return norm_counts
Code Example #12
File: event.py Project: jsignell/rain-gage-tools
    def krige(self, i=0, v=None, step=1, res=True, plot_v=False, plot_k=True, animated=False, **plot_kwargs):
        """
        Krige the dataframe with a single data column or a column index number

        Parameters
        ----------
        self : Event object with at least one data column

        kwargs
        ------
        i : int data column index number (defaults to 0)
        v : variogram to use in determining sill and range
        step : grid interval to krige on (in km)
        res : bool detrend points before computing kriged values - default True
        plot_v : bool plot variogram - default False
        plot_k : bool plot kriged values - default True
        animated : bool return axis for animation - default False

        **plot_kwargs (cmap, s, latlon, basemap, shpfile, POT, locs, colors)

        Returns
        -------
        k : Dataframe containing output from r-krige function
        """
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()
        rfuncs = import_r_tools()
        
        if 'X' not in self.ll_cols:
            self.set_ll()
        
        if res:
            if not hasattr(self, 'res'):
                self.detrend()
            df = self.res
        else:
            df = self.df
        cols = self.data_cols
        
        r_df = df.loc[:,['X', 'Y', cols[i]]].dropna(how='any')
        if v is None:
            v = pandas2ri.ri2py(rfuncs.get_variogram(r_df))

        model = 'Sph'
        psill = r_df.var()[cols[i]]
        for j in range(len(v)):
            if v.gamma[j] > psill:
                rng = v.dist[j]
                break
        k = pandas2ri.ri2py(rfuncs.get_krige(r_df, psill, model, rng, step=step))
        k['lat'] = k.y/110.574
        k['lon'] = k.x/(111.320*(k['lat']*pi/180).apply(cos))
        self.k = k
        if plot_k and animated:
            return self.plot_krige(i, k, rng, step=step, res=res, animated=animated, **plot_kwargs)
        elif plot_k and not animated:
            self.plot_krige(i, k, rng, step=step, res=res, animated=animated, **plot_kwargs)
        else:
            return k
Code Example #13
File: databox.py Project: jsignell/point-process
    def get_features(self, d={}, thresh=.01, sigma=3, min_size=4, const=5, return_dict=False, buffer=False):
        '''
        Use r package SpatialVx to identify features.

        Parameters
        ----------
        thresh: .01
        sigma: 3
        min_size: 4
        const: 5
        buffer: False

        Return
        ------
        p: pd.Panel containing parameters characterizing the features found
        '''
        from rpy2 import robjects
        from rpy2.robjects.packages import importr
        from rpy2.robjects import pandas2ri
        pandas2ri.activate()
        SpatialVx = importr('SpatialVx')
        rsummary = robjects.r.summary
        r_tools = import_r_tools()

        ll = np.array([self.lon.flatten('F'), self.lat.flatten('F')]).T
        for i in range(self.box.shape[0]-1):
            hold = SpatialVx.make_SpatialVx(self.box[i,:,:], self.box[i+1,:,:], loc=ll)
            look = r_tools.FeatureFinder_gaussian(hold, nx=self.box.shape[2], ny=self.box.shape[1],
                                                  thresh=thresh, smoothpar=sigma, **(dotvars(min_size=min_size)))
            try:
                x = rsummary(look, silent=True)[0]
            except:
                continue
            px = pandas2ri.ri2py(x)
            df0 = pd.DataFrame(px, columns=['centroidX', 'centroidY', 'area', 'OrientationAngle',
                                          'AspectRatio', 'Intensity0.25', 'Intensity0.9'])
            df0['Observed'] = list(df0.index+1)
            m = SpatialVx.centmatch(look, criteria=3, const=const)
            p = pandas2ri.ri2py(m[12])
            df1 = pd.DataFrame(p, columns=['Forecast', 'Observed'])
            l = SpatialVx.FeatureMatchAnalyzer(m)
            try:
                p = pandas2ri.ri2py(rsummary(l, silent=True))
            except:
                continue
            df2 = pd.DataFrame(p, columns=['Partial Hausdorff Distance','Mean Error Distance','Mean Square Error Distance',
                                          'Pratts Figure of Merit','Minimum Separation Distance', 'Centroid Distance',
                                          'Angle Difference','Area Ratio','Intersection Area','Bearing', 'Baddeleys Delta Metric',
                                          'Hausdorff Distance'])
            df3 = df1.join(df2)

            d.update({self.time[i]: pd.merge(df0, df3, how='outer')})
        if return_dict:
            return(d)
        p = pd.Panel(d)
        if buffer:
            return(self.add_buffer(p))
        return(p)
Code Example #14
File: __init__.py Project: CGATOxford/cgat
def conditionDESeq2(data_frame, header, alpha, res_dir):
    '''
    Perform DESeq2-based analysis of condition:time interaction
    dependent differential expression
    '''

    E.info("Differential expression testing for %s" % header)
    cols = data_frame.columns

    # py2ri requires activation
    pandas2ri.activate()
    counts = pandas2ri.py2ri(data_frame)

    des_times = ro.IntVector([x.split(".")[1] for x in cols])
    des_reps = ro.StrVector([x.split(".")[2] for x in cols])
    des_cond = ro.StrVector([x.split(".")[0] for x in cols])
    genes = ro.StrVector([x for x in data_frame.index])

    # setup counts table and design frame

    R('''suppressPackageStartupMessages(library("DESeq2"))''')
    R('''sink(file="/dev/null")''')
    R('''times <- as.factor(%s)''' % des_times.r_repr())
    R('''reps <- c(%s)''' % des_reps.r_repr())
    R('''condition <- c(%s)''' % des_cond.r_repr())
    R('''design <- data.frame(times, reps, condition)''')
    R('''counts <- data.frame(%s)''' % counts.r_repr())
    R('''genes <- c(%s)''' % genes.r_repr())
    R('''rownames(counts) <- genes''')
    R('''rownames(design) <- colnames(counts)''')

    # use DESeq() with LRT and reduced formula.  Use effect
    # size moderation

    R('''dds <- DESeqDataSetFromMatrix(countData=counts, '''
      '''colData=design, '''
      '''design=~reps + times + condition + times:condition)''')
    R('''dds <- DESeq(dds, test="LRT", '''
      '''reduced=~reps + times + condition, betaPrior=T)''')
    R('''res <- results(dds)[order(results(dds)$padj, na.last=T), ]''')
    R('''res.df <- data.frame(res)''')

    # generate dispersion and MA plots
    R('''png("%s/%s-dispersions.png")''' % (res_dir,
                                            header))
    R('''plotDispEsts(dds)''')
    R('''dev.off()''')

    R('''png("%s/%s-MAplot.png")''' % (res_dir,
                                       header))
    R('''plotMA(res, alpha=%0.3f, ylim=c(-5,5))''' % alpha)
    R('''dev.off()''')
    R('''sink(file=NULL)''')

    df = pandas2ri.ri2py(R['res.df'])

    return df
Code Example #15
File: common.py Project: jsignell/point-process
def import_r_tools(filename='r-tools.R'):
    import os
    from rpy2.robjects import pandas2ri, r, globalenv
    from rpy2.robjects.packages import STAP
    pandas2ri.activate()
    path = os.path.dirname(os.path.realpath(__file__))
    with open(os.path.join(path, filename), 'r') as f:
        string = f.read()
    rfuncs = STAP(string, "rfuncs")
    return rfuncs
Code Example #16
File: __init__.py Project: CGATOxford/cgat
def treeCutting(infile,
                expression_file,
                cluster_file,
                cluster_algorithm,
                deepsplit=False):
    '''
    Use dynamic tree cutting to derive clusters for each
    resampled distance matrix
    '''
    wgcna_out = "/dev/null"

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t",
                       header=0, index_col=0)
    df = df.fillna(0.0)
    genes = df.index
    genes_r = ro.StrVector([g for g in genes])

    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(df)

    R.assign("distance_data", rdf)
    R.assign("gene_ids", genes_r)

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressPackageStartupMessages(library("WGCNA"))''')
    R('''suppressPackageStartupMessages(library("flashClust"))''')
    E.info("clustering data by %s linkage" % cluster_algorithm)
    R('''rownames(distance_data) <- gene_ids''')
    R('''clustering <- flashClust(as.dist(distance_data),'''
      ''' method='%(cluster_algorithm)s')''' % locals())
    if deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=T)''')
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''minClusterSize=50, deepSplit=F)''')

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''write.table(color_cut, file = '%(cluster_file)s','''
      '''sep="\t")''' % locals())
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')
    R('''sink(file=NULL)''')

    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])
    cluster_frame.columns = ['gene_id', 'cluster']
    cluster_frame.index = cluster_frame['gene_id']
    cluster_frame.drop(['gene_id'], inplace=True, axis=1)

    return cluster_frame
Code Example #17
 def testActivate(self):
     #FIXME: is the following still making sense ?
     self.assertNotEqual(rpyp.py2ri, robjects.conversion.py2ri)
     l = len(robjects.conversion.py2ri.registry)
     k = set(robjects.conversion.py2ri.registry.keys())
     rpyp.activate()
     self.assertTrue(len(conversion.py2ri.registry) > l)
     rpyp.deactivate()
     self.assertEqual(l, len(conversion.py2ri.registry))
     self.assertEqual(k, set(conversion.py2ri.registry.keys()))
Code Example #18
 def testRi2pandas(self):
     rdataf = robjects.r('data.frame(a=1:2, b=I(c("a", "b")), c=c("a", "b"))')
     rpyp.activate()
     pandas_df = robjects.conversion.ri2py(rdataf)
     rpyp.deactivate()
     self.assertIsInstance(pandas_df, pandas.DataFrame)
     self.assertEquals(('a', 'b', 'c'), tuple(pandas_df.keys()))
     self.assertEquals(pandas_df['a'].dtype, numpy.dtype('int32'))
     self.assertEquals(pandas_df['b'].dtype, numpy.dtype('O'))
     self.assertEquals(pandas_df['c'].dtype, numpy.dtype('O'))
Code Example #19
 def testSeries_issue264(self):
     Series = pandas.core.series.Series
     s = Series(('a', 'b', 'c', 'd', 'e'),
                index=pandas.Int64Index([0,1,2,3,4]))
     rpyp.activate()
     rp_s = robjects.conversion.py2ri(s)
     rpyp.deactivate()
     # segfault before the fix
     str(rp_s)
     self.assertEqual(rinterface.ListSexpVector, type(rp_s))
Code Example #20
File: analysis.py Project: jfnavarro/st_analysis
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters
    from the data using Scran::quickCluster"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    as_matrix = r["as.matrix"]
    clusters = scran.quickCluster(as_matrix(r_counts), min_size, method="igraph")
    n_clust = len(set(clusters))
    pandas2ri.deactivate()
    return n_clust
Code Example #21
File: functions.py Project: huangduan2018/hants
def Kriging_Interpolation_Array(input_array, x_vector, y_vector):
    """
    Interpolate data in an array using Ordinary Kriging

    Reference: https://cran.r-project.org/web/packages/automap/automap.pdf
    """
    # Total values in array
    n_values = np.isfinite(input_array).sum()
    # Load function
    pandas2ri.activate()
    robjects.r('''
                library(gstat)
                library(sp)
                library(automap)
                kriging_interpolation <- function(x_vec, y_vec, values_arr,
                                                  n_values){
                  # Parameters
                  shape <- dim(values_arr)
                  counter <- 1
                  df <- data.frame(X=numeric(n_values),
                                   Y=numeric(n_values),
                                   INFZ=numeric(n_values))
                  # Save values into a data frame
                  for (i in seq(shape[2])) {
                    for (j in seq(shape[1])) {
                      if (is.finite(values_arr[j, i])) {
                        df[counter,] <- c(x_vec[i], y_vec[j], values_arr[j, i])
                        counter <- counter + 1
                      }
                    }
                  }
                  # Grid
                  coordinates(df) = ~X+Y
                  int_grid <- expand.grid(x_vec, y_vec)
                  names(int_grid) <- c("X", "Y")
                  coordinates(int_grid) = ~X+Y
                  gridded(int_grid) = TRUE
                  # Kriging
                  krig_output <- autoKrige(INFZ~1, df, int_grid)
                  # Array
                  values_out <- matrix(krig_output$krige_output$var1.pred,
                                       nrow=length(y_vec),
                                       ncol=length(x_vec),
                                       byrow = TRUE)
                  return(values_out)
                }
                ''')
    kriging_interpolation = robjects.r['kriging_interpolation']
    # Execute kriging function and get array
    r_array = kriging_interpolation(x_vector, y_vector, input_array, n_values)
    array_out = np.array(r_array)
    # Return
    return array_out
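A hedged usage sketch for Kriging_Interpolation_Array: a small grid with missing cells is filled by ordinary kriging. Per the R code above, the array shape must be (len(y_vector), len(x_vector)); the values below are made up for illustration:

import numpy as np

x_vec = np.arange(0.0, 10.0)   # 10 columns
y_vec = np.arange(0.0, 5.0)    # 5 rows
arr = np.random.rand(5, 10)
arr[1, 3] = np.nan             # gaps for kriging to fill
filled = Kriging_Interpolation_Array(arr, x_vec, y_vec)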
Code Example #22

if __name__ == '__main__':
    pandas2ri.activate()
    # connect the databases:
    con_in = sqlite3.connect('clear_takeoff_test.db')
    con_out = sqlite3.connect('Data.db')
    con_test = sqlite3.connect('test.db')
    # prelude2_sql(con_test)
    # prelude1_Rdata(con_test)
    # prelude3_sql(con_test)
    prelude4_sql(con_test)
Code Example #23
def pandas_load(name):
    '''
    Loads an .rdata file (R dataframe file) and returns it as a pandas dataframe.
    :param name: .rdata filename (eg: 'subset.Rdata')
    :return: pandas dataframe object
    '''
    pandas2ri.activate()
    r.load(name)  # name = 'subset.fcuk.Rdata'
    # name_without_ext = r['.'.join(name.split('.')[-2::-1][::-1])]
    # print(r.ls())  # ls() - list of active objects in R env
    df = pandas2ri.ri2py(r[r.ls()[0]])
    return df
Code Example #24
 def testRi2pandas_issue207(self):
     d = robjects.DataFrame({'x': 1})
     rpyp.activate()
     try:
         ok = True
         robjects.globalenv['d'] = d
     except ValueError:
         ok = False
     finally:
         rpyp.deactivate()
         if 'd' in robjects.globalenv:
             del(robjects.globalenv['d'])
     self.assertTrue(ok)
Code Example #25
File: rmagic.py Project: hansenrl/cs249-2
def load_ipython_extension(ip):
    """Load the extension in IPython."""
       
    if pandas2ri:
        pandas2ri.activate()
    else:
        numpy2ri.activate()

    ip.register_magics(RMagics)
    # Initialising rpy2 interferes with readline. Since, at this point, we've
    # probably just loaded rpy2, we reset the delimiters. See issue gh-2759.
    if ip.has_readline:
        ip.readline.set_completer_delims(ip.readline_delims)
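Once the extension is loaded, the R magics become available in an IPython session. An assumed workflow with the modern module name rpy2.ipython (the historical name was rmagic):

%load_ext rpy2.ipython       # registers the RMagics shown above
%R x <- c(1, 2, 3); mean(x)  # run a line of R and get the result back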
Code Example #26
def computeNClusters(counts, min_size=20):
    """Computes the number of clusters
    from the data using Scran::quickCluster"""
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts.transpose())
    scran = RimportLibrary("scran")
    multicore = RimportLibrary("BiocParallel")
    multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))  
    as_matrix = r["as.matrix"]
    clusters = scran.quickCluster(as_matrix(r_counts), min_size)
    n_clust = len(set(clusters))
    pandas2ri.deactivate()
    return n_clust
Code Example #27
File: rmagic.py Project: gyenney/Tools
def load_ipython_extension(ip):
    """Load the extension in IPython."""

    if hasattr(baseconversion, 'activate'):
        # This is pandas2ri if pandas is installed,
        # or numpy2ri otherwise
        baseconversion.activate()

    ip.register_magics(RMagics)
    # Initialising rpy2 interferes with readline. Since, at this point, we've
    # probably just loaded rpy2, we reset the delimiters. See issue gh-2759.
    if ip.has_readline:
        ip.readline.set_completer_delims(ip.readline_delims)
Code Example #28
def run_eblink(tmp, tmp_dir, column_types, a, b, iterations, filenum, numrecords):
    '''
    Provides an interface with R to run ebLink in the background through R.
    '''
    pandas2ri.activate()
    # Get base packages
    # Import data to link
    data = ro.r('read.csv(file = "{}", header = T)'.format(tmp))
    # Set necessary variables
    ## X.c contains the categorical variables
    ## X.s contains the string variables
    ## p.c is the number of categorical variables
    ## p.s contains the number of string variables
    matrix = ro.r['as.matrix']
    c_cols = [x for x in column_types if column_types[x].upper() == 'C']
    xc = matrix(data[c_cols])
    s_cols = [x for x in column_types if column_types[x].upper() == 'S']
    xs = matrix(data[s_cols])
    pc = ro.IntVector([len([x for x in column_types.values() if x == 'C'])])
    ps = ro.IntVector([len([x for x in column_types.values() if x == 'S'])])
    # Number of iterations
    g = ro.IntVector([iterations])
    # Number of entries in file
    m = ro.IntVector([numrecords])
    # File number identifier
    fn = ro.IntVector(filenum)
    # Subjective choices for distortion probability prior
    a = ro.IntVector([a])
    b = ro.IntVector([b])
    # Steepness parameter; pre-set to recommended value
    c = ro.IntVector([STEEPNESS])
    # Edit distance function; can be swapped for others if desired
    ro.r("d <- function(string1,string2){adist(string1,string2)}")
    d = ro.r['d']
    # Loads in Gibbs sampler and plyr packages
    eb_pack = ro.r("source('{}', chdir = TRUE)".format(find('ebGibbsSampler.R', '../../')))
    plyr = importr("plyr")
    # Move to tmp directory to save results file
    os.chdir(tmp_dir)
    print('Running the gibbs sampler...')
    # Runs the gibbs sampler
    gibbs = ro.r['rl.gibbs']
    lam = gibbs(file_num = fn, X_s = xs, X_c=xc, num_gs=g, a=a, b=b, c=c, d=d, M=m)
    os.chdir('..')
    # Calculate estimated population sizes by finding number of uniques
    appl = ro.r['apply']
    ro.r("len_uniq <- function(x){length(unique(x))}")
    len_uniq = ro.r['len_uniq']
    estPopSize = appl(lam, 1, len_uniq)

    return np.array(lam), np.array(estPopSize)
Code Example #29
File: limma.py Project: mfiers/rat
def run2(counts, formula, normcounts = None):

    import numpy as np
    import pandas as pd
    from rpy2.robjects import pandas2ri
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    r = ro.r

    pandas2ri.activate()
    
    limma = importr('limma')
    edgeR = importr('edgeR')


    design_matrix = counts.T.reset_index()[counts.columns.names]
    ro.globalenv['design.matrix'] = design_matrix
    design = r('as.data.frame(model.matrix(' + formula + ', data=design.matrix))')

    dge = r.DGEList(counts=counts)
    dge = r.calcNormFactors(dge)
    v = r.voom(dge, design, plot=False)
    ro.globalenv['v'] = v
    if normcounts is not None:
        r('write.table(v, "' + normcounts + '",sep="\t",quote = F,col.names = NA)')
        
    fit = r.lmFit(v, design)
    fit = r.eBayes(fit)

    rv = []

    print(r.ncol(design)[0])
    for i in range(1, r.ncol(design)[0]):
        colname = r.colnames(design)[i]
        tt = r.topTable(fit, coef=i, number=1e12)
        ttidx = r['row.names'](tt)
        tt =  pandas2ri.ri2py(tt)
        cols = tt.columns.to_series()
        cols[0] = 'lfc'
        cols[3] = 'pval'
        cols[4] = 'padj'
        tt.columns = cols
        tt['slp'] = np.log10(tt['pval'])
        tt.loc[tt['lfc'] > 0, 'slp'] = -np.log10(tt.loc[tt['lfc'] > 0, 'pval'])
        if r.ncol(design)[0] > 2:
            #prepend colname to columns - only if there are more factors
            cols = tt.columns.to_series().apply(lambda x: '{}_{}'.format(colname, x))
            tt.columns = cols
        tt.index = ttidx

        rv.append(tt)
    return pd.concat(rv, axis=1)
Code Example #30
def computeRLEFactors(counts):
    """ Compute normalization size factors
    using the RLE method described in EdgeR and returns then as a vector.
    :param counts: a matrix of counts (genes as rows)
    :return returns the normalization factors a vector
    """
    pandas2ri.activate()
    r_counts = pandas2ri.py2ri(counts)
    edger = RimportLibrary("edgeR")
    as_matrix = r["as.matrix"]
    dds = edger.calcNormFactors(as_matrix(r_counts), method="RLE")
    pandas_sf = pandas2ri.ri2py(dds)
    pandas_cm = pandas2ri.ri2py(r.colSums(counts))
    pandas2ri.deactivate()
    return pandas_sf * pandas_cm
Code Example #31
File: ml.py Project: nehak0601/gramex
def r(code=None,
      path=None,
      rel=True,
      conda=True,
      convert=True,
      repo='https://cran.microsoft.com/',
      **kwargs):
    '''
    Runs the R script and returns the result.

    :arg str code: R code to execute.
    :arg str path: R script path. Cannot be used if code is specified
    :arg bool rel: True treats path as relative to the caller function's file
    :arg bool conda: True overrides R_HOME to use the Conda R
    :arg bool convert: True converts R objects to Pandas and vice versa
    :arg str repo: CRAN repo URL

    All other keyword arguments are passed as parameters
    '''
    # Use Conda R if possible
    if conda:
        r_home = _conda_r_home()
        if r_home:
            os.environ['R_HOME'] = r_home

    # Import the global R session
    try:
        from rpy2.robjects import r, pandas2ri, globalenv
    except ImportError:
        app_log.error('rpy2 not installed. Run "conda install rpy2"')
        raise
    except RuntimeError:
        app_log.error('Cannot find R. Set R_HOME env variable')
        raise

    # Set a repo so that install.packages() need not ask for one
    r('local({r <- getOption("repos"); r["CRAN"] <- "%s"; options(repos = r)})'
      % repo)

    # Activate or de-activate automatic conversion
    # https://pandas.pydata.org/pandas-docs/version/0.22.0/r_interface.html
    if convert:
        pandas2ri.activate()
    else:
        pandas2ri.deactivate()

    # Pass all other kwargs as global environment variables
    for key, val in kwargs.items():
        globalenv[key] = val

    if code and path:
        raise RuntimeError('Use r(code=) or r(path=...), not both')
    if path:
        # if rel=True, load path relative to parent directory
        if rel:
            stack = inspect.getouterframes(inspect.currentframe(), 2)
            folder = os.path.dirname(os.path.abspath(stack[1][1]))
            path = os.path.join(folder, path)
        result = r.source(path, chdir=True)
        # source() returns a withVisible: $value and $visible. Use only the first
        result = result[0]
    else:
        result = r(code)

    return result
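Hypothetical calls to this r() helper, per its docstring; the code strings, script name, and parameter values below are illustrative only:

# kwargs land in the R global environment before the code runs
result = r(code='x * 2', x=10)
# or source a script relative to the calling file
result = r(path='analysis.R', rel=True)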
Code Example #32
def R_var_importance(nsamples=40000, data_store=None):
    base = importr('base')
    ###################################################
    # load dataframe
    store = pd.HDFStore(data_store)
    print(store)

    #pandas2ri.activate()
    Xtrain = store['Xtrain']
    ytrain = store['ytrain']

    #Xtest, Xtrain, ytrain, Xval, yval, test_idx, val_idx = prepareAllFeatures()

    #sample
    if nsamples != -1:
        if isinstance(nsamples, str) and 'shuffle' in nsamples:
            print("Shuffle train data...")
            rows = np.random.choice(len(Xtrain.index),
                                    size=len(Xtrain.index),
                                    replace=False)
        else:
            rows = np.random.choice(len(Xtrain.index),
                                    size=nsamples,
                                    replace=False)

        print("unique rows: %6.2f" %
              (float(np.unique(rows).shape[0]) / float(rows.shape[0])))
        Xtrain = Xtrain.iloc[rows, :]
        ytrain = ytrain.iloc[rows]

    store.close()
    pandas2ri.activate()
    print(Xtrain.info())
    print(Xtrain.describe(include='all'))
    Xtrain_R = pandas2ri.py2ri_pandasdataframe(Xtrain)
    ytrain_R = pandas2ri.py2ri_pandasseries(ytrain)
    #print Xtrain_R

    ###################################################
    # R-code
    # http://stackoverflow.com/questions/27801409/get-field-values-from-rpy2-random-forest-object
    r = robjects.r
    r['options'](warn=-1)

    r.library('randomForest')
    rf = r.randomForest(Xtrain_R,
                        ytrain_R,
                        ntree=250,
                        importance=True,
                        do_trace=1)
    df_imp_R = rf.rx("importance")
    df_imp_R = base.as_data_frame(df_imp_R)
    df_imp = pandas2ri.ri2py(df_imp_R)
    df_imp = df_imp.sort_values(by=['importance.IncNodePurity'], ascending=False)
    print(df_imp)

    with pd.option_context('display.max_rows', 999, 'display.max_columns', 3):
        print(list(df_imp.index))

    #print r.dimnames(rf[8])
    r.varImpPlot(rf, sort=True, n_var=30)
Code Example #33
def load_rds(filename, types=None):
    import os
    import pandas as pd, numpy as np
    import rpy2.robjects as RO
    import rpy2.robjects.vectors as RV
    import rpy2.rinterface as RI
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()

    def load(data, types, rpy2_version=3):
        if types is not None and not isinstance(data, types):
            return np.array([])
        # FIXME: I'm not sure if I should keep two versions here
        # rpy2_version 2.9.X is more tedious but it handles BoolVector better
        # rpy2 version 3.0.1 converts bool to integer directly without dealing with
        # NA properly. It gives something like (0,1,-234235).
        # Possibly the best thing to do is to open an issue for it to the developers.
        if rpy2_version == 2:
            # below works for rpy2 version 2.9.X
            if isinstance(data, RI.RNULLType):
                res = None
            elif isinstance(data, RV.BoolVector):
                data = RO.r['as.integer'](data)
                res = np.array(data, dtype=int)
                # Handle c(NA, NA) situation
                if np.sum(np.logical_and(res != 0, res != 1)):
                    res = res.astype(float)
                    res[res < 0] = np.nan
                    res[res > 1] = np.nan
            elif isinstance(data, RV.FactorVector):
                data = RO.r['as.character'](data)
                res = np.array(data, dtype=str)
            elif isinstance(data, RV.IntVector):
                res = np.array(data, dtype=int)
            elif isinstance(data, RV.FloatVector):
                res = np.array(data, dtype=float)
            elif isinstance(data, RV.StrVector):
                res = np.array(data, dtype=str)
            elif isinstance(data, RV.DataFrame):
                res = pd.DataFrame(data)
            elif isinstance(data, RV.Matrix):
                res = np.matrix(data)
            elif isinstance(data, RV.Array):
                res = np.array(data)
            else:
                # I do not know what to do for this
                # But I do not want to throw an error either
                res = str(data)
        else:
            if isinstance(data, RI.NULLType):
                res = None
            else:
                res = data
        if isinstance(res, np.ndarray) and res.shape == (1, ):
            res = res[0]
        return res

    def load_dict(res, data, types):
        '''load data to res'''
        names = data.names if not isinstance(data.names, RI.NULLType) else [
            i + 1 for i in range(len(data))
        ]
        for name, value in zip(names, list(data)):
            if isinstance(value, RV.ListVector):
                res[name] = {}
                res[name] = load_dict(res[name], value, types)
            else:
                res[name] = load(value, types)
        return res

    #
    if not os.path.isfile(filename):
        raise IOError('Cannot find file ``{}``!'.format(filename))
    rds = RO.r['readRDS'](filename)
    if isinstance(rds, RV.ListVector):
        res = load_dict({}, rds, types)
    else:
        res = load(rds, types)
    return res
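An assumed usage of load_rds (the filename is hypothetical): an R list comes back as a nested dict via load_dict(), any other object as a numpy/pandas value via load():

res = load_rds('fit.rds')  # hypothetical file; a ListVector becomes a nested dict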
Code Example #34
    def analyze(
            self,
            ground_truth: np.array = None,
            r_home: str = "",
            r_path: str = r"",
            alpha: float = 0.05,
    ) -> Tuple[pd.DataFrame, Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]]:
        """
        Analyzes results from the R script for SCDC from the scdney package.
        It is assumed that the effect on the first cell type is significant; all others are not.

        Parameters
        ----------
        ground_truth
            binary array for comparison to ground truth
        r_home
            path to R installation on your machine, e.g. "C:/Program Files/R/R-4.0.3"
        r_path
            path to R executable on your machine, e.g. "C:/Program Files/R/R-4.0.3/bin/x64"
        alpha
            p-value cutoff


        Returns
        -------
        summary and classification results

        Tuple
            Tuple(raw summary from R, True positive...)
        """

        os.environ["R_HOME"] = r_home
        os.environ["PATH"] = r_path + ";" + os.environ["PATH"]

        if ground_truth is None:
            ground_truth = np.zeros(self.k)

        import rpy2.robjects as rp
        from rpy2.robjects import numpy2ri, pandas2ri
        numpy2ri.activate()
        pandas2ri.activate()

        r_summary = rp.r(f"""
            library(scdney)
            library(tidyverse)
            library(broom.mixed)
            clust = scDC_noClustering({rp.vectors.StrVector(self.scdc_celltypes).r_repr()}, 
                                      {rp.vectors.StrVector(self.scdc_subject).r_repr()},
                                             calCI=TRUE,
                                             calCI_method=c("BCa"),
                                             nboot=100)

            glm = fitGLM(clust, {rp.vectors.StrVector(self.scdc_sample_cond).r_repr()}, pairwise=FALSE, subject_effect=FALSE)
            sum = summary(glm$pool_res_fixed)
            sum
            """)

        r_summary = pd.DataFrame(r_summary)

        p_values = r_summary.loc[r_summary["term"].str.contains("condCond_1"), "p.value"].values

        true_indices = np.where(ground_truth == True)[0]
        false_indices = np.where(ground_truth == False)[0]

        pval = np.nan_to_num(np.array(p_values), nan=1)
        tp = sum(pval[true_indices] < alpha)
        fn = sum(pval[true_indices] >= alpha)
        tn = sum(pval[false_indices] >= alpha)
        fp = sum(pval[false_indices] < alpha)

        return r_summary, (tp, tn, fp, fn)
Code Example #35
    def fit_model(
            self,
            method: str = "we.eBH",
            r_home: str = "",
            r_path: str = r"",
            *args,
            **kwargs
    ):
        """
        Fits ALDEx2 model.

        Parameters
        ----------
        method
            method that is used to calculate p-values (column name in ALDEx2's output)
        r_home
            path to R installation on your machine, e.g. "C:/Program Files/R/R-4.0.3"
        r_path
            path to R executable on your machine, e.g. "C:/Program Files/R/R-4.0.3/bin/x64"
        args
            passed to `ALDEx2.clr`
        kwargs
            passed to `ALDEx2.clr`

        Returns
        -------

        """

        os.environ["R_HOME"] = r_home
        os.environ["PATH"] = r_path + ";" + os.environ["PATH"]

        K = self.y.shape[1]

        if self.y.shape[0] == 2:
            p_val = [0 for _ in range(K)]
            self.result = None
        else:

            import rpy2.robjects as rp
            from rpy2.robjects import numpy2ri, pandas2ri
            numpy2ri.activate()
            pandas2ri.activate()
            import rpy2.robjects.packages as rpackages
            aldex2 = rpackages.importr("ALDEx2")

            x_fact = pd.factorize(self.x)[0]

            cond = rp.vectors.FloatVector(x_fact.astype("str").flatten().tolist())

            X_t = self.y.T
            nr, nc = X_t.shape
            X_r = rp.r.matrix(X_t, nrow=nr, ncol=nc)

            if "denom" in kwargs.keys():
                kwargs["denom"] = rp.vectors.FloatVector(kwargs["denom"])

            aldex_out = aldex2.aldex_clr(X_r, cond, *args, **kwargs)
            aldex_out = aldex2.aldex_ttest(aldex_out)
            aldex_out = pd.DataFrame(aldex_out)

            p_val = aldex_out.loc[:, method]

            self.result = aldex_out

        self.p_val = p_val
Code Example #36
def _quantile_normalize(job_context: Dict,
                        ks_check=True,
                        ks_stat=0.001) -> Dict:
    """
    Apply quantile normalization.

    """
    # Prepare our QN target file
    organism = job_context['organism']
    qn_target = utils.get_most_recent_qn_target_for_organism(organism)

    if not qn_target:
        logger.error(
            "Could not find QN target for Organism!",
            organism=organism,
            dataset_id=job_context['dataset'].id,
            dataset_data=job_context['dataset'].data,
            processor_job_id=job_context["job"].id,
        )
        job_context['dataset'].success = False
        job_context[
            'job'].failure_reason = "Could not find QN target for Organism: " + str(
                organism)
        job_context[
            'dataset'].failure_reason = "Could not find QN target for Organism: " + str(
                organism)
        job_context['dataset'].save()
        job_context['job'].success = False
        job_context[
            'failure_reason'] = "Could not find QN target for Organism: " + str(
                organism)
        return job_context
    else:
        qn_target_path = qn_target.sync_from_s3()
        qn_target_frame = pd.read_csv(qn_target_path,
                                      sep='\t',
                                      header=None,
                                      index_col=None,
                                      error_bad_lines=False)

        # Prepare our RPy2 bridge
        pandas2ri.activate()
        preprocessCore = importr('preprocessCore')
        as_numeric = rlang("as.numeric")
        data_matrix = rlang('data.matrix')

        # Convert the smashed frames to an R numeric Matrix
        # and the target Dataframe into an R numeric Vector
        target_vector = as_numeric(qn_target_frame[0])
        merged_matrix = data_matrix(job_context['merged_no_qn'])

        # Perform the Actual QN
        reso = preprocessCore.normalize_quantiles_use_target(
            x=merged_matrix, target=target_vector, copy=True)

        # Verify this QN, related: https://github.com/AlexsLemonade/refinebio/issues/599#issuecomment-422132009
        set_seed = rlang("set.seed")
        combn = rlang("combn")
        ncol = rlang("ncol")
        ks_test = rlang("ks.test")
        which = rlang("which")

        set_seed(123)

        n = ncol(reso)[0]
        m = 2
        if n >= m:
            combos = combn(ncol(reso), 2)

            # Convert to NP, Shuffle, Return to R
            ar = np.array(combos)
            np.random.shuffle(np.transpose(ar))
            nr, nc = ar.shape
            combos = ro.r.matrix(ar, nrow=nr, ncol=nc)

            # adapted from
            # https://stackoverflow.com/questions/9661469/r-t-test-over-all-columns
            # apply KS test to randomly selected pairs of columns (samples)
            for i in range(1, min(ncol(combos)[0], 100)):
                value1 = combos.rx(1, i)[0]
                value2 = combos.rx(2, i)[0]

                test_a = reso.rx(True, value1)
                test_b = reso.rx(True, value2)

                # RNA-seq has a lot of zeroes in it, which
                # breaks the ks_test. Therefore we want to
                # filter them out. To do this we drop the
                # lowest half of the values. If there's
                # still zeroes in there, then that's
                # probably too many zeroes so it's okay to
                # fail.
                median_a = np.median(test_a)
                median_b = np.median(test_b)

                # `which` returns indices which are
                # 1-indexed. Python accesses lists with
                # zero-indexes, even if that list is
                # actually an R vector. Therefore subtract
                # 1 to account for the difference.
                test_a = [test_a[i - 1] for i in which(test_a > median_a)]
                test_b = [test_b[i - 1] for i in which(test_b > median_b)]

                # The python list comprehension gives us a
                # python list, but ks_test wants an R
                # vector so let's go back.
                test_a = as_numeric(test_a)
                test_b = as_numeric(test_b)

                ks_res = ks_test(test_a, test_b)
                statistic = ks_res.rx('statistic')[0][0]
                pvalue = ks_res.rx('p.value')[0][0]

                job_context['ks_statistic'] = statistic
                job_context['ks_pvalue'] = pvalue

                # We're unsure of how stringent to be about
                # the pvalue just yet, so we're extra lax
                # rather than failing tons of tests. This may need tuning.
                if ks_check:
                    if statistic > ks_stat or pvalue < 0.8:
                        job_context['ks_warning'] = (
                            "Failed Kolmogorov Smirnov test! Stat: " +
                            str(statistic) + ", PVal: " + str(pvalue))
        else:
            logger.warning(
                "Not enough columns to perform KS test - either bad smash or single saple smash.",
                dset=job_context['dataset'].id)

        # And finally convert back to Pandas
        ar = np.array(reso)
        new_merged = pd.DataFrame(ar,
                                  columns=job_context['merged_no_qn'].columns,
                                  index=job_context['merged_no_qn'].index)
        job_context['merged_qn'] = new_merged
        merged = new_merged
    return job_context
Code Example #37
"""
This module is used to analyze the result of the executeR module
"""

import numpy as np
import pandas as pd
import pickle
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()

from executeR import DataframeStore

PICKLE_PATH = '../files/'

if __name__ == '__main__':
    rsnips = pickle.load(open(PICKLE_PATH + "r_dfs.pkl", "rb")).pairs
    # print(type(rdict))
    print(len(rsnips))
    # TODO check if all ndarrays were actually supposed to be Series; something
    # might have been lost in translation for vectors (it might need to be explicit)
    uniques = set()
    count_results = 0
    count_errors = 0
    types = []
    for k in rsnips:
        expr = k['expr']
        out = k['test_results']
        # if type(v) == np.ndarray:
        if expr in uniques: continue
        errors = 0
Code Example #38
 def pd_active(self):
    pandas2ri.activate()
Code Example #39
def pythonWrapper4Pet(dataframe, snps, covars,
                      trait1, trait2, model1,
                      scriptsdir,
                      model2, resamples=999):
    '''
    This is just a Python wrapper around the R code
    for the PET calculations.

    '''
    py2ri.activate()

    E.info("Checking regression models")
    if model1 == "logistic":
        R('''trait1.mod <- binomial''')
        R('''trait1.link <- "logit" ''')
    elif model1 == "linear":
        R('''trait1.mod <- gaussian''')
        R('''trait1.link <- "identity" ''')

    if model2 == "logistic":
        R('''trait2.mod <- binomial''')
        R('''trait2.link <- "logit" ''')
    elif model2 == "linear":
        R('''trait2.mod <- gaussian''')
        R('''trait2.link <- "identity" ''')
    E.info("Running {} regression for trait 1: {}".format(model1,
                                                          trait1))
    E.info("Running {} regression for trait 2: {}".format(model2,
                                                          trait2))

    R('''source("%(scriptsdir)s/PET_functions.R")''' % locals())
    E.info("Pushing data objects into the R environment")
    # push everything into the R environment
    r_df = py2ri.py2ri_pandasdataframe(dataframe)
    R.assign("data.df", r_df)

    r_snps = ro.StrVector([sp for sp in snps])
    R.assign("snp.list", r_snps)

    E.info("Parsing covariates")
    covars = covars.split(",")
    r_covar = ro.StrVector([cv for cv in covars])
    R.assign("covar.list", r_covar)
    E.info("{} covariates found to adjust "
           "in regression  models".format(len(covars)))

    # clean up, replacing "missing values" with NAs for R
    R('''data.df[data.df == -9] <- NA''')
    R('''pet_results <- list()''')

    # loop over all SNP, calculate PCC and p-value
    # this takes a long time <- need to think of speed ups
    # possible Python-pure implementation, i.e. with LIMIX?
    E.info("Iteratively calculating PCC for all SNPs")
    R('''results <- loopPET(data.df=data.df, trait1="%(trait1)s", trait2="%(trait2)s", '''
      '''trait1.link=trait1.link, trait2.link=trait2.link, '''
      '''trait1.mod=trait1.mod, trait2.mod=trait2.mod, covars=covar.list,'''
      '''resamples=%(resamples)i, snp.list=snp.list)''' % locals())

    R('''out.res <- data.frame(do.call(rbind, results))''')
    R('''colnames(out.res) <- c("PCC", "pvalue")''')
    py_out = py2ri.ri2py_dataframe(R["out.res"])

    return py_out
Code Example #40
File: Txpsi_CeFra-lme.py Project: agillen/LABRAT
def getdpsis(psifile):
    #Given a table of psis, calculate LME-based p values
    psidf = pd.read_table(psifile, sep='\t', header=0, index_col=False)

    pandas2ri.activate()  # allow conversion between pandas dataframes and r dataframes

    #define R packages
    nlme = importr('nlme')
    base = importr('base')
    stats = importr('stats')
    qv = importr('qvalue')

    #define formulae
    fmla = Formula('value ~ 1 + conda + polyA')
    rndm = Formula('~ 1 | samples')
    nullfmla = Formula('value ~ 1 + polyA')
    nullrndm = Formula('~1 | samples')

    #Remove any gene that has a psi of NA in any sample
    psidf = psidf.dropna(axis=0)

    #Store relationships of conditions and the samples in that condition
    #It's important that this dictionary be ordered because we are going to be iterating through it
    fracs = ['cytosol', 'membrane', 'insoluble', 'total']

    for combination in combinations(fracs, 2):
        fraca = combination[0]
        fracb = combination[1]
        pvalues = []

        samp_conds = OrderedDict({
            fraca: {
                'polyA': [fraca + '_polyA_Rep1', fraca + '_polyA_Rep2'],
                'ribodep': [fraca + '_ribodep_Rep1', fraca + '_ribodep_Rep2']
            },
            fracb: {
                'polyA': [fracb + '_polyA_Rep1', fracb + '_polyA_Rep2'],
                'ribodep': [fracb + '_ribodep_Rep1', fracb + '_ribodep_Rep2']
            }
        })

        #Get a list of all samples
        samps = []
        for cond in samp_conds:
            for libprep in samp_conds[cond]:
                samps += samp_conds[cond][libprep]

        #Iterate through rows, making a dictionary from every row, turning it into a dataframe, then calculating p value
        genecounter = 0
        for index, row in psidf.iterrows():
            genecounter += 1
            if genecounter % 1000 == 0:
                print('Gene {0}...'.format(genecounter))

            d = getrowdict(index, row, samp_conds, fraca, fracb)

            #Turn this dictionary into a dataframe
            rowdf = pd.DataFrame.from_dict(d)

            #Get lme p value
            try:
                lm_alt = nlme.lme(fmla, random=rndm, data=rowdf,
                                  method='ML')  #test
                lm_null = nlme.lme(nullfmla,
                                   random=nullrndm,
                                   data=rowdf,
                                   method='ML')  #control
                logratio = (stats.logLik(lm_alt)[0] -
                            stats.logLik(lm_null)[0]) * 2
                pvalue = stats.pchisq(logratio, df=1, lower_tail=False)[0]
                #format decimal
                pvalue = float('{:.2e}'.format(pvalue))
            except RRuntimeError:
                print('RRuntime error for {0}!'.format(row['Gene']))
                pvalue = 1.0

            pvalues.append(pvalue)

        #Turn list of pvalues into qvalues
        pvec = FloatVector(pvalues)
        #Get qvalues object
        qobj = qv.qvalue(p=pvec)
        #qvalues are index 2 of qvalue object
        qvalues = list(qobj[2])
        #format decimal
        qvalues = [float('{:.2e}'.format(qvalue)) for qvalue in qvalues]

        #Add pvalues and qvalues to df
        pvalcolname = fraca + '_vs_' + fracb + '_pval'
        qvalcolname = fraca + '_vs_' + fracb + '_qval'
        psidf[pvalcolname] = pvalues
        psidf[qvalcolname] = qvalues
    psidf.to_csv('Drosophilapsi.pval.txt', sep='\t', header=True, index=False)
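
A minimal usage sketch for getdpsis(); the path is a placeholder, and the PSI table is assumed to have one row per gene and one column per sample following the <fraction>_<libprep>_Rep<N> naming used in samp_conds:

getdpsis('psis.txt')  # hypothetical input; writes Drosophilapsi.pval.txt with per-comparison p- and q-values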
コード例 #41
0
def r_cal_b(df):
    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri

    robjects.r('''
        # create a function `f`
        f <- function(df, verbose=FALSE) {
            # spatstat provides ppp(), owin(), quadratcount(), Fest() and Gest(),
            # so load it before first use
            library(spatstat)
            if (verbose) {
                cat("I am calling f().\n")
            }
            xMin<-min(df$x)
            xMax<-max(df$x)
            yMin<-min(df$y)
            yMax<-max(df$y)
            
            xy_PPP <- with(df, ppp(x, y, c(xMin,xMin+50), c(yMin,yMin+50)))            
            #xy_PPP <- with(df, ppp(x, y, c(xMin,xMax), c(yMin,yMax)))
            #xy_PPP <- with(df, ppp(x, y, c(-25,25), c(-25,25)))
            #plot(xy_PPP)
            
            xy=df
            summary(xy)
            xy <- unique(xy)
            xy<-data.matrix(xy)
            # mean center
            mc <- apply(xy, 2, mean)   
            # standard distance
            sd <- sqrt(sum((xy[,1] - mc[1])^2 + (xy[,2] - mc[2])^2) / nrow(xy))
            #study area
            buffer_area=50*50
            #Density
            dens <- nrow(xy) / buffer_area
            win<-owin(c(-25,25), c(-25,25))

            #Abandoned: rspatial cannot be installed from Python, though it works in an R environment
            #library(devtools)
            #if (!require("rspatial")) devtools::install_github('rspatial/rspatial')
            #remotes::install_github("rspatial/rspatial")
            #devtools::install_github("rspatial/rspatial")
            #devtools::install_github("rstudio/sparkapi")
            
            #library(rspatial)
            #r <- raster(win)
            #quadrat counts
            quadrat_C<-quadratcount(xy_PPP,nx=5,ny=5)
            #plot(quadrat_C)
            # number of quadrats
            quadrats <- sum(quadrat_C)
            f<-table(quadrat_C)
            f<-data.frame(f)
            # number of cases
            cases <- sum(as.integer(f$quadrat_C) * f$Freq)
            mu <- cases / quadrats
            
            ff <- data.frame(as.integer(f$quadrat_C),f$Freq)
            colnames(ff) <- c('K', 'X')
            ff$Kmu <- ff$K - mu
            ff$Kmu2 <- ff$Kmu^2
            ff$XKmu2 <- ff$Kmu2 * ff$X
            #The observed variance s2 is
            s2 <- sum(ff$XKmu2) / (sum(ff$X)-1)
            #the VMR is
            VMR <- s2 / mu
            
            #Estimators of the empty-space function F(r)
            Fs<-Fest(xy_PPP)
            #plot(Fs)
            F_km<-mean(Fs$km)
            
            #nearest-neighbour function G(r)
            Gs<-Gest(xy_PPP)
            G_km<-mean(Gs$km)
            
            newlist<-list(VMR,F_km,G_km)
            return(newlist)
        
            #return(VMR)
        }
        ''')
    r_f = robjects.r['f']
    pandas2ri.activate()
    r_DF = pandas2ri.py2ri(df[["x", "y"]])  # convert the pandas DataFrame to an R data.frame and pass it to the function defined via robjects.r()

    res = r_f(r_DF)  # result computed in R
    # print("+"*50)
    # print(res)
    return res
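
A minimal usage sketch for r_cal_b(), assuming a DataFrame of point coordinates; the 'x' and 'y' column names are what the R function above expects:

import numpy as np
import pandas as pd

pts = pd.DataFrame({'x': np.random.uniform(-25, 25, 100),
                    'y': np.random.uniform(-25, 25, 100)})
vmr, f_km, g_km = r_cal_b(pts)  # R list [VMR, F_km, G_km], each a length-1 vector
print(vmr[0], f_km[0], g_km[0])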
コード例 #42
0
def mdl_fit(model_vars, df, y_param, ci_level=0.95):
    """
    Function to fit final model and extract modelling statistics
    Input: model variables as a list, dataframe holding all the data, 
    dependent variable, confidence level for reporting statistics i.e. 0.95 for 95% 
    Output: dataframe with model coefficients and statistics   
    """
    #----------------------------------------------------------------------
    # Import necessary modules
    import rpy2.robjects.numpy2ri
    rpy2.robjects.numpy2ri.activate()
    import numpy as np
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    from rpy2.robjects.packages import STAP
    import scipy.stats as stats
    #----------------------------------------------------------------------
    # Fit R model
    # Set R function as string to fit model and return results
    string_ord_mdl = """
    mdl_func <- function(formula,df) {
    	library(VGAM)
    	mdl1=vglm(formula,family=propodds, data=df)
    
    	ll=logLik(mdl1)
        coefficients_df=coef(summary(mdl1))
        coefficient_cols=colnames(coefficients_df)
        coefficient_rows=rownames(coefficients_df)
    	output<-list(ll,coefficients_df,coefficient_cols,coefficient_rows)
        return(output)
    }
        """
    # Transform pandas dataframe to R format
    rdf = pandas2ri.py2ri(df)
    # Set R formula as string using the model parameters and dependent variable
    formula = 'as.ordered(' + y_param + ') ~ ' + "+".join(model_vars)
    # Define R function to be used in Python
    ord_ll = STAP(string_ord_mdl, "ord_ll")
    # Fit model
    output_R = ord_ll.mdl_func(formula, rdf)
    # Extract data and place them in Pandas dataframe
    coeff_df_temp = output_R[1]
    coeff_df = pandas2ri.ri2py_dataframe(coeff_df_temp)
    cols_df = list(output_R[2])
    rows_df = list(output_R[3])
    coeff_df.columns = cols_df
    coeff_df.index = rows_df
    #----------------------------------------------------------------------
    # Calculate statistics
    # Number of parameters
    n_vars = len(coeff_df)
    # Degrees of freedom for the t-distribution
    deg_free = len(df) - n_vars
    # Calculate alpha value from confidence interval
    alpha_ = 1.0 - ci_level
    # array to hold the low % confidence intervals
    low_arr = np.zeros(len(coeff_df))
    # array to hold the high % confidence intervals
    high_arr = np.zeros(len(coeff_df))
    # array to hold the Wald test p-values
    p_val_arr = np.zeros(len(coeff_df))
    # array to hold the t statistic
    t_value_arr = np.zeros(len(coeff_df))
    # loop counter variable
    index_arr = 0
    for index, row in coeff_df.iterrows():
        # Get standard error for variable coefficient from R model fit data
        std_error = row['Std. Error']
        # Get variable coefficient value from R model fit data
        coeff_value = row['Estimate']
        # Calculate t_critical statistic for desired confidence interval
        t_critical = stats.t.ppf(1 - (alpha_ / 2.), df=deg_free)
        # Calculate low - high confidence interval limits
        low_arr[index_arr] = coeff_value - (t_critical * std_error)
        high_arr[index_arr] = coeff_value + (t_critical * std_error)
        # t statistic calculation to get p-value
        t_value = coeff_value / std_error
        t_value_arr[index_arr] = t_value
        # Calculate p-value
        p_val_arr[index_arr] = 2.0 * \
            (1.0 - stats.t.cdf(np.abs(t_value), deg_free))
        index_arr += 1
    # Set arrays to dataframe columns
    coeff_df['Low ' + str((1.0 - alpha_) * 100) + '%'] = low_arr
    coeff_df['High ' + str((1.0 - alpha_) * 100) + '%'] = high_arr
    coeff_df['P Value'] = p_val_arr
    coeff_df['t Value'] = t_value_arr
    # Delete statistics of R model fit referring to normal distribution
    coeff_df.drop(['z value', 'Pr(>|z|)'], axis=1, inplace=True)
    # Return dataframe with model fit coefficients and statistics
    return coeff_df
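
A minimal usage sketch for mdl_fit() with hypothetical file and column names; the confidence-interval column labels follow the string formatting inside the function:

import pandas as pd

df = pd.read_csv('survey.csv')  # hypothetical data with 'rating', 'age', 'income'
coeffs = mdl_fit(['age', 'income'], df, 'rating', ci_level=0.95)
print(coeffs[['Estimate', 'Low 95.0%', 'High 95.0%', 't Value', 'P Value']])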
コード例 #43
0
ファイル: R_util.py プロジェクト: scgnn/scGNN
def generateLouvainCluster(edgeList):
    # no weights
    # G = nx.Graph(edgeList)

    # weighted edges via networkx: does not work
    # https://github.com/vtraag/louvain-igraph
    # https://python-louvain.readthedocs.io/en/latest/api.html
    # G = nx.Graph()
    # G.add_weighted_edges_from(edgeList)
    # partition = community.best_partition(G,weight='weight')
    # valueResults = []
    # for key in partition.keys():
    #     valueResults.append(partition[key])

    # df = pd.DataFrame()
    # df['Cluster']=valueResults

    # R:
    # https://github.com/dgrun/RaceID3_StemID2_package/blob/master/R/VarID_functions.R
    fromVec = []
    toVec = []
    weightVec = []
    for edge in edgeList:
        fromVec.append(edge[0])
        toVec.append(edge[1])
        weightVec.append(edge[2])

    import rpy2.robjects as ro
    from rpy2.robjects.packages import importr
    from rpy2.robjects import r, pandas2ri
    pandas2ri.activate()

    igraph = importr('igraph')
    base = importr('base')
    fromV = ro.FloatVector(fromVec)
    toV = ro.FloatVector(toVec)
    # weightV= ro.FloatVector([0.1,1.0,1.0,0.1])
    weightV = ro.FloatVector(weightVec)
    links = ro.DataFrame({'from': fromV, 'to': toV, 'weight': weightV})
    g = igraph.graph_from_data_frame(links, directed=False)
    cl = igraph.cluster_louvain(g)

    def as_dict(vector):
        """Convert an RPy2 ListVector to a Python dict"""
        result = {}
        for i, name in enumerate(vector.names):
            if isinstance(vector[i], ro.ListVector):
                result[name] = as_dict(vector[i])
            elif len(vector[i]) == 1:
                result[name] = vector[i][0]
            else:
                result[name] = vector[i]
        return result

    cl_dict = as_dict(cl)
    df = pd.DataFrame()
    # df['Cluster']=cl_dict['membership']
    size = float(len(set(cl_dict['membership'])))

    # zero-based cluster label for every vertex
    listResult = [int(m) - 1 for m in cl_dict['membership']]

    return listResult, size
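
A minimal usage sketch for generateLouvainCluster(); edgeList is assumed to hold (from, to, weight) triples, as unpacked at the top of the function:

edges = [(0, 1, 1.0), (1, 2, 0.5), (2, 0, 0.8), (3, 4, 1.0), (4, 5, 1.0)]
labels, n_clusters = generateLouvainCluster(edges)
print(labels, n_clusters)  # zero-based cluster label per vertex, number of clusters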
コード例 #44
0
def sctransform(adata,
                genes=2000,
                min_genes_per_cell=5,
                method='poisson',
                latent=None,
                batch=None,
                cores=1,
                memory=10,
                verbose=True):
    """
    Function to use scTransform. It needs at least the adata.obj['total_counts'] number of UMIs calculated in the data.
    """
    import numpy as np
    import rpy2.robjects as ro
    import anndata2ri
    import scanpy as sc
    from rpy2.robjects import pandas2ri
    from scipy.sparse import issparse
    import rpy2.rinterface_lib.callbacks
    import logging
    if not verbose:
        rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

    ro.r('library(scater)')
    ro.r('library(sctransform)')
    ro.r('library(future)')
    pandas2ri.activate()
    anndata2ri.activate()

    print('Filtering genes')
    sc.pp.filter_genes(adata, min_cells=min_genes_per_cell)

    if issparse(adata.X):
        ro.globalenv['rawMatrix'] = adata.X.T.todense()
    else:
        ro.globalenv['rawMatrix'] = adata.X.T

    latent_var = []

    if latent is None:
        ro.r('cells_info = as.data.frame( colSums(rawMatrix) )')
        ro.globalenv['cellnames'] = np.asarray(adata.obs_names)
        ro.r('rownames(cells_info) = cellnames')
    else:
        latent_var = latent
        ro.globalenv['cells_info'] = adata.obs[latent_var]
        latent_var = ['"data.' + i + '"' for i in latent_var]
    ro.globalenv['genes_name'] = adata.var_names

    ro.r('cell_df <- DataFrame(data = cells_info)')
    #ro.r('print(head(cell_df))')
    #ro.r('print(rownames(cell_df)[1:10])')
    #ro.r('rawMatrix=as.data.frame(rawMatrix)')
    ro.r('colnames(rawMatrix) <- rownames(cell_df)')
    ro.r('rownames(rawMatrix) <- genes_name')
    print('Configure future multithreading')
    ro.globalenv['cores'] = cores
    ro.globalenv['memory'] = memory
    ro.r('future::plan(strategy = \'multicore\', workers = cores)')
    ro.r('options(future.globals.maxSize = memory * 1024 ^ 3)')
    print('Run scTransform')
    ro.globalenv['genes'] = int(genes)
    ro.globalenv['min_genes_per_cell'] = int(min_genes_per_cell)
    ro.globalenv['method'] = method
    stringCommand = 'vst_out=vst( as.matrix(rawMatrix), cell_attr=cell_df, n_genes=genes, method=method, show_progress=TRUE, min_cells=min_genes_per_cell, return_corrected_umi=TRUE'
    #latent_var = ['"data.'+i+'"' for i in latent_var]
    if batch is not None:
        batch = '"data.' + batch + '"'
        stringCommand = stringCommand + ', batch_var=' + batch
        if latent is not None:
            latent_var.remove(batch)
    if ((len(latent_var) > 1) and
        (batch is not None)) | ((len(latent_var) >= 1) and (batch is None)):
        #print(latent_var)
        stringCommand = stringCommand + ', latent_var=c(' + ','.join(
            latent_var) + ')'
    stringCommand += ')'
    print("Running the command:", stringCommand)
    ro.r(stringCommand)
    print('Extract results')
    new_matrix = ro.r('vst_out$y')
    sct_genes = ro.r('rownames(vst_out$model_pars)')
    all_genes = ro.r('rownames(vst_out$y)')
    umi_corrected = ro.r('vst_out$umi_corrected')

    adata = adata[:, all_genes].copy()
    adata.var['highly_variable'] = [i in sct_genes for i in adata.var_names]
    adata.layers['norm_sct'] = np.transpose(new_matrix)
    adata.layers['umi_corr'] = umi_corrected.T.copy()

    return adata
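
A minimal usage sketch for sctransform(), assuming scanpy is installed; calculate_qc_metrics is one way to populate the required adata.obs['total_counts']:

import scanpy as sc

adata = sc.datasets.pbmc3k()                     # any AnnData with raw counts works
sc.pp.calculate_qc_metrics(adata, inplace=True)  # adds adata.obs['total_counts']
adata = sctransform(adata, genes=2000, cores=2, memory=8)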
コード例 #45
0
def runme(count_matrix_path, design_matrix_path, gene_column):
    import sys

    import pandas as pd
    import rpy2
    from rpy2.robjects import pandas2ri, Formula, r

    assert rpy2.__version__.startswith('2.9'), 'Please install rpy2 2.9.1 to run this script'
    assert pd.__version__.startswith('0.19'), 'Please install pandas 0.19 to run this script'

    pandas2ri.activate()
    from rpy2.robjects.packages import importr

    try:
        deseq = importr('DESeq2')
    except Exception:
        raise EnvironmentError('Please install DESeq2 in your R environment')

    # Necessary to translate R dataframe back to Pandas
    to_dataframe = r('function(x) data.frame(x)')

    print('Loading data with pandas')

    count_matrix_df = pd.read_csv(
        count_matrix_path,
        sep=',',
    )

    design_matrix_df = pd.read_csv(design_matrix_path, sep=',', index_col=0)

    class py_DESeq2:
        def __init__(self,
                     count_matrix,
                     design_matrix,
                     design_formula,
                     gene_column='id'):
            try:
                assert gene_column in count_matrix.columns, 'Wrong gene id column name'
                gene_id = count_matrix[gene_column]
            except AttributeError:
                sys.exit('Wrong Pandas dataframe?')

            self.dds = None
            self.deseq_result = None
            self.resLFC = None
            self.comparison = None
            self.normalized_count_matrix = None
            self.gene_column = gene_column
            self.gene_id = count_matrix[self.gene_column]

            count_matrix = count_matrix.drop(gene_column, axis=1)

            print(
                f'Number of columns in counts data {count_matrix.shape[1]} | '
                f'Number of rows in design matrix {design_matrix.shape[0]}')

            # Load dataframe into R environment
            # Important: Change to r.data() if you use numpys and rpy2 latests versions
            count_matrix = pandas2ri.py2ri(count_matrix)

            # Assign columns to NULL
            count_matrix.names = rpy2.rinterface.NULL

            self.count_matrix = count_matrix

            self.design_matrix = pandas2ri.py2ri(design_matrix)

            self.design_formula = Formula(design_formula)

        def run_deseq(self, **kwargs):
            self.dds = deseq.DESeqDataSetFromMatrix(
                countData=self.count_matrix,
                colData=self.design_matrix,
                design=self.design_formula)
            self.dds = deseq.DESeq(self.dds, **kwargs)
            # Previous script had "deseq.counts" instead
            self.normalized_count_matrix = deseq.counts_DESeqDataSet(
                self.dds, normalized=True)

        def get_deseq_result(self, **kwargs):

            self.comparison = deseq.resultsNames(self.dds)

            self.deseq_result = deseq.results(self.dds, **kwargs)
            self.deseq_result = to_dataframe(self.deseq_result)
            self.deseq_result = pandas2ri.ri2py(
                self.deseq_result)  ## back to pandas dataframe
            self.deseq_result[self.gene_column] = self.gene_id.values
            return self.deseq_result

    print('Creating R objects')
    deseq2_exp = py_DESeq2(count_matrix=count_matrix_df,
                           design_matrix=design_matrix_df,
                           design_formula='~ class_label',
                           gene_column=gene_column)

    print('Running DESeq2 scripts...please be patient')

    deseq2_exp.run_deseq()

    print('Almost done...getting the results ready')

    results = deseq2_exp.get_deseq_result()

    results.to_csv('results.csv')

    print('Done!')
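
A minimal usage sketch for runme(); the paths are placeholders, the count matrix CSV is assumed to have an 'id' gene column plus one column per sample, and the design matrix CSV to be indexed by sample with a 'class_label' column matching the design formula:

runme('count_matrix.csv', 'design_matrix.csv', gene_column='id')  # writes results.csv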
コード例 #46
0
def calculate_measures(x):
    # Save ts version of our data for some of the below functions
    #rbase.set_seed(123) # reproducibility seed
    #x_ts_contiguous = r.ts(FloatVector(na_contiguous(x)))
    #print(x_ts_contiguous)

    # Now "activate" pandas2ri and numpy2ri
    pandas2ri.activate()
    numpy2ri.activate()

    N = len(x)
    freq = find_freq_r(x)
    fx = (math.exp((freq - 1) / 50) - 1) / (1 + math.exp((freq - 1) / 50))

    # Decomposition
    decomp_x = decompose(x)

    # Adjust data
    # Unfortunately it looks like frequency is calculated a different way in the decompose function
    # Thus there may be data for which this branch is evaluated when 'seasonality' is null
    # Add an extra check so this is skipped if all the seasonality values are null
    #print(decomp_x['seasonality'])
    if freq > 1 and (not decomp_x['seasonality'].isnull().all()):
        fit = decomp_x['trend'] + decomp_x['seasonality']
    else:
        # Nonseasonal data
        fit = decomp_x['trend']
    adj_x = decomp_x['x'] - fit + np.mean(decomp_x['trend'].dropna())

    # Backtransformation of adjusted data
    if decomp_x['transform']:
        # The below line of code doesn't work for some reason
        #t_adj_x = inv_boxcox(adj_x.values, decomp_x['lambda'])
        # Use actual formula instead (but do inverse because we're solving for x)
        '''
        The Box-Cox transform is given by:

            y = (x**lmbda - 1) / lmbda,  for lmbda > 0
                log(x),                  for lmbda = 0
        '''
        if decomp_x['lambda'] == 0:
            # Assuming base of 10 (x = 10^y)
            t_adj_x = 10**adj_x
        else:
            # x = ((y * lambda) + 1) ^ (1/lambda)
            t_adj_x = ((adj_x * decomp_x['lambda']) + 1)**(1 /
                                                           decomp_x['lambda'])
    else:
        t_adj_x = adj_x

    # Trend and seasonal measures
    v_adj = np.var(adj_x.dropna())
    threshold = 0.00000000001
    if (freq > 1):
        detrend = decomp_x['x'] - decomp_x['trend']
        deseason = decomp_x['x'] - decomp_x['seasonality']

        if np.var(deseason.dropna()) < threshold:
            trend = 0
        else:
            trend = max(0, min(1, 1 - (v_adj / np.var(deseason.dropna()))))
        if np.var(detrend.dropna()) < threshold:
            seasonality = 0
        else:
            seasonality = max(0, min(1,
                                     1 - (v_adj / np.var(detrend.dropna()))))
    else:
        # Nonseasonal data
        if np.var(decomp_x['x'].dropna()) < threshold:
            trend = 0
        else:
            trend = max(0, min(1,
                               1 - (v_adj / np.var(decomp_x['x'].dropna()))))
        seasonality = 0

    measures = [fx, trend, seasonality]

    # Measures on original data
    xbar = np.mean(x.dropna())
    std = np.std(x.dropna())

    # Serial correlation (make sure box pierce statistic is returned as well)
    #bp = boxpierce(x, lags=max_lag)
    #Had to fix stattest module in pypr package via: https://gist.github.com/betterxys/1def38e1fcbb7f3b2dab2393bcea52f0
    max_lag = 10
    lbvalue, pvalue, bpvalue, bppvalue = acorr_ljungbox(x,
                                                        lags=max_lag,
                                                        boxpierce=True)
    # The above returns values for each lag, so just grab the final value
    Q = bpvalue[-1] / (N * max_lag)
    fQ = f2_transformation(Q, 7.53, 0.103)

    # Nonlinearity (THIS REQUIRES THE TIMESERIES OBJECT VERSION OF OUR DATA)
    '''
    non_linear_test = rtseries.terasvirta_test_ts(x_ts_contiguous,type = "Chisq")
    #non_linear_test = rtseries.terasvirta_test_default(y=x_contiguous,x=x_contiguous.index.dayofyear,type = "Chisq")
    p = non_linear_test[np.where(non_linear_test.names == 'statistic')[0].item()][0]
    fp = f1_transformation(p,0.069,2.304)
    '''
    fp = None

    # Skewness
    skew = abs(np.mean((x.dropna() - xbar)**3) / std**3)
    fs = f1_transformation(skew, 1.510, 5.993)

    # Kurtosis
    kurtosis = np.mean((x.dropna() - xbar)**4) / std**4
    fk = f1_transformation(kurtosis, 2.273, 11567)

    # Hurst=d+0.5 where d is fractional difference
    hurst = rfracdiff.fracdiff(na_contiguous(x), 0, 0)
    H = hurst[np.where(hurst.names == 'd')[0].item()].item() + 0.5

    # Lyapunov Exponent
    if freq > (N - 10):
        # There is insufficient data, declare this variable as none
        fLyap = None
    else:
        Ly = np.zeros(N - freq)
        for i in range(0, (N - freq)):
            diffs = abs(x.iloc[i] - x)
            date_idx = diffs.sort_values().index
            int_idx = pd.Index(
                [diffs.index.get_loc(date) for date in date_idx])
            idx = int_idx[int_idx < (N - freq)]
            j = idx[1]
            try:
                Ly[i] = math.log(
                    abs((x.iloc[i + freq] - x.iloc[j + freq]) /
                        (x.iloc[i] - x.iloc[j]))) / freq
            except ValueError:  # domain error, means log(0) was taken
                Ly[i] = 0
            if (np.isnan(Ly[i]) or (Ly[i] == np.Inf) or (Ly[i] == -np.Inf)):
                Ly[i] = np.nan
        Lyap = np.mean(Ly[~np.isnan(Ly)])
        fLyap = math.exp(Lyap) / (1 + math.exp(Lyap))

    measures = measures + [fQ, fp, fs, fk, H, fLyap]

    # Measures on adjusted data
    xbar = np.mean(t_adj_x.dropna())
    std = np.std(t_adj_x.dropna())

    # Serial correlation (make sure box pierce statistic is returned as well)
    #bp = boxpierce(adj_x, lags=max_lag)
    max_lag = 10
    lbvalue, pvalue, bpvalue, bppvalue = acorr_ljungbox(na_contiguous(adj_x),
                                                        lags=max_lag,
                                                        boxpierce=True)
    # The above returns values for each lag, so just grab the final value
    Q = bpvalue[-1] / (N * max_lag)
    fQ = f2_transformation(Q, 7.53, 0.103)

    # Nonlinearity (add try/except block to capture data where this doesn't work)
    # (THIS REQUIRES THE TIMESERIES OBJECT VERSION OF OUR DATA)
    try:
        adj_x_contiguous = na_contiguous(adj_x)
        non_linear_test = rtseries.terasvirta_test_ts(adj_x_contiguous,
                                                      type="Chisq")
        #non_linear_test = rtseries.terasvirta_test_default(y=adj_x_contiguous,x=adj_x_contiguous.index.dayofyear,type = "Chisq")
        p = non_linear_test[np.where(
            non_linear_test.names == 'statistic')[0].item()][0]
        fp = f1_transformation(p, 0.069, 2.304)
    except ValueError:
        print('This block did not work for the following data:\n', adj_x)

    # Skewness
    skew = abs(np.mean((t_adj_x.dropna() - xbar)**3) / (std**3))
    fs = f1_transformation(skew, 1.510, 5.993)

    # Kurtosis
    kurtosis = np.mean((t_adj_x.dropna() - xbar)**4) / (std**4)
    fk = f1_transformation(kurtosis, 2.273, 11567)

    measures_list = measures + [fQ, fp, fs, fk]

    # build a one-row dataframe from the 13 computed features
    measures_df = pd.DataFrame(
        [measures_list],
        columns=[
            "frequency", "trend", "seasonal", "autocorrelation", "non-linear",
            "skewness", "kurtosis", "Hurst", "Lyapunov", "dc autocorrelation",
            "dc non-linear", "dc skewness", "dc kurtosis"
        ])

    return measures_df
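
A minimal usage sketch for calculate_measures(), assuming x is a pandas Series with a DatetimeIndex and that the module-level helpers referenced above (find_freq_r, decompose, na_contiguous, the f1/f2 transformations, rfracdiff, rtseries and acorr_ljungbox) are available:

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=120, freq='D')
x = pd.Series(np.sin(np.arange(120) / 7.0) + np.random.normal(0, 0.1, 120), index=idx)
measures_df = calculate_measures(x)  # one row of 13 time-series features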
コード例 #47
0
ファイル: rutils.py プロジェクト: tungk/traja
try:
    import rpy2.robjects.pandas2ri as rpandas
except ModuleNotFoundError as e:
    if "tzlocal" in e.msg:
        raise ModuleNotFoundError(
            e.msg + "\n Install tzlocal with `pip install tzlocal`."
        )
    else:
        raise ModuleNotFoundError(e)
import rpy2.robjects.packages as rpackages
from rpy2.robjects.packages import importr


__all__ = ["import_adehabitat", "import_trajr", "plot_ltraj", "to_trajr", "to_ltraj"]


rpandas.activate()

ADEHABITAT_INSTALLED = False
TRAJR_INSTALLED = False


def import_adehabitat(suppress_messages=True):
    global ADEHABITAT_INSTALLED
    if not ADEHABITAT_INSTALLED:
        utils = rpackages.importr("utils", suppress_messages=suppress_messages)
        print("Importing adehabitat")
        utils.chooseCRANmirror(ind=1)
        utils.install_packages("adehabitatLT")
        ADEHABITAT_INSTALLED = True
    adehabitat = importr("adehabitatLT", suppress_messages=suppress_messages)
    return adehabitat
コード例 #48
0
ファイル: __init__.py プロジェクト: kathrinjansen/cgat
def consensusClustering(infile,
                        cutHeight,
                        cluster_algorithm,
                        min_size=30,
                        deepsplit=False):
    '''
    Hierarchical clustering based on gene-cluster correlation across
    resampled datasets.  The tree is cut with dynamic tree cut.
    TODO: change this to cutHeight?  i.e. 0.2 = 80% clustering
    agreement OR use dynamic tree cut without deepsplit.
    '''
    condition = infile.split("/")[1].split("-")[0]
    wgcna_out = "tmp.dir/consensus-WGCNA.out"

    R('''sink(file='%(wgcna_out)s')''' % locals())
    R('''suppressMessages(library("WGCNA"))''')
    R('''suppressMessages(library("flashClust"))''')

    E.info("loading distance matrix")

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    labels = df.index.tolist()
    labels_r = ro.StrVector([l for l in labels])

    # py2ri requires activation
    pandas2ri.activate()
    df_r = pandas2ri.py2ri(df)

    R.assign("distance.frame", df_r)
    R.assign("labels", labels_r)

    # large matrices/distance objects may need more
    # memory - allow up to 10GB (memory.limit() takes MB)
    R('''memory.limit(10000)''')
    R('''rownames(distance.frame) <- labels''')
    R('''distance_data <- data.matrix(distance.frame)''')

    E.info("clustering data by %s linkage" % cluster_algorithm)

    R('''clustering <- flashClust(as.dist(1-distance_data),'''
      '''method='%(cluster_algorithm)s')''' % locals())

    if cutHeight > float(0.01):
        R('''cluster_cut <- cutreeStatic(dendro=clustering, '''
          '''minSize=%(min_size)i, cutHeight=%(cutHeight)s)''' % locals())

    elif deepsplit:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''deepSplit=T, minClusterSize=%(min_size)i)''' % locals())
    else:
        R('''cluster_cut <- cutreeDynamic(dendro=clustering, '''
          '''deepSplit=F, minClusterSize=%(min_size)i)''' % locals())

    R('''color_cut <- labels2colors(cluster_cut)''')
    R('''cluster_matched <- data.frame(cbind(rownames(distance_data),'''
      '''color_cut))''')
    R('''colnames(cluster_matched) = c("gene_id", "cluster")''')
    R('''cluster_matched <- data.frame(cluster_matched$gene_id,'''
      '''cluster_matched$cluster)''')

    # plot and save dendrogram of clustering
    # AH: disabled, requires plots.dir to exist which might not be the case
    # AH: and thus causes this method to fail. Path names need to be parameterizable.
    # R('''png("plots.dir/%(condition)s-dendrogram-consensus_clustering.png")'''
    #   % locals())
    # R('''plotDendroAndColors(dendro=clustering, colors=color_cut,'''
    #   '''groupLabels="Dynamic tree cut",'''
    #   '''dendroLabels=F, addGuide=T, guideHang=0.05, '''
    #   '''hang=0.03, main="%(condition)s")''' % locals())
    # R('''dev.off()''')
    # R('''sink(file=NULL)''')
    cluster_frame = pandas2ri.ri2py(R["cluster_matched"])

    return cluster_frame
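
A minimal usage sketch for consensusClustering(); the path is a placeholder, and note that the condition name is parsed from the second path component, up to the first '-':

cluster_frame = consensusClustering('tmp.dir/cond1-distances.tsv',
                                    cutHeight=0.2,
                                    cluster_algorithm='average',
                                    min_size=30)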
コード例 #49
0
    def fit_model(
            self,
            r_home: str = "",
            r_path: str = r"",
            *args,
            **kwargs
    ):
        """
        Fits Beta-Binomial model.

        Parameters
        ----------
        method
            method that is used to calculate p-values 
        r_home
            path to R installation on your machine, e.g. "C:/Program Files/R/R-4.0.3"
        r_path
            path to R executable on your machine, e.g. "C:/Program Files/R/R-4.0.3/bin/x64"
        args
            passed to `corncob`
        kwargs
            passed to `corncob`
        Returns
        -------
        """

        os.environ["R_HOME"] = r_home
        os.environ["PATH"] = r_path + ";" + os.environ["PATH"]

        K = self.y.shape[1]

        if self.y.shape[0] == 2:
            p_val = [0 for _ in range(K)]
            self.result = None
        else:

            import rpy2.robjects as rp
            from rpy2.robjects import numpy2ri, pandas2ri
            numpy2ri.activate()
            pandas2ri.activate()

            if self.y.shape[0] == 4:
                phi = 1
            else:
                phi = self.covariate_column
            
            p_val = rp.r(f"""
            library(corncob)
            library(phyloseq)
            
            
            #prepare phyloseq data format
            
            counts = {pandas2ri.py2rpy_pandasdataframe(pd.DataFrame(self.y, columns=self.var.index)).r_repr()}
            
            sample = {pandas2ri.py2rpy_pandasdataframe(pd.DataFrame(self.x, columns=[self.covariate_column])).r_repr()}
            
            cell_types = colnames(counts)
            
            OTU = otu_table(counts, taxa_are_rows = FALSE)
            
            #create phyloseq data object
            data = phyloseq(OTU, sample_data(sample))
            
            corncob_out = differentialTest(formula = ~ {self.covariate_column},
                                  phi.formula = ~ {phi},
                                  formula_null = ~ 1,
                                  phi.formula_null = ~ {phi},
                                  test = "LRT",
                                  boot = FALSE,
                                  data = data,
                                  fdr_cutoff = 0.05
                                  )
            
            # Test functions on a single cell type
            
            #    corncob = bbdml(formula = cell_type ~ 1,
            #                    phi.formula = ~ 1,
            #                    data = data)
            #    corncob_DA = bbdml(formula = cell_type ~ {self.covariate_column},
            #                    phi.formula = ~ {self.covariate_column},
            #                    data = data)
            #    p_vals[cell_type] = lrtest(mod_null = corncob, mod = corncob_DA)
            
             p_vals = corncob_out$p_fdr 
            
             p_vals
            """)

        self.p_val = p_val
コード例 #50
0
def main(arg_dir='output', respMatrix=None, arg_url=None):
    #mongoClient = arguments.url

    # Import R's utils package to install and import other R packages
    utils = rpackages.importr('utils')
    utils.chooseCRANmirror(ind=1)

    # List of R packages to install
    # The ltm package is used to compute the IRT parameters
    packnames = ('ltm',)

    # Check whether each package is already installed; if not, install it
    names_to_install = [x for x in packnames if not rpackages.isinstalled(x)]
    if len(names_to_install) > 0:
        print('Installing the R package ltm\n')
        utils.install_packages(StrVector(names_to_install))

    # Import the R package ltm
    ltm = rpackages.importr('ltm')
    pandas2ri.activate()

    if arg_dir != '':
        if not os.path.exists(arg_dir):
            os.makedirs(arg_dir)
        out = '/' + arg_dir
    else:
        out = ''

    # Collect all files containing the values for the IRT computation
    list_data_irt = []
    if respMatrix is None:
        # List all dataset directories in the output folder
        list_dir = os.listdir(os.getcwd() + out)
        for path in list_dir:
            #    if os.path.exists(os.getcwd()+out+'/'+path+'/'+path+'_irt.csv'):
            try:
                read = csv.reader(
                    open(
                        os.getcwd() + out + '/' + path + '/' + path +
                        '_irt.csv', "r"))
                list_data_irt.append(path + '_irt.csv')
            except IOError:
                print('No IRT input file was found for dataset ', path)
    else:
        list_data_irt.append(respMatrix)

    #file = ('heart-statlog_irt.csv')
    #data = robjects.r('PL3.rasch<-tpm(read.csv(file="heart-statlog_irt.csv"))')

    #print('\nStarting the IRT parameter calculation for datasets: ', list_dir)
    # Start the IRT computation for every dataset
    for f in range(len(list_data_irt)):

        print("Calculando os parametros do IRT para o dataset: ",
              list_data_irt[f])

        #Calcula os parametros do IRT com o pacote ltm do R
        if respMatrix == None:
            file = os.getcwd(
            ) + '/' + out + '/' + list_dir[f] + '/' + list_data_irt[f]
        else:
            file = formatMatrix(list_data_irt[f])
        file = file.replace('\\', '/')
        #try:
        data = robjects.r('tpm(read.csv(file="' + file +
                          '"),IRT.param = TRUE)')
        #except:
        #   data = robjects.r('tpm(read.csv(file="'+file+'"),control = list(optimizer = "nlminb"))')

        # Parse the parameter output
        par = (str(data).split('\n'))

        # Put the parameters into a dictionary
        parameter_dict = {}
        parameters = ['Discriminacao', 'Dificuldade', 'Adivinhacao']
        for i in range(len(par)):
            try:
                if par[i][0] == 'V':
                    pass
                else:
                    continue
            except:
                continue
            item = par[i].split()[0]
            tmp_dict = {}
            for p in range(3):
                tmp_dict[parameters[p]] = float(par[i].split()[3 - p])
            parameter_dict[item] = tmp_dict

            list_dis = []
            list_dif = []
            list_adv = []
            for i in parameter_dict:
                list_dis.append(parameter_dict[i]['Discriminacao'])
                list_dif.append(parameter_dict[i]['Dificuldade'])
                list_adv.append(parameter_dict[i]['Adivinhacao'])


#        normalized_dis = normalize(list_dis,-4,4)
#        normalized_dif = normalize(list_dif,-4,4)
#        c = 0
#        for i in parameter_dict:
#            parameter_dict[i]['Discriminacao'] = normalized_dis[c]
#            parameter_dict[i]['Dificuldade'] = normalized_dif[c]
#            c += 1

        dataframe = pd.DataFrame.from_dict(parameter_dict)
        dataframe = dataframe.reindex(index=parameters)
        #break
        # Save the IRT parameters in each dataset's folder
        if respMatrix is None:
            dataframe.transpose().to_csv(r'' + os.getcwd() + out + '/' +
                                         list_dir[f] + '/irt_item_param.csv')
        else:
            os.remove(file)
            dataframe.transpose().to_csv(r'' + os.getcwd() + out +
                                         '/irt_item_param.csv')
        # Insert the IRT data into MongoDB
        if arg_url is not None:
            try:
                insertMongo(parameter_dict, arg_url, list_dir[f])
                print('==> Data saved successfully :)\n')
            except Exception:
                print(
                    "Could not insert the data into MongoDB :/ \nCheck that the database url, username and password are correct\n"
                )
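
A minimal usage sketch for main(); with no response matrix supplied, it scans each dataset folder under the output directory for a <dataset>_irt.csv file:

main(arg_dir='output')  # writes irt_item_param.csv into each dataset's folder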
コード例 #51
0
def save_rds(data, filename):
    import collections.abc, re
    import pandas as pd
    import numpy as np
    import rpy2.robjects as RO
    import rpy2.rinterface as RI
    from rpy2.robjects import numpy2ri
    numpy2ri.activate()
    from rpy2.robjects import pandas2ri
    pandas2ri.activate()
    # Supported data types:
    # int, float, str, tuple, list, numpy array
    # numpy matrix and pandas dataframe
    int_type = (int, np.int8, np.int16, np.int32, np.int64)
    # np.float is gone in recent numpy; np.floating covers the numpy float scalars
    float_type = (float, np.floating)

    def assign(name, value):
        name = re.sub(r'[^\w' + '_.' + ']', '_', name)
        if isinstance(value, (tuple, list)):
            if all(isinstance(item, int_type) for item in value):
                value = np.asarray(value, dtype=int)
            elif all(isinstance(item, float_type) for item in value):
                value = np.asarray(value, dtype=float)
            else:
                value = np.asarray(value)
        if isinstance(value, np.matrix):
            value = np.asarray(value)
        if isinstance(
                value,
                tuple(flatten_list((str, float_type, int_type, np.ndarray)))):
            if isinstance(value, np.ndarray) and value.dtype.kind == "u":
                value = value.astype(int)
            RO.r.assign(name, value)
        elif isinstance(value, pd.DataFrame):
            # FIXME: does not always work well for pd.DataFrame
            RO.r.assign(name, value)
        elif value is None:
            RO.r.assign(name, RI.NULL)
        else:
            raise ValueError(
                "Saving ``{}`` to RDS file is not supported!".format(
                    str(type(value))))

    #
    def assign_dict(name, value):
        RO.r('%s <- list()' % name)
        for k, v in value.items():
            k = re.sub(r'[^\w' + '_.' + ']', '_', str(k))
            if k.isdigit():
                k = str(k)
            if isinstance(v, collections.abc.Mapping):
                assign_dict('%s$%s' % (name, k), v)
            else:
                assign('item', v)
                RO.r('%s$%s <- item' % (name, k))

    #
    if isinstance(data, collections.abc.Mapping):
        assign_dict('res', data)
    else:
        assign('res', data)
    RO.r("saveRDS(res, '%s')" % filename)
コード例 #52
0
ファイル: RnaseqqcReport.py プロジェクト: wbyu/CGATPipelines
class SampleHeatmap(RnaseqqcTracker):

    table = "sailfish_transcripts"
    py2ri.activate()

    def getTracks(self, subset=None):
        return ("all")

    def getCorrelations(self, dataframe):
        '''
        Perform hierarchical clustering on a
        dataframe of expression values

        Arguments
        ---------
        dataframe: pandas.Core.DataFrame
          a dataframe containing gene IDs, sample IDs
          and gene expression values

        Returns
        -------
        corr_frame: pandas.Core.DataFrame
          a dataframe of a pair-wise correlation matrix
          across samples.  Uses the Pearson correlation.
        '''

        # set sample_id to index
        pivot = dataframe.pivot(index="sample_name",
                                columns="transcript_id",
                                values="TPM")
        transpose = pivot.T
        # why do I have to resort to R????
        r_df = py2ri.py2ri_pandasdataframe(transpose)
        R.assign("p.df", r_df)
        R('''p.mat <- apply(p.df, 2, as.numeric)''')
        R('''cor.df <- cor(p.mat)''')
        r_cor = R["cor.df"]
        py_cor = py2ri.ri2py_dataframe(r_cor)
        corr_frame = py_cor

        return corr_frame

    def getFactors(self, dataframe):
        '''Get factor/experimental design levels from table
        '''

        statement = ("SELECT factor_value, sample_name, factor "
                     "FROM factors AS f "
                     "JOIN samples AS s "
                     "ON f.sample_id = s.id "
                     "WHERE factor != 'genome'" % locals())

        factor_df = self.getDataFrame(statement)
        merged = pd.merge(dataframe,
                          factor_df,
                          left_index=True,
                          right_on="sample_name",
                          how='outer')
        return merged

    def __call__(self, track, slice=None):

        statement = ("SELECT s.sample_name, t.transcript_id, t.TPM "
                     "FROM %(table)s AS t, samples AS s "
                     "WHERE transcript_id != 'Transcript' "
                     "AND t.sample_id = s.id")
        df = self.getDataFrame(statement)
        mdf = self.getCorrelations(df)

        mdf.columns = set(df["sample_name"])
        mdf.index = set(df["sample_name"])

        all_df = self.getFactors(mdf)
        return all_df.set_index("factor")
コード例 #53
0
def nnd_hotdeck_using_rpy2(receiver=None,
                           donor=None,
                           matching_variables=None,
                           z_variables=None,
                           donor_classes=None):
    import pandas as pd
    from rpy2.robjects.packages import importr
    from rpy2.robjects import pandas2ri

    assert receiver is not None and donor is not None
    assert matching_variables is not None

    pandas2ri.activate()
    StatMatch = importr("StatMatch")

    if isinstance(donor_classes, str):
        assert donor_classes in receiver, 'Donor class not present in receiver'
        assert donor_classes in donor, 'Donor class not present in donor'

    try:
        if donor_classes:
            out_NND = StatMatch.NND_hotdeck(
                data_rec=receiver,
                data_don=donor,
                match_vars=pd.Series(matching_variables),
                don_class=pd.Series(donor_classes))
        else:
            out_NND = StatMatch.NND_hotdeck(
                data_rec=receiver,
                data_don=donor,
                match_vars=pd.Series(matching_variables),
                # don_class = pd.Series(donor_classes)
            )
    except Exception as e:
        print(1)
        print(receiver)
        print(2)
        print(donor)
        print(3)
        print(pd.Series(matching_variables))
        print(e)
        raise  # out_NND would be undefined below, so re-raise

    # create synthetic data.set, without the
    # duplication of the matching variables

    fused_0 = pandas2ri.ri2py(
        StatMatch.create_fused(data_rec=receiver,
                               data_don=donor,
                               mtc_ids=out_NND[0],
                               z_vars=pd.Series(z_variables)))

    # create synthetic data.set, with the "duplication"
    # of the matching variables

    fused_1 = pandas2ri.ri2py(
        StatMatch.create_fused(data_rec=receiver,
                               data_don=donor,
                               mtc_ids=out_NND[0],
                               z_vars=pd.Series(z_variables),
                               dup_x=True,
                               match_vars=pd.Series(matching_variables)))

    return fused_0, fused_1
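
A minimal usage sketch for nnd_hotdeck_using_rpy2() with hypothetical file and column names; receiver and donor share the matching variables, while the z_variables exist only in the donor:

import pandas as pd

receiver_df = pd.read_csv('receiver.csv')  # hypothetical: has 'age', 'sex'
donor_df = pd.read_csv('donor.csv')        # hypothetical: has 'age', 'sex', 'income'
fused_plain, fused_dup = nnd_hotdeck_using_rpy2(
    receiver=receiver_df, donor=donor_df,
    matching_variables=['age', 'sex'], z_variables=['income'])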
コード例 #54
0
def granger_causality(data,
                      cols,
                      y_var,
                      lags,
                      our_type,
                      list_subcausalities=False):
    y_subset = data[y_var]
    pandas2ri.activate()
    data = pandas2ri.py2ri(data)

    ### We define the functions;
    robjects.r('''
      is.installed <- function(mypkg){
        is.element(mypkg, installed.packages()[,1])
      } 

      # check if package "gtools" is installed
      if (!is.installed("gtools")){
        install.packages("gtools", INSTALL_opts = '--no-lock', repos='https://cloud.r-project.org')
      }
      if (!is.installed("vars")){
        install.packages("vars", INSTALL_opts = '--no-lock', repos='https://cloud.r-project.org')
      }      
         library("gtools")
         library("vars")
         for (k in .libPaths()){
           k <- paste0(k,"/00LOCK")
           unlink(k, recursive = TRUE)
         }
         get_p_value <- function(data,lags,y_values,causes,our_type){
         data <- as.data.frame(data)
         mycols <- c(as.character(unlist(causes)))
         mydata <- data[c(as.character(unlist(causes)))]
         mydata <- as.data.frame(mydata)
         mydata <- cbind(Temperatures = y_values,mydata)
         var.2c <- VAR(mydata, p = lags, type = our_type) ### In this case, we are using trended Granger causality
         my_vcov <- vcovHC(var.2c)
         mycause <- causality(var.2c, cause = mycols)
         return(c(mycause$Granger$p.value))
    }

    permuts <- function(data,order,y,columns,our_type){
      list_perms <- do.call("c", lapply(seq_along(columns), function(i) combn(columns, i, FUN = list)))
      d <- data.frame(x = NA, y = 1:length(list_perms))
      i <- 1
      columns <- unlist(columns)
      while (i<=length(list_perms)){
        myp <- get_p_value(data,order,y,list_perms[i][[1]],our_type = our_type)
        d[i,] <- c(toString(unlist(list_perms[i][[1]])),as.numeric(myp))
        i <- i + 1
      }
      colnames(d) <- c("Sets of variables","p-value")
      d$`p-value` <- as.numeric(d$`p-value`)
      return(d)
      #return(.libPaths())
      #return(unlist(list_perms[i-1][[1]]))
        }
            ''')

    r_f = robjects.globalenv['get_p_value']
    permuts = robjects.globalenv['permuts']
    robjects.r.library("vars")
    our_causes = robjects.r('as.data.frame')(cols)
    if list_subcausalities:
        mydf = permuts(data, lags, robjects.Vector(y_subset), our_causes,
                       our_type)
        return (mydf)

    return (r_f(data, lags, robjects.Vector(y_subset), our_causes, our_type))
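
A minimal usage sketch for granger_causality() with hypothetical column names; our_type is passed through to vars::VAR (e.g. 'const', 'trend' or 'both'):

import pandas as pd

df = pd.read_csv('climate.csv')  # hypothetical columns: temperature, rainfall, co2
p_value = granger_causality(df, cols=['rainfall', 'co2'], y_var='temperature',
                            lags=2, our_type='trend')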
コード例 #55
0
ファイル: __init__.py プロジェクト: kathrinjansen/cgat
def deseqNormalize(infile,
                   time_points,
                   reps,
                   conditions=None):
    '''
    Library size normalisation and variance stabilizing transformation of
    timeseries RNA-seq data

    :param infile: count table from NGS-seq experiment
    :type infile: str
    :param time_points: time point labels
    :type time_points: str list
    :param reps: replicates labels
    :type reps: str list
    :param conditions: if  multiple experimental conditions
    are to be normalised at the same time
    :type conditions: str list
    '''
    # MM: NB - this should be split into separate library size
    # normalisation and VST transformations
    # maybe add in different transformation options.

    pandas2ri.activate()

    # load library
    R('''suppressMessages(library("DESeq"))''')

    # generates a lists for the design data frame
    # of the proper length
    # these need to be rpy2 objects to be parsed
    # properly in the string formatting

    E.info("converting to pandas dataframe object")

    if infile.split(".")[-1] == "gz":
        comp = "gzip"
    else:
        comp = None

    data_frame = pd.read_table(infile,
                               index_col=0,
                               header=0,
                               sep="\t",
                               compression=comp)
    # py2ri requires activation
    pandas2ri.activate()
    rdf = pandas2ri.py2ri(data_frame)

    if not conditions:
        time_rep_comb = [x for x in itertools.product(time_points, reps)]
        time_cond = ro.StrVector([x[0] for x in time_rep_comb])
        rep_cond = ro.StrVector([x[1] for x in time_rep_comb])

        R.assign('countsTable', rdf)
        R('''design <- data.frame(row.names=colnames(countsTable),'''
          '''times=%s, replicates=%s)''' % (time_cond.r_repr(),
                                            rep_cond.r_repr()))
    elif conditions:
        design_dict = {}
        for x in data_frame.columns.values:
            sample_dict = {}
            sample_dict['condition'] = str(x).split(".")[0]
            sample_dict['times'] = int(str(x).split(".")[1])
            sample_dict['replicates'] = str(x).split(".")[2]
            design_dict[x] = sample_dict
            design_frame = pd.DataFrame(design_dict)
            design_frame = design_frame.T

        des_cond = design_frame['condition'].values.tolist()
        des_time = design_frame['times'].values.tolist()
        des_reps = design_frame['replicates'].values.tolist()

        cond_cond = ro.StrVector([x for x in des_cond])
        time_cond = ro.StrVector([x for x in des_time])
        rep_cond = ro.StrVector([x for x in des_reps])

        R.assign('countsTable', rdf)
        R.assign('design', design_frame)

    # create the count data set and normalize to library size
    # transform with variance stabilizing transformation
    # only select genes with an average of ten reads mapping

    E.info("calculating size factors and dispersion")
    R('''notZero <- (rowMeans(countsTable) > 1)''')
    R('''cds <- newCountDataSet(countsTable[notZero, ], design)''')
    R('''cds_size <- estimateSizeFactors(cds)''')
    R('''cds_disp <- estimateDispersions(cds_size, method="blind")''')

    E.info("applying variance stabilizing transformation")

    R('''vst <- varianceStabilizingTransformation(cds_disp)''')

    # format data set to long format with condition and replicate labels
    # convert to a numpy array

    R('''replicates <- c(%s)''' % rep_cond.r_repr())
    R('''times <- c(%s)''' % time_cond.r_repr())
    if conditions:
        R('''conditions <- c(%s)''' % cond_cond.r_repr())
        R('''trans_vst = data.frame(t(exprs(vst)), '''
          '''times, replicates, conditions)''')
    else:
        R('''trans_vst = data.frame(t(exprs(vst)), times, replicates)''')

    # load data and convert to pandas object
    data_file = pandas2ri.ri2py(R["trans_vst"])
    
    return data_file
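
A minimal usage sketch for deseqNormalize(); the path is a placeholder, and with conditions=None the count-table columns are assumed to follow the product of time_points and reps:

norm_df = deseqNormalize('counts.tsv.gz',
                         time_points=['0', '12', '24'],
                         reps=['R1', 'R2'])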
コード例 #56
0
def call_fitter(
    site_inputs_training,
    y_training,
    site_inputs_validation,
    hprm,
):
    assert y_training.ndim == 1
    path_R_files = os.path.join(
        paths.outputs,
        'R_files/',
    )
    os.makedirs(
        path_R_files,
        exist_ok=True,
    )

    ### Data
    data_training = {
        **{
            simplify_inpt_name(inpt, trsfm, prm, location): site_inputs_training[inpt, trsfm, prm, location].values
            for inpt, trsfm, prm, location in site_inputs_training
        },
        'target': y_training.values,
    }

    data_validation = {
        simplify_inpt_name(inpt, trsfm, prm, location):
        site_inputs_validation[inpt, trsfm, prm, location].values
        for inpt, trsfm, prm, location in site_inputs_validation
    }

    # Convert arrays
    pandas2ri.activate()
    df_train = pandas2ri.py2rpy(pd.DataFrame.from_dict(data_training))
    df_test = pandas2ri.py2rpy(pd.DataFrame.from_dict(data_validation))
    pandas2ri.deactivate()

    # Save converted files
    r.assign("data_train", df_train)
    r("save(data_train, file='{0}/temp_dat_for_r_train.gzip', compress=TRUE)".
      format(path_R_files))
    r.assign("data_test", df_test)
    r("save(data_test,  file='{0}/temp_dat_for_r_test.gzip',  compress=TRUE)".
      format(path_R_files))

    nb_unique = {k: len(np.unique(v)) for k, v in site_inputs_training.items()}

    string_formula = make_gam_formula(
        site_inputs_training.columns,
        nb_unique,
        hprm,
    )

    ### Launch the R script
    path2script = os.path.join(
        os.path.dirname(__file__),
        'load_fit_predict_savePredictions.R',
    )
    args = [string_formula, path_R_files]
    cmd = ['Rscript', path2script] + args
    # Python will quote what must be quoted in subprocess.check_output

    print('launch Rscript')
    x = subprocess.check_output(cmd, universal_newlines=True)
    print(x)

    y_hat_training = r['read.table'](
        "{0}/predictions_from_r_train.gzip".format(path_R_files))
    y_hat_training = pandas2ri.rpy2py(y_hat_training)
    y_hat_training = y_hat_training.values

    y_hat_validation = r['read.table'](
        "{0}/predictions_from_r_test.gzip".format(path_R_files))
    y_hat_validation = pandas2ri.rpy2py(y_hat_validation)
    y_hat_validation = y_hat_validation.values

    return y_hat_training, y_hat_validation
コード例 #57
0
ファイル: __init__.py プロジェクト: kathrinjansen/cgat
def covarFilter(infile,
                time_points,
                replicates,
                quantile):
    '''
    Filter gene list based on the distribution of the
    sums of the covariance of each gene.  This is highly
    recommended to reduce the total number of genes used
    in the dynamic time warping clustering to reduce the
    computational time.  The threshold is placed at the
    intersection of the expected and observed value
    for the given quantile.
    '''

    time_points.sort()
    time_rep_comb = [x for x in itertools.product(time_points, replicates)]
    time_cond = ro.StrVector([x[0] for x in time_rep_comb])
    rep_cond = ro.StrVector([x[1] for x in time_rep_comb])
    df = pd.read_table(infile, sep="\t", header=0, index_col=0)

    df.drop(['replicates'], inplace=True, axis=1)
    df.drop(['times'], inplace=True, axis=1)
    df = df.fillna(0.0)

    # convert data frame and import into R namespace
    # py2ri requires activation
    pandas2ri.activate()
    R.assign('diff_data', pandas2ri.py2ri(df))

    E.info("loading data frame")

    # need to be careful about column headers and transposing data frames

    R('''trans_data <- data.frame(diff_data)''')
    R('''times <- c(%s)''' % time_cond.r_repr())
    R('''replicates <- c(%s)''' % rep_cond.r_repr())

    # calculate the covariance matrix for all genes
    # sum each gene's covariance vector

    E.info("calculating sum of covariance of expression")

    R('''covar.mat <- abs(cov(trans_data))''')
    R('''sum.covar <- rowSums(covar.mat)''')
    R('''exp.covar <- abs(qnorm(ppoints(sum.covar),'''
      '''mean=mean(sum.covar), sd=sd(sum.covar)))''')
    R('''sum.covar.quant <- quantile(sum.covar)''')
    R('''exp.covar.quant <- quantile(exp.covar)''')

    E.info("filter on quantile")

    R('''filtered_genes <- names(sum.covar[sum.covar > '''
      '''sum.covar.quant[%(quantile)i]'''
      ''' & sum.covar > exp.covar.quant[%(quantile)i]])''' % locals())
    R('''filtered_frame <- data.frame(diff_data[, filtered_genes],'''
      '''times, replicates)''')

    # load data and convert to pandas object
    filtered_frame = pandas2ri.ri2py(R["filtered_frame"]).T

    return filtered_frame
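
A minimal usage sketch for covarFilter(); the input table is assumed to carry 'times' and 'replicates' columns (dropped before the covariance step), and quantile indexes R's five-number quantile() output, so 3 is the median:

filtered_frame = covarFilter('expression.tsv',
                             time_points=['0', '12', '24'],
                             replicates=['R1', 'R2'],
                             quantile=3)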
コード例 #58
0
ファイル: recode_data.py プロジェクト: vcbradley/bdr
import numpy as np
import pandas as pd
from rpy2.robjects import r, pandas2ri
import datetime as dt
from sklearn import preprocessing as prep

# set directory
import os
os.chdir('/Users/valeriebradley/github/libdems/projection/')
pandas2ri.activate()  # to translate R obj into pandas df


def recode_covar_data():

    data_path = '/Users/valeriebradley/Documents/LibDems/data/'

    r['load'](data_path + "model_sample_data_cleaned.RData")
    dr_covars = r.dr_covars

    dr_covars['VANID'] = dr_covars['VANID'].astype(int)
    dr_covars.set_index('VANID', inplace=True)

    # calculate reg timing
    eday = '2017-06-08'
    dr_covars['year_reg'] = [
        dt.datetime.strptime(date, '%Y-%m-%d').date().year
        for date in dr_covars['DATE_OF_UPDATE'].values
    ]
    dr_covars['wks_from_reg_to_eday'] = [
        (dt.datetime.strptime(eday, '%Y-%m-%d') -
         dt.datetime.strptime(date, '%Y-%m-%d')) / dt.timedelta(days=1)
        for date in dr_covars['DATE_OF_UPDATE'].values
    ]
コード例 #59
0
import os
import sys
import itertools
import gzip
import numpy as np
from Bio import SeqIO
import argparse
import subprocess
import pandas as pd
from collections import OrderedDict
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri, Formula, FloatVector
from rpy2.rinterface import RRuntimeError
import math

pandas2ri.activate()  # allow conversion between pandas dataframes and R dataframes

#define R packages
nlme = importr('nlme')
base = importr('base')
stats = importr('stats')
#qv = importr('qvalue')

#define formulae
fmla = Formula('value ~ 1 + cond1')
rndm = Formula('~ 1 | samples')
nullfmla = Formula('value ~ 1')
nullrndm = Formula('~1 | samples')

samps = [
    'samp1A', 'samp1B', 'samp1C', 'samp1D', 'samp2A', 'samp2B', 'samp2C',
    'samp2D'
]
コード例 #60
0
    def _infer_network(self, data):
        """
        Infer the network.

        Args:
            data (pd.DataFrame): data to be used for the inference.
        """
        # quantization step with optimal k based on BIC
        quantized = data.apply(lambda column: k_means_vector_quantization(
            column.values.reshape(-1, 1),
            k_min=self.k_min,
            k_max=self.k_max,
            k_step=self.k_step,
            **self.parameters),
                               axis=0)
        entities = data.columns
        number_of_entities = len(entities)
        # activate implicit conversion from pandas to R objects
        pandas2ri.activate()
        fun_chisq = importr('FunChisq')
        # preparing variables to pass to FunChisq
        independent_variables = None
        dependent_variables = np.array([], dtype=int)
        ne_range = range(1, number_of_entities + 1)
        for index, entity in enumerate(entities):
            r_index = index + 1
            dependent_variables = np.hstack([
                dependent_variables,
                np.array(
                    list(filter(lambda e_index: e_index != r_index, ne_range)))
            ])
            if independent_variables is None:
                independent_variables = np.vstack(
                    [np.array(r_index) for _ in range(number_of_entities - 1)])
            else:
                independent_variables = np.vstack([
                    independent_variables,
                    np.vstack([
                        np.array(r_index)
                        for _ in range(number_of_entities - 1)
                    ])
                ])
        # running FunChisq
        interactions = ro.conversion.rpy2py(
            fun_chisq.test_interactions(quantized.T.values,
                                        list(independent_variables),
                                        pd.Series(dependent_variables),
                                        entities.values))
        # test correction
        if self.correction in CORRECTIONS:
            significants = CORRECTIONS[self.correction](
                interactions['p.value'], self.confidence_threshold)
            interactions = interactions.iloc[significants]
        interactions.columns = [
            'gene1', 'gene2', 'p-value', 'statistic', 'estimate'
        ]
        # if undirected keep only interaction with higher importance if
        # both directions are significant
        if self.undirected is True:
            interactions = interactions.apply(
                lambda row: pd.Series(sort_interaction_entities(row)), axis=1)
            interactions.columns = [
                'gene1', 'gene2', 'p-value', 'statistic', 'estimate'
            ]
            interactions['grouping'] = [
                '{}_{}'.format(*sorted(pair))
                for pair in zip(interactions['gene1'], interactions['gene2'])
            ]
            selected_interactions = interactions.groupby([
                'grouping'
            ])['p-value'].transform(min) == interactions['p-value']
            interactions = interactions[selected_interactions]
        # prepare the interactions
        interactions = interactions[['gene1', 'gene2', 'statistic']]
        interactions.columns = ['e1', 'e2', 'intensity']
        self.graph = InteractionTable(df=interactions).to_graph(
            undirected=self.undirected)
        logger.debug('inferred with {}'.format(self.method))