def loadDataSet(self, rDataFilename):
    """Load the ``train`` and ``test`` objects from an R data file.

    The file is loaded through R's ``base`` package; the objects named
    ``train`` and ``test`` are fetched with ``mget`` and converted to pandas
    DataFrames, stored on ``self.trainSet`` and ``self.testSet``.

    Column names of the training frame whose first character is ``'i'`` are
    recorded in ``self.dataColumns``; those whose first character is ``'c'``
    are recorded in ``self.classColumns``.

    :param rDataFilename: path handed to R's ``load``.
    """
    pandas2ri.activate()
    r_base = importr('base')
    r_base.load(rDataFilename)

    def fetch(r_name):
        # The requested object is the first element of the mget result.
        return pandas2ri.ri2py_dataframe(r_base.mget(r_name)[0])

    self.trainSet = fetch('train')
    self.testSet = fetch('test')

    self.dataColumns = [
        name for name in list(self.trainSet.columns) if name[0] == 'i'
    ]
    self.classColumns = [
        name for name in list(self.trainSet.columns) if name[0] == 'c'
    ]
Beispiel #2
0
def get_protein_traces_by_id(protein_ids, id_type):
    """Fetch traces, labels and monomer information for the given proteins.

    Runs ``cached_run_secexploerer`` and unpacks its nested R result.

    :param protein_ids: protein identifiers, forwarded unchanged.
    :param id_type: identifier type, forwarded unchanged.
    :return: tuple ``(traces, labels, calibration_parameters, monomer_secs,
        monomer_intensities)`` where

        * ``traces`` is a DataFrame indexed by ``protein_id``,
        * ``labels`` holds one label per trace (the mapped name when the
          mapping table provides one, otherwise the id itself),
        * ``calibration_parameters`` is ``result[1][2]`` as returned by R,
        * ``monomer_secs`` maps subunit id -> monomer SEC value (string),
        * ``monomer_intensities`` maps subunit id -> ``traces.loc[id, sec]``.

        When there is no usable result the tuple
        ``(pd.DataFrame(), [], [0, 0], {}, {})`` is returned.
    """
    result = cached_run_secexploerer(protein_ids, id_type)
    if result is None or result[1] == NULL or result[1][1] == NULL:
        return pd.DataFrame(), [], [0, 0], {}, {}

    traces = pandas2ri.ri2py_dataframe(result[1][0][0])
    traces = traces.set_index(["id"])
    traces.index.name = "protein_id"

    # The mapping is only trusted when the table has exactly three columns:
    # column 0 is used as the key and column 2 as the display name.
    mapping_table = pandas2ri.ri2py_dataframe(result[0][3])
    if len(mapping_table.columns) == 3:
        mapping = dict(zip(mapping_table.iloc[:, 0], mapping_table.iloc[:, 2]))
    else:
        mapping = {}

    # The previous code first built "<name> (<id>)" and then immediately
    # overwrote it with the bare mapped name; only the effective value is kept.
    labels = []
    for uniprot_id in traces.index:
        extra_label = mapping.get(uniprot_id)
        labels.append(uniprot_id if extra_label is None else extra_label)

    features = pandas2ri.ri2py_dataframe(result[1][1])

    # Both columns hold ";"-separated lists that are walked in lockstep.
    monomer_secs = {}
    monomer_intensities = {}
    for subunits, monomer_sec in zip(features.subunits_detected,
                                     features.monomer_sec):
        for su, sec in zip(subunits.split(";"), monomer_sec.split(";")):
            monomer_secs[su] = sec
            monomer_intensities[su] = traces.loc[su, sec]

    # NOTE: the earlier in-place relabelling of features["subunits_detected"]
    # was removed; ``features`` is local and was never used or returned
    # afterwards, so the work had no observable effect.

    calibration_parameters = result[1][2]

    return traces, labels, calibration_parameters, monomer_secs, monomer_intensities
Beispiel #3
0
    def getCorrelations(self, dataframe):
        """
        Compute a pair-wise correlation matrix across samples,
        delegating the calculation to R's ``cor``.

        Arguments
        ---------
        dataframe: pandas.Core.DataFrame
          a long-format dataframe with the columns
          ``sample_name``, ``transcript_id`` and ``TPM``

        Returns
        -------
        corr_frame: pandas.Core.DataFrame
          the output of R's ``cor`` on the numeric matrix,
          converted back to pandas.  ``cor`` is called with
          its default method.
        """

        # pivot to samples x transcripts, then flip so samples are columns
        by_transcript = dataframe.pivot(
            index="sample_name", columns="transcript_id", values="TPM").T

        # push the table into R, coerce every column to numeric, correlate
        R.assign("p.df", py2ri.py2ri_pandasdataframe(by_transcript))
        R("""p.mat <- apply(p.df, 2, as.numeric)""")
        R("""cor.df <- cor(p.mat)""")

        # pull the R result back as a pandas frame
        return py2ri.ri2py_dataframe(R["cor.df"])
Beispiel #4
0
def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2):
    """
    Invoke the R ``IRISMustangMetrics::transferFunctionMetric`` metric and convert
    the R dataframe result into a Pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param evalresp1: pandas DataFrame of evalresp FAP for r_stream1
    :param evalresp2: pandas DataFrame of evalresp FAP for r_stream2
    :return: pandas DataFrame whose ``starttime`` and ``endtime`` columns have been
        converted with ``UTCDateTime``
    """
    R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric')

    # NOTE:  Conversion of dataframes only works if you activate but we don't want conversion
    # NOTE:  to always be automatic so we deactivate() after we're done converting.
    # NOTE:  try/finally guarantees the deactivate() even when a conversion raises.
    pandas2ri.activate()
    try:
        r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1)
        r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2)
    finally:
        pandas2ri.deactivate()

    # TODO:  Can we just activate/deactivate before/after R_function() without converting
    # TODO:  r_evalresp1/2 ahead of time?

    # Calculate the metric
    r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2)
    r_dataframe = _R_metricList2DF(r_metriclist)
    pandas2ri.activate()
    try:
        df = pandas2ri.ri2py_dataframe(r_dataframe)
    finally:
        pandas2ri.deactivate()

    # Convert columns from R POSIXct to python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
Beispiel #5
0
def main(args):
    """Run TitanCNA's ``correctReadDepth`` and write the result as TSV.

    Reads ``tumour_wig_file``, ``normal_wig_file``, ``gc_wig_file`` and
    ``mappability_wig_file`` from ``args``.  When ``args.target_bed_file`` is
    set, the headerless tab-separated file is loaded and passed to R as
    ``targetedSequence``.  The converted result is written to
    ``args.out_file`` (tab-separated, no index).
    """
    # HMMcopy is imported only for its side effect of loading the package.
    importr('HMMcopy')
    titan = importr('TitanCNA')

    optional = {}
    if args.target_bed_file is not None:
        targets = pd.read_csv(args.target_bed_file, header=None, sep='\t')
        optional['targetedSequence'] = pandas2ri.py2ri(targets)

    corrected = titan.correctReadDepth(
        args.tumour_wig_file,
        args.normal_wig_file,
        args.gc_wig_file,
        args.mappability_wig_file,
        **optional)

    frame = pandas2ri.ri2py_dataframe(corrected)
    frame.to_csv(args.out_file, index=False, sep='\t')
Beispiel #6
0
    def getCorrelations(self, dataframe):
        '''
        Compute a pair-wise correlation matrix across samples,
        delegating the calculation to R's ``cor``.

        Arguments
        ---------
        dataframe: pandas.Core.DataFrame
          a long-format dataframe with the columns
          ``sample_name``, ``transcript_id`` and ``TPM``

        Returns
        -------
        corr_frame: pandas.Core.DataFrame
          the output of R's ``cor`` on the numeric matrix,
          converted back to pandas.  ``cor`` is called with
          its default method.
        '''

        # samples x transcripts first, then transposed so samples are columns
        wide = dataframe.pivot(index="sample_name",
                               columns="transcript_id",
                               values="TPM")
        r_table = py2ri.py2ri_pandasdataframe(wide.T)

        # register the table in R, coerce columns to numeric and correlate
        R.assign("p.df", r_table)
        R('''p.mat <- apply(p.df, 2, as.numeric)''')
        R('''cor.df <- cor(p.mat)''')

        # convert the R correlation matrix back to pandas
        return py2ri.ri2py_dataframe(R["cor.df"])
Beispiel #7
0
def extract_scholar_publications(persons):
    """Extract and return publication and citation information.

    :param persons: mapping of display name -> scholar id.
    :return: dict mapping scholar id -> pandas DataFrame of publications.
        Ids whose R result could not be converted are reported on stdout
        and left out of the dict.
    """

    # Import the scholar package
    scholar = importr("scholar")

    # Extract scholar publication information for each person, store
    # as standard Python dictionary
    publications = {}
    for name, scholar_id in persons.items():
        print("Extracting publication information for %s" % name)

        # Get basic profile info
        pubs = scholar.get_publications(scholar_id)

        # Convert to pandas dataframe.  Only ordinary errors are treated as
        # "no data"; KeyboardInterrupt/SystemExit propagate instead of being
        # swallowed by a bare ``except``.
        try:
            df = pandas2ri.ri2py_dataframe(pubs)
        except Exception:
            print("Extraction failed for %s. Ignoring data." % name)
        else:
            publications[scholar_id] = df
            print("Success")

    return publications
Beispiel #8
0
def loadAffyCelsNorm(F, d):
    """Load CEL files through ``R_loadAffyCelsFiles`` into a DataFrame.

    :param F: sequence of file names; passed to R as a string vector and
        also stored in the ``cel`` column of the result.
    :param d: second argument forwarded unchanged to ``R_loadAffyCelsFiles``.
    :return: pandas DataFrame converted from the R result, with a ``cel``
        column added.
    """
    cel_files = robjects.vectors.StrVector(F)
    frame = pandas2ri.ri2py_dataframe(R_loadAffyCelsFiles(cel_files, d))
    frame['cel'] = F
    return frame
def get_ds_w(path_w):
    """Load the R object ``teste`` from ``path_w`` and return it as an array.

    Every element of ``teste`` except the first is converted to a DataFrame
    whose first column becomes a timezone-naive ``time`` index (duplicate
    timestamps keep their first row).  The frames are concatenated along a
    ``sensor`` dimension labelled with the corresponding R names, and the
    data variables are stacked along a ``depth`` dimension.

    :param path_w: path handed to R's ``load``.
    :return: the stacked xarray object.
    """
    robjects.r['load'](path_w)
    r_list = robjects.r['teste']

    frames = []
    # Element 0 is deliberately skipped (its name is skipped below as well).
    for position in range(1, len(r_list)):
        frame = pandas2ri.ri2py_dataframe(r_list[position])

        timestamps = pd.DatetimeIndex(frame.iloc[:, 0]).tz_localize(None)
        frame.index = timestamps
        frame.index.name = 'time'

        frame = frame.iloc[:, 1:]
        frame = frame.loc[~frame.index.duplicated(keep='first')]
        frames.append(frame)

    ds_w = xr.concat([frame.to_xarray() for frame in frames], dim='sensor')
    ds_w.time.values = pd.DatetimeIndex(ds_w.time.values)
    ds_w = ds_w.assign_coords(sensor=list(r_list.names[1:]))
    return ds_w.to_array(dim='depth')
Beispiel #10
0
def zscrp(csvs):
    """Add a ``zpos`` column to each CSV file, rewriting the file in place.

    For every path in ``csvs`` the file is read with pandas, passed to the R
    function ``calc.zpos`` defined below, and the converted result is written
    back to the same path without the index.

    :param csvs: iterable of CSV file paths; each file is overwritten.
    """

    for a in csvs:
        a0 = pd.read_csv(a)

        # Define calc.zpos in the R global environment (redefined on every
        # iteration).  Per (rep, plt_nm) group it takes mean and sd of `area`
        # over rows with condt == "+ctrl" and derives
        # zpos = (area - ctrlmean) / ctrlstdev; the helper columns are dropped.
        # NOTE(review): uses %>% / dplyr verbs, so the relevant R packages are
        # presumably attached elsewhere - confirm.
        robjects.r('''
         calc.zpos <- function(x) {
           ctrl.avgs <- x %>%
             group_by(rep, plt_nm) %>%
             filter(condt == "+ctrl") %>%
             dplyr::summarise(ctrlmean = mean(area),
                              ctrlstdev = sd(area))
           x.zpos <- left_join(x, ctrl.avgs, by = c("rep", "plt_nm")) %>%
             mutate(zpos = (area - ctrlmean)/ctrlstdev) %>%
             select(-c(ctrlmean, ctrlstdev))

           return(x.zpos)
         }
         ''')

        r_f = robjects.globalenv['calc.zpos']
        # NOTE(review): a pandas DataFrame is passed straight to R here, which
        # presumably relies on pandas2ri conversion being active - confirm.
        res = r_f(a0)
        # NOTE(review): `r` is not defined in this function and the return
        # value is discarded; the purpose of this call is unclear - confirm.
        r.data('res')

        pd_df = pandas2ri.ri2py_dataframe(res)
        #        out = a.split('.csv')

        # Overwrite the input file with the augmented table.
        pd_df.to_csv(a, index=False)
Beispiel #11
0
def prep_exp(include_lab, include_ethdon, lag, eq_train_ratio, num_folds,
             train_thresh_year, cutoff, file_dir):
    """Build the train/test data and optional folds for an experiment.

    Reads two RDS tables from ``file_dir``, merges them with the R function
    ``combine_tx_txf`` (sourced from ``<file_dir>/R/functions.R``), converts
    the result to pandas and splits it by ``transplant_year``.

    :param include_lab: forwarded to ``get_cols``.
    :param include_ethdon: forwarded to ``get_cols``.
    :param lag: forwarded to ``get_cols`` and to ``combine_tx_txf``.
    :param eq_train_ratio: when truthy, ``eq_cases_train_cols`` is
        ``['TRR_ID', 'is_diab']``; otherwise ``None``.
    :param num_folds: when > 0, folds are created with ``caret::createFolds``.
    :param train_thresh_year: lower bound (inclusive) on ``transplant_year``
        for the non-test rows.
    :param cutoff: when truthy, non-test rows are further restricted to
        ``transplant_year + time_since_transplant < 2011``.
    :param file_dir: directory holding the RDS files and ``R/functions.R``.
    :return: dict with keys ``test``, ``train``, ``cols``,
        ``eq_cases_train_cols`` and ``folds``.
    """
    # rpy2 is imported lazily, inside the function.
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    from rpy2.robjects.lib.dplyr import DataFrame
    from rpy2.robjects.packages import STAP
    from rpy2.robjects import pandas2ri

    if eq_train_ratio:
        eq_cases_train_cols = np.array(['TRR_ID', 'is_diab'])
    else:
        eq_cases_train_cols = None

    # Read RDS files (load data table)
    read_rds = robjects.r['readRDS']
    tx_li_study = read_rds(os.path.join(file_dir, 'tx_li_formatted.rds'))
    txf_li_study = read_rds(os.path.join(file_dir, 'txf_li_formatted.rds'))

    # Merge them
    cols, cov_cols, timedep_cols = get_cols(include_lab, include_ethdon, lag,
                                            file_dir)
    # Load the project's R helpers as an anonymous package via STAP.
    with open(os.path.join(file_dir, 'R', 'functions.R'), 'r') as f:
        string = f.read()
    functions = STAP(string, 'functions')
    # 'age' is excluded from the covariate columns passed to the merge.
    merged = functions.combine_tx_txf(tx_li_study, txf_li_study,
                                      np.setdiff1d(cov_cols, 'age'),
                                      timedep_cols, lag)
    # Keep only rows where the next follow-up lies after the current time
    # point (filtered on the R side via dplyr) before converting to pandas.
    df = pandas2ri.ri2py_dataframe(
        DataFrame(merged).filter('time_next_followup > time_since_transplant'))

    # Prep data for model training - only take complete ones
    subset_cols = np.concatenate((['TRR_ID', 'age', 'transplant_year'], cols, [
        'is_diab', 'time_since_transplant', 'time_next_followup',
        'time_to_diab', 'diab_time_since_tx', 'diab_in_1_year', 'diab_now'
    ]))
    df = df.dropna(subset=subset_cols)
    # Test set: transplant year 2011 or later (hard-coded threshold).
    df_test = df[(df.transplant_year.astype(int) >= 2011)
                 & (df.time_to_diab >= 0)]
    # Non-test set: train_thresh_year <= transplant year < 2011.
    df_nontest = df[(df.transplant_year.astype(int) < 2011)
                    & (df.transplant_year.astype(int) >= train_thresh_year) &
                    (df.time_to_diab >= 0)]
    if cutoff:
        # `+` binds tighter than `<`: the sum of year and elapsed time is
        # compared against 2011.
        df_nontest = df_nontest[df_nontest.transplant_year.astype(int) +
                                df_nontest.time_since_transplant < 2011]

    if num_folds > 0:
        # One label per distinct (TRR_ID, is_diab) pair is handed to caret.
        nontest_y = df_nontest.drop_duplicates(
            subset=['TRR_ID', 'is_diab']).is_diab
        caret = importr('caret')
        folds = caret.createFolds(nontest_y.values, num_folds, False)
    else:
        folds = None

    return {
        'test': df_test,
        'train': df_nontest,
        'cols': cols,
        'eq_cases_train_cols': eq_cases_train_cols,
        'folds': folds
    }
Beispiel #12
0
def _convert_to_python(x):
    """Recursively convert an R object into pandas/numpy/list values.

    ``DataFrame`` instances become pandas DataFrames, ``ListVector`` and
    ``Vector`` instances become Python lists whose items are converted in
    turn, and anything else is wrapped with ``np.array``.
    """
    if isinstance(x, DataFrame):
        return pandas2ri.ri2py_dataframe(x)
    if isinstance(x, (ListVector, Vector)):
        return list(map(_convert_to_python, x))
    return np.array(x)
Beispiel #13
0
    def test1(self):
        """Run ``rkt::rkt`` on synthetic seasonal data and tabulate the result.

        Builds ``nyear * nseas`` random values with a linear upward drift,
        runs the R function, stores the raw result on ``self.res`` and a
        one-row summary DataFrame (columns ``sl``, ``S``, ``B``, ``varS``,
        ``tau``) on ``self.df``.
        """

        rkt = rpackages.importr('rkt')

        nyear = 4
        nseas = 5
        year = np.repeat(np.arange(2000, 2000 + nyear), nseas)
        dekad = np.tile(1 + np.arange(nseas), nyear)
        data = np.random.rand(nseas * nyear) + np.arange(nseas * nyear) * 0.1

        # Explicit R vectors; the never-taken numpy2ri alternative that sat
        # behind ``if 1: ... else:`` has been dropped.
        year = robjects.IntVector(year)
        dekad = robjects.IntVector(dekad)
        data = robjects.FloatVector(data)

        print(year)
        print(dekad)
        print(data)

        self.res = rkt.rkt(year, data, dekad)
        print(self.res)

        # Fixed: this used ``rw.res``, a name not defined in this method; the
        # result being tabulated is the one just stored on ``self.res``.
        df = pandas2ri.ri2py_dataframe(self.res).transpose()
        df.columns = self.res.names
        df = df[['sl', 'S', 'B', 'varS', 'tau']]

        print(pd.concat([df, df, df]))
        self.df = df
Beispiel #14
0
def _convert_to_python(x):
    """Turn an R object into its Python counterpart, recursing into vectors.

    * ``DataFrame`` -> pandas DataFrame
    * ``ListVector`` / ``Vector`` -> list of converted items
    * anything else -> ``np.array(x)``
    """
    if isinstance(x, DataFrame):
        converted = pandas2ri.ri2py_dataframe(x)
    elif isinstance(x, (ListVector, Vector)):
        converted = [_convert_to_python(element) for element in x]
    else:
        converted = np.array(x)
    return converted
Beispiel #15
0
 def run_deseq2(self, exp_lib_list, ctr_lib_list, size_factors,
                pairwise_replicates):
     """Run a DESeq2 Wald test of "exp" against "ctr" on ``self._count_df``.

     The count table is rounded and cast to ``int`` in place on ``self``
     before being handed to R.

     :param exp_lib_list: experiment libraries (only the length is used).
     :param ctr_lib_list: control libraries (only the length is used).
     :param size_factors: size factors to assign; ``None`` lets DESeq2
         estimate them.
     :param pairwise_replicates: when truthy, a ``samples`` column is added
         and the design becomes ``~ samples + conditions``.
     :return: tuple ``(results_df, size_factors)``; ``results_df`` is indexed
         like the count table and ``size_factors`` is a Series indexed by its
         columns.
     """
     self._count_df = np.round(self._count_df, decimals=0)
     self._count_df = self._count_df.astype(int)

     n_exp = len(exp_lib_list)
     n_ctr = len(ctr_lib_list)
     conds = ["exp"] * n_exp + ["ctr"] * n_ctr

     # Column data and design formula depend on whether replicates are paired.
     columns = {"conditions": robjects.StrVector(conds)}
     if pairwise_replicates:
         samples = list(range(1, n_exp + 1)) + list(range(1, n_ctr + 1))
         columns["samples"] = robjects.StrVector(samples)
         design = Formula('~ samples + conditions')
     else:
         design = Formula('~ conditions')
     colData = robjects.DataFrame(columns)

     r_count_df = robjects.DataFrame(self._count_df)
     r_count_df.colnames = robjects.rinterface.NULL
     dds = r.DESeqDataSetFromMatrix(countData=r_count_df,
                                    colData=colData, design=design)

     # Either estimate the size factors or impose the supplied ones.
     if size_factors is None:
         dds = r.estimateSizeFactors(dds)
     else:
         set_size_factors = r["sizeFactors<-"]
         dds = set_size_factors(
             object=dds, value=robjects.FloatVector(size_factors))

     dds = r.estimateDispersions(dds, quiet=True)
     dds = r.nbinomWaldTest(dds, quiet=True)
     size_factors = pd.Series(r.sizeFactors(dds),
                              index=self._count_df.columns)

     contrast = robjects.StrVector(("conditions", "exp", "ctr"))
     results = r.results(dds, contrast=contrast, altHypothesis="greater")
     results_df = pandas2ri.ri2py_dataframe(r['as.data.frame'](results))
     results_df.index = self._count_df.index
     return results_df, size_factors
def deaScranDESeq2(counts, conds, comparisons, alpha, scran_clusters=False):
    """Makes a call to DESeq2 with SCRAN to
    perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Returns a list of DESeq2 results for each comparison

    :param counts: pandas DataFrame of counts; the number of columns is
        used as the number of cells.
    :param conds: condition label per column, passed to R as a string vector.
    :param comparisons: iterable of ``(A, B)`` pairs used as
        ``contrast=c("conditions", A, B)``.
    :param alpha: forwarded to DESeq2 ``results``.
    :param scran_clusters: when truthy, ``scran.quickCluster`` is run first
        and its clusters are passed to ``computeSumFactors``.
    :return: list of pandas DataFrames, one per comparison, indexed by the
        R row names.

    Automatic pandas conversion is activated for the duration of the call
    and is now always deactivated again, including when an R call fails
    (previously an exception left it switched on).
    """
    results = []
    n_cells = len(counts.columns)
    pandas2ri.activate()
    try:
        deseq2 = RimportLibrary("DESeq2")
        scran = RimportLibrary("scran")
        multicore = RimportLibrary("BiocParallel")
        multicore.register(multicore.MulticoreParam(multiprocessing.cpu_count()-1))
        as_matrix = r["as.matrix"]
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.StrVector(conds)
        r_call = """
            function(r_counts) {
                sce = SingleCellExperiment(assays=list(counts=r_counts))
                return(sce)
            }
        """
        r_func = r(r_call)
        sce = r_func(as_matrix(r_counts))
        if scran_clusters:
            r_clusters = scran.quickCluster(as_matrix(r_counts), max(n_cells/10, 10))
            # Pool sizes are derived from the smallest cluster.
            min_cluster_size = min(Counter(r_clusters).values())
            sizes = list(set([round((min_cluster_size/2) / i) for i in [5,4,3,2,1]]))
            sce = scran.computeSumFactors(sce, clusters=r_clusters, sizes=sizes, positive=True)
        else:
            # Pool sizes are fractions of half the number of cells.
            sizes = list(set([round((n_cells/2) * i) for i in [0.1,0.2,0.3,0.4,0.5]]))
            sce = scran.computeSumFactors(sce, sizes=sizes, positive=True)
        sce = r.normalize(sce)
        dds = r.convertTo(sce, type="DESeq2")
        r_call = """
            function(dds, conditions){
                colData(dds)$conditions = as.factor(conditions)
                design(dds) = formula(~ conditions)
                return(dds)
            }
        """
        r_func = r(r_call)
        dds = r_func(dds, cond)
        dds = r.DESeq(dds)
        # Perform the comparisons and store results in list
        for A,B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B), alpha=alpha)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        pandas2ri.deactivate()
    return results
def apply_transferFunction_metric(r_stream1, r_stream2, evalresp1, evalresp2):
    """
    Invoke the R ``IRISMustangMetrics::transferFunctionMetric`` metric and convert
    the R dataframe result into a Pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param evalresp1: pandas DataFrame converted to R and passed as the third argument
    :param evalresp2: pandas DataFrame converted to R and passed as the fourth argument
    :return: pandas DataFrame whose ``starttime`` and ``endtime`` columns have been
        converted with ``UTCDateTime``
    """
    R_function = robjects.r('IRISMustangMetrics::transferFunctionMetric')

    # NOTE:  Conversion of dataframes only works if you activate but we don't want conversion
    # NOTE:  to always be automatic so we deactivate() after we're done converting.
    # NOTE:  try/finally guarantees the deactivate() even when a conversion raises.
    pandas2ri.activate()
    try:
        r_evalresp1 = pandas2ri.py2ri_pandasdataframe(evalresp1)
        r_evalresp2 = pandas2ri.py2ri_pandasdataframe(evalresp2)
    finally:
        pandas2ri.deactivate()

    # TODO:  Can we just activate/deactivate before/after R_function() without converting
    # TODO:  r_evalresp1/2 ahead of time?

    # Calculate the metric
    r_metriclist = R_function(r_stream1, r_stream2, r_evalresp1, r_evalresp2)
    r_dataframe = _R_metricList2DF(r_metriclist)
    pandas2ri.activate()
    try:
        df = pandas2ri.ri2py_dataframe(r_dataframe)
    finally:
        pandas2ri.deactivate()

    # Convert columns from R POSIXct to python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
Beispiel #18
0
def getResult(request):
    # View: collects the submitted listing fields from request.POST, runs the
    # R function DECISION (sourced from DECISION.R next to this file) on them
    # and renders 'result.html'.
    # NOTE: the template context is locals(), so every local variable name in
    # this function is visible to the template - do not rename locals without
    # checking the template.

    # Collect all the data to be computed and convert it to a format R can read
    ListingId = request.POST.getlist('ListingId', [])
    Title = request.POST.getlist('Title', [])
    inputAmount = request.POST['inputAmount']
    Months = request.POST.getlist('Months', [])
    CreditCode = request.POST.getlist('CreditCode', [])
    Rate = request.POST.getlist('Rate', [])
    # inputAmount is a single value, repeated once per listing so that every
    # column of the R data frame has the same length.
    data = rlc.OrdDict([('ListingId', rob.StrVector(ListingId)),
                        ('Title', rob.StrVector(Title)),
                        ('inputAmount',
                         rob.IntVector([inputAmount] * len(ListingId))),
                        ('Months', rob.IntVector(Months)),
                        ('CreditCode', rob.StrVector(CreditCode)),
                        ('Rate', rob.FloatVector(Rate))])
    inputCalDataFrame = rob.DataFrame(data)
    # --- begin R section (the bare string below is a marker, not a comment) ---
    """导入R"""
    rFilePath = os.path.dirname(os.path.abspath(__file__)) + '/DECISION.R'
    rob.r.source(rFilePath)
    decision = rob.globalenv['DECISION'](inputCalDataFrame)
    decisionDataFrame = pandas2ri.ri2py_dataframe(
        decision)  # convert to a Python (pandas) DataFrame
    """/导入R """
    # --- end R section ---
    # Convert to the output result
    # The amount shown is taken from the first row of the R result.
    inputAmount = list(decisionDataFrame['inputAmount'])[0]
    resultList = []
    for index, row in decisionDataFrame.iterrows():
        resultList.append(row.to_dict())

    return render(request, 'result.html', locals())
Beispiel #19
0
def get_enrichment_GO(input_vector, pcutoff=0.05, adjustmethod="BH",
                      qcutoff=0.2, ont="BP", input="SYMBOL",
                      readable=True):

    """
    Gene Ontology Enrichment Analysis (clusterProfiler Bioconductor)

    Calls the R function ``enrichment_test_GO`` and converts its result to
    pandas.

    Args:
        input_vector: (obj:list) gene ids str format
        pcutoff: p-value threshold
        adjustmethod: Multiple Testing Correction method
                      one of "holm", "hochberg", "hommel", "bonferroni",
                      "BH", "BY", "fdr", "none"
        qcutoff: q-value threshold, forwarded to the R function
        ont: Gene Ontology Category
             "BP": biological process
             "CC": cellular compartment
             "MF": molecular function
        input: 'SYMBOL' or 'ENTREZ'
        readable: forwarded to the R function as ``readable``

    Returns:
        df: DataFrame with enrichment results
    """

    r_enrich = robjects.r['enrichment_test_GO']
    gene_vector = robjects.StrVector(input_vector)

    # Positional order expected by the R helper: genes, adjustment method,
    # p cutoff, q cutoff; the remaining options are passed by name.
    r_result = r_enrich(gene_vector, adjustmethod, pcutoff, qcutoff,
                        ont=ont, input=input, readable=readable)
    return pandas2ri.ri2py_dataframe(r_result)
Beispiel #20
0
def _edger_func_exacttest(the_data, the_groups, fdr=0.01, lfc=1, pair=None, return_full=False):
    """
    Compare two groups with edgeR's exact test (no GLM fit). Only a single factor is supported.
    :param the_data: pandas count data, converted to R and passed to ``DGEList``
    :param the_groups: group label per sample, used as an R factor
    :param fdr: ``p.value`` cutoff given to ``topTags`` unless ``return_full`` is set
    :param lfc: minimum absolute ``logFC`` kept in the result; None disables the filter
    :param pair: An iterable of two group names. If None, compare the first two groups.
    :param return_full: when truthy, ``topTags`` is called with ``p.value`` 1.0
    :return: DataFrame of the ``topTags`` table (empty, with ``toptags_cols`` columns, when
        nothing is returned by R)
    """
    if pair is None:
        # The first two distinct groups, in order of appearance.
        pair = pd.factorize(the_groups)[1][:2]

    rpair = robjects.StrVector(pair)
    rdata = pandas2ri.py2ri(the_data)
    rgroups = robjects.FactorVector(the_groups)

    dge = r("DGEList")(rdata, group=rgroups)
    dge = r("calcNormFactors")(dge)
    dge = r("estimateDisp")(dge)
    exact = r('exactTest')(dge, rpair)

    p_cutoff = 1. if return_full else fdr
    toptags = r('topTags')(exact, n=r('Inf'), **{'p.value': p_cutoff})

    if len(toptags) == 0:
        return pd.DataFrame(columns=toptags_cols)

    table = pandas2ri.ri2py_dataframe(toptags[toptags.names.index('table')])
    if lfc is not None:
        table = table.loc[table.loc[:, 'logFC'].abs() >= lfc]
    return table
Beispiel #21
0
def get_enrichment_Reactome(input_vector, pcutoff=0.05, adjustmethod="BH",
                            qcutoff=0.2, min_gs_size=10, max_gs_size=500,
                            organism="human"):

    """
    Reactome Enrichment Analysis (ReactomePA Bioconductor)

    Calls the R function ``enrichment_test_Reactome`` and converts its result
    to pandas.

    Args:
        input_vector: (obj:list) gene entrez ids str format
        pcutoff: p-value threshold
        adjustmethod: Multiple Testing Correction method
                      one of "holm", "hochberg", "hommel", "bonferroni",
                      "BH", "BY", "fdr", "none"
        qcutoff: q-value threshold
        min_gs_size: forwarded to R as ``min_gs_size``
        max_gs_size: forwarded to R as ``max_gs_size``
        organism: currently NOT forwarded to the R function (kept for
                  interface compatibility; the R helper's signature is not
                  visible here)

    Returns:
        df: DataFrame with enrichment results
    """

    enrich = robjects.r['enrichment_test_Reactome']
    genes = robjects.StrVector(input_vector)
    # Fixed: the gene-set size limits were hard-coded to 10/500 in the call,
    # silently ignoring the caller's arguments.  The defaults are unchanged.
    enrichment = pandas2ri.ri2py_dataframe(enrich(genes,
                                                  pcutoff, adjustmethod,
                                                  qcutoff,
                                                  min_gs_size=min_gs_size,
                                                  max_gs_size=max_gs_size))
    return enrichment
Beispiel #22
0
    def ddx(self, contrasts=None, formula=None):
        """Run differential expression on the transcriptome counts.

        :param contrasts: metadata column(s) for the design matrix; defaults
            to ``self.contrasts``.
        :param formula: design formula string; defaults to ``"~"`` followed
            by the contrast columns joined with ``"+"``.
        :return: DataFrame returned by ``DE_Genes``, indexed by ``gene``.
        """
        if contrasts is None:
            contrasts = self.contrasts
        if formula is None:
            # Fixed: the default formula was always built from self.contrasts,
            # so it could name columns that are absent from the design matrix
            # when a ``contrasts`` argument was supplied.  A single column
            # name is wrapped so it is not joined character by character.
            terms = [contrasts] if isinstance(contrasts, str) else contrasts
            formula = "~" + "+".join(terms)

        df = self.data["Transcriptome Profiling"]['counts'].astype(int)
        # Align the metadata rows with the count columns.
        design = self.metadata[contrasts].reindex(df.columns).reset_index()
        formula = Formula(formula)

        DEG = pandas2ri.ri2py_dataframe(
            DE_Genes(counts_df=pandas2ri.py2ri(df),
                     design_matrix=pandas2ri.py2ri(design),
                     design_formula=formula)).set_index("gene")

        # NOTE: a commented-out "characteristic direction" (geode.chdir)
        # variant that used to follow here was removed; see version control.

        return DEG
Beispiel #23
0
    def mic(self):
        """Runs MIC analysis.

        Filters ``self.df_x`` with ``_filter_by_variance``, runs
        ``minerva::mine`` on the values and writes the first element of the
        result to ``MIC.csv`` inside ``<results_directory>/output_mic``.

        The output directory is created through ``__careful_mkdir``; per the
        original documentation that helper prompts before reusing an existing
        directory or overwriting an existing output file unless run with
        -noprompt (behaviour not visible from this method).

        Nothing is returned.
        """
        # create output dir
        outdir = os.path.join(self._config['results_directory'], 'output_mic')
        self.__careful_mkdir(outdir)

        # prepare data
        filtered = _filter_by_variance(self.df_x)

        # call R
        # TODO just a sec and we'll probably switch this to minepy
        minerva = importr('minerva')
        # pylint: disable=no-member
        mine_out = minerva.mine(filtered.values)
        # pylint: enable=no-member
        mic_out = pandas2ri.ri2py_dataframe(mine_out.rx2(1))

        # restore names: row/column labels are positions into the feature list
        names_list = list(filtered.columns.values)

        def restore(label):
            return names_list[int(label)]

        mic_out.rename(columns=restore, index=restore, inplace=True)

        # save
        target = os.path.join(outdir, 'MIC.csv')
        mic_out.to_csv(target, index_label='feature')
	def run(self):
		"""Simulate data with the R ``simulate`` function and return it as ints.

		The R function is obtained via ``self.get_r_method`` and called with
		``self.model`` and ``self.nrows``; the elapsed time is logged.
		"""
		self.LOG.info("Starting to simulate data...")
		started = time()
		r_simulate = self.get_r_method(c.SIMULATE_R_FILE, 'simulate')
		frame = pandas2ri.ri2py_dataframe(r_simulate(self.model, self.nrows))
		self.LOG.info("Data simulation complete in %d sec." % (time() - started))
		return frame.astype(int, copy=False)
Beispiel #25
0
def get_tf_factor(var, from_to, value_col="IMPUTED"):
    """Call the R function ``tf_factor_tbl`` and return its result as pandas.

    :param var: values turned into an R factor and then into character.
    :param from_to: integers passed to R as an integer vector.
    :param value_col: forwarded unchanged as the third argument.
    :return: pandas DataFrame converted from the R result.

    Prints ``var[0]`` and triggers a garbage collection before returning.
    """
    as_character = r['as.character']
    r_var = as_character(robjects.FactorVector(var))
    r_from_to = robjects.IntVector(from_to)

    r_table = r['tf_factor_tbl'](as_character(r_var), r_from_to, value_col)
    frame = pandas2ri.ri2py_dataframe(r_table)

    print(var[0])
    gc.collect()
    return frame
Beispiel #26
0
 def run(self):
     """Simulate data through R and return it as an integer DataFrame.

     Looks up the R ``simulate`` function with ``self.get_r_method``, calls
     it with ``self.model`` and ``self.nrows`` and logs how long it took.
     """
     log = self.LOG
     log.info("Starting to simulate data...")
     start_time = time()
     r_simulate = self.get_r_method(c.SIMULATE_R_FILE, 'simulate')
     r_result = r_simulate(self.model, self.nrows)
     simulated = pandas2ri.ri2py_dataframe(r_result)
     log.info("Data simulation complete in %d sec." % (time() - start_time))
     return simulated.astype(int, copy=False)
Beispiel #27
0
def fetch_data(app_struct, input_date, offset):
    """Run the generated R code and return R's ``table`` object as pandas.

    ``input_date`` is passed twice to ``generate_rcode`` (second and third
    argument), together with the token, offset and sandbox flag taken from
    ``app_struct``.
    """
    # Run the R engine
    r(generate_rcode(app_struct.token, input_date, input_date, offset,
                     app_struct.sandbox))

    # Get the result
    # NOTE(review): presumably the generated code assigns a variable named
    # `table` - confirm against generate_rcode.
    return pandas2ri.ri2py_dataframe(robjects.r['table'])
Beispiel #28
0
 def _run_gsea(df, genesets, method='ssgsea', verbose=False, **kwargs):
     """Run R's ``gsva`` on ``df`` and return the scores as a DataFrame.

     :param df: pandas DataFrame, converted with R's ``as.matrix``.
     :param genesets: mapping passed to R as a list vector.
     :param method: forwarded to ``gsva``.
     :param verbose: forwarded to ``gsva``.
     :param kwargs: any further keyword arguments for ``gsva``.
     :return: DataFrame indexed by the R row names of the result, with the
         columns of ``df``.
     """
     r_matrix = r('as.matrix')(df)
     r_sets = robjects.ListVector(genesets)
     r_scores = r('gsva')(r_matrix, r_sets, method=method, verbose=verbose,
                          **kwargs)

     scores = pandas2ri.ri2py_dataframe(r_scores)
     scores.index = r('rownames')(r_scores)
     # Column labels come from the input frame rather than R's colnames.
     scores.columns = df.columns
     return scores
Beispiel #29
0
 def normalized_count(self):
     """Return the normalized count table of ``self.dds`` as a DataFrame.

     The result is also stored on ``self.normalized_count_df`` and gets a
     ``self.gene_column`` column filled from ``self.gene_id``.
     """
     r_counts = to_dataframe(
         deseq.counts_DESeqDataSet(self.dds, normalized=True))
     # switch back to python
     self.normalized_count_df = pandas2ri.ri2py_dataframe(r_counts)
     self.normalized_count_df[self.gene_column] = self.gene_id.values
     return self.normalized_count_df
def _eval_one_setting_grf(train,
                          test,
                          n_train,
                          n_test,
                          num_trees=NUM_TREES_BASE,
                          d=1,
                          te_function=None,
                          baseline_model=None,
                          propensity_model=None,
                          covariate_model=None,
                          error_model=None,
                          binary_y=False,
                          selection_bias=None,
                          seedy=42,
                          root=PAPER_UTILS_ROOT):
    """Simulate one data setting, fit the R models and score them.

    Data for ``n_train + n_test`` units is generated with ``make_te_data``
    and split with ``_safe_split_te`` using the ``train`` / ``test``
    selectors.  The R function ``get_te_predictions`` (sourced from
    ``root + 'grf_experiments.R'``) is called on the training data and the
    test covariates.

    :return: list of five mean squared errors, one per column 0..4 of the R
        output, each against the test-set ``t`` values.
    """
    # get data
    np.random.seed(seedy)
    X, y, w, t, p, _ = make_te_data(n=n_train + n_test,
                                    d=d,
                                    te_model=te_function,
                                    baseline_model=baseline_model,
                                    covariate_model=covariate_model,
                                    propensity_model=propensity_model,
                                    binary_y=binary_y,
                                    error_model=error_model,
                                    seedy=seedy,
                                    selection_bias=selection_bias)

    # split data
    train_split = _safe_split_te(X, y, w, p, t, train)
    X_train, y_train, w_train, p_train = train_split[:4]
    test_split = _safe_split_te(X, y, w, p, t, test)
    X_test, t_test = test_split[0], test_split[4]

    # convert to R objects
    r_y = robjects.FloatVector(y_train)
    r_x = robjects.r.matrix(X_train, n_train, d)
    r_w = robjects.IntVector(w_train)
    r_p = robjects.FloatVector(p_train)
    r_x_test = robjects.r.matrix(X_test, n_test, d)

    # get function from R script
    robjects.r['source'](root + 'grf_experiments.R')
    r_get_te_predictions = robjects.globalenv['get_te_predictions']

    r_out = r_get_te_predictions(r_x, r_y, r_w, r_p, r_x_test,
                                 num_trees=num_trees)
    predictions = pandas2ri.ri2py_dataframe(r_out).values

    # one MSE per prediction column
    return [
        mean_squared_error(t_test, predictions[:, column])
        for column in range(5)
    ]
Beispiel #31
0
 def predict(self, X, n_draws=0, parallel=False):
     """Predict for ``X`` through the R model and undo the target scaling.

     The scaled features are written to a uniquely named feather file in
     ``self.outdir`` whose path is handed to the R ``predict_`` function.

     :param X: pandas DataFrame of features (its columns are reused).
     :param n_draws: forwarded to the R ``predict_`` function.
     :param parallel: forwarded to the R ``predict_`` function.
     :return: ``self.y_scaler.inverse_transform`` of the predicted values.
     """
     X_out = self.x_scaler.transform(X)
     dfout = pd.DataFrame(X_out, columns=X.columns)
     dfoutpath = "{}/{}.feather".format(self.outdir, uuid.uuid4())
     dfout.to_feather(dfoutpath)
     # Fixed: the temporary file used to be leaked whenever the R call or
     # the conversion raised; it is now removed in all cases.
     try:
         ml = self.get_ml()
         out_ = ml.predict_(self.ml_, dfoutpath, n_draws, parallel, self.pacman_call)
         pred = pandas2ri.ri2py_dataframe(out_)
     finally:
         os.remove(dfoutpath)
     return self.y_scaler.inverse_transform(pred.values)
Beispiel #32
0
def _edger_func_test(fit, design, contrast_str, fdr=0.01, lfc=1, return_full=False):
    """Run R's ``glmTreat`` for one contrast and return the ``topTags`` table.

    The contrast is built with ``makeContrasts(contrast_str, levels=design)``.
    When *return_full* is true the ``p.value`` cut-off passed to ``topTags``
    is 1, otherwise it is *fdr*.  An empty ``topTags`` result yields an empty
    DataFrame with ``toptags_cols`` as columns.
    """
    contrast = r('makeContrasts')(contrast_str, levels=design)
    treat_fit = r('glmTreat')(fit, contrast=contrast, lfc=lfc)

    p_cutoff = 1. if return_full else fdr
    top = r('topTags')(treat_fit, n=r('Inf'), **{'p.value': p_cutoff})

    if len(top) == 0:
        return pd.DataFrame(columns=toptags_cols)

    table_position = top.names.index('table')
    return pandas2ri.ri2py_dataframe(top[table_position])
Beispiel #33
0
    def predict(self, X, return_se: bool = False):
        """
        Predict with the fitted R regression forest.

        X may be a pandas DataFrame (its ``.values`` are used) or any 2-D
        array exposing ``.shape``.  With ``return_se`` the forest is queried
        with ``estimate_variance=True`` and a 2-tuple is returned (columns 0
        and 1 of the transposed result; presumably predictions and variance
        estimates -- confirm against the grf output).  Otherwise only the
        first row of the converted result is returned.
        """
        features = X.values if isinstance(X, pd.DataFrame) else X
        n_rows, n_cols = features.shape
        r_x = robjects.r.matrix(features, n_rows, n_cols)

        if not return_se:
            r_out = self._grf.predict_regression_forest(self._estimator,
                                                        newdata=r_x)
            values = pandas2ri.ri2py_dataframe(r_out).values
            return np.transpose(values[0, :])

        # predict with var
        r_out = self._grf.predict_regression_forest(
            self._estimator, newdata=r_x, estimate_variance=True)
        values = np.transpose(pandas2ri.ri2py_dataframe(r_out).values)
        return values[:, 0], values[:, 1]
def _edger_tmm_normalisation_cpm(count_data):
    """Return counts-per-million for *count_data* after R-side normalisation.

    The frame is converted to R, wrapped with ``DGEList``, passed through
    ``calcNormFactors`` (default arguments) and ``cpm``.  The result is
    converted back to pandas and given the index and columns of the input.
    """
    ro = rinterface.robjects
    converter = rinterface.robjects.pandas2ri

    r_counts = converter.py2ri(count_data)
    dge = ro.r("DGEList")(r_counts)
    normalised = ro.r("calcNormFactors")(dge)
    result = converter.ri2py_dataframe(ro.r('cpm')(normalised))

    # Labels are not taken from the R result; restore them from the input.
    result.index = count_data.index
    result.columns = count_data.columns
    return result
Beispiel #35
0
def dataframe_to_pandas(r_frame):
    """Convert an R data frame to pandas, carrying over its dimnames.

    Column and row names are read with R's ``colnames``/``rownames`` and
    applied to the converted frame unless R reports them as NULL.
    """
    pd_frame = pandas2ri.ri2py_dataframe(r_frame)

    # Extract column names if possible.
    col_names = robjects.r.colnames(r_frame)
    if not isinstance(col_names, RNULLType):
        pd_frame.columns = col_names

    # Extract row names if possible.
    index = robjects.r.rownames(r_frame)
    if not isinstance(index, RNULLType):
        pd_frame.index = index

    return pd_frame
Beispiel #36
0
def dataframe_to_pandas(r_frame):
    """Convert an R data frame to pandas, carrying over its dimnames.

    Column and row names are read with R's ``colnames``/``rownames`` and
    applied to the converted frame unless R reports them as NULL.
    """
    pd_frame = pandas2ri.ri2py_dataframe(r_frame)

    # Extract column names if possible.
    col_names = robjects.r.colnames(r_frame)
    if not isinstance(col_names, RNULLType):
        pd_frame.columns = col_names

    # Extract row names if possible.
    index = robjects.r.rownames(r_frame)
    if not isinstance(index, RNULLType):
        pd_frame.index = index

    return pd_frame
Beispiel #37
0
    def MannKendall(self, data):
        """Run R's ``MannKendall`` (package 'Kendall') on *data*.

        The raw R result is stored on ``self.res`` and printed.  It is then
        returned as a one-row-per-result DataFrame restricted to the columns
        'sl', 'S', 'B', 'varS' and 'tau'.

        NOTE(review): the selected column names must all be present in
        ``self.res.names``; confirm that the 'Kendall' package reports 'B'.
        """
        kendall_pkg = rpackages.importr('Kendall')
        r_series = robjects.FloatVector(data)

        self.res = kendall_pkg.MannKendall(r_series)
        print(self.res)

        result = pandas2ri.ri2py_dataframe(self.res).transpose()
        result.columns = self.res.names
        return result[['sl', 'S', 'B', 'varS', 'tau']]
Beispiel #38
0
    def predict_proba(self, X):
        """ Computes possible class probabilities for the input 'X'

        Parameters
        -----------
            X: pandas.DataFrame object

        Returns
        -------
            pandas.DataFrame of shape (#datapoints, 2), the possible probability of each class for each observation

        Raises
        ------
            exceptions.DataSetError if X is not a pandas.DataFrame
        """
        if isinstance(X, pd.DataFrame):
            # Convert every column to a factor, then hand the frame to R.
            factored = self.__s_apply(X, self.__as_factor)
            r_input = self.__r_frame(factored)
            r_scores = self.__r_sbrl.predict_sbrl(self.model, r_input)
            return pandas2ri.ri2py_dataframe(r_scores).T

        raise exceptions.DataSetError("Only pandas.DataFrame as input type is currently supported")
Beispiel #39
0
def peak_table(xmcs_set, filebase="peakList"):
    """Export the global peak table

    Parameters
    ----------------
    xmcs_set : xcmsSet
        R xcms set.
    filebase : str
        Passed as the second argument to R's ``peakTable``.

    Returns
    -----------
    out : dataFrame
        xcms peak dataFrame, with column names taken from the R table.
    """
    r_table = robjects.r["peakTable"](xmcs_set, filebase)
    frame = pandas2ri.ri2py_dataframe(r_table)
    frame.columns = r_table.colnames
    return frame
Beispiel #40
0
    def calculate_result(self, scores):
        """
        Fit a mixed model to the scores in R and return the per-paper
        estimates as CSV text, reporting progress through
        ``self.update_progress`` along the way.

        A ``Confidence`` column in the scores, when present, is used as the
        model weights.
        """

        update_frequency = 1
        max_steps = 7

        # Progress is reported before each library load and once more after
        # the last one (steps 1-6).
        library_calls = (
            'library(MASS)',
            'library(Matrix)',
            'library(lme4)',
            'library(Rcpp)',
            'library(arm)',
        )
        for step, library_call in enumerate(library_calls, start=1):
            self.update_progress(step, max_steps, update_frequency=update_frequency)
            ro.r(library_call)
        self.update_progress(len(library_calls) + 1, max_steps,
                             update_frequency=update_frequency)

        # estimate scores
        scores_pd = pd.DataFrame(scores)
        ro.globalenv['scores'] = com.convert_to_r_dataframe(scores_pd)

        weighted = 'Confidence' in scores_pd.columns
        fit_str = (
            'fit <- lmer(Score ~ 1 + (1 | PaperID) + (1 | PersonID), scores, weights = Confidence)'
            if weighted else
            'fit <- lmer(Score ~ 1 + (1 | PaperID) + (1 | PersonID), scores)'
        )
        ro.r(fit_str)

        ro.r('''bayes_score <- data.frame(PaperID = rownames(fixef(fit) + ranef(fit)$PaperID),
            Mean = (fixef(fit) + ranef(fit)$PaperID)[,1],
            SD = (se.ranef(fit)$PaperID)[, 1])''')

        result = pandas2ri.ri2py_dataframe(ro.r('bayes_score'))

        self.update_progress(7, max_steps, update_frequency=update_frequency)

        # Passing None as the path makes to_csv return the CSV as a string.
        return result.to_csv(None, na_rep='', index=False, encoding='utf-8')
Beispiel #41
0
def deaDESeq2(counts, conds, comparisons, alpha, size_factors=None):
    """Makes a call to DESeq2 to
    perform D.E.A. in the given
    counts matrix with the given conditions and comparisons.
    Can be given size factors.
    Returns a list of DESeq2 results for each comparison

    :param counts: pandas DataFrame of counts, converted to R as countData
    :param conds: iterable of condition labels, one per column of counts
    :param comparisons: iterable of (A, B) pairs of condition labels
    :param alpha: significance cut-off forwarded to DESeq2 ``results``
    :param size_factors: optional size factors; when given they are assigned
        to the data set and dispersion estimation plus the Wald test are run
        instead of the full ``DESeq`` pipeline
    :return: list of pandas DataFrames (indexed by gene), one per comparison

    Any error raised by R propagates to the caller; the pandas conversion
    is deactivated again in either case.
    """
    results = list()
    pandas2ri.activate()
    try:
        # Imported for its side effect of loading DESeq2 into the R session.
        RimportLibrary("DESeq2")
        multicore = RimportLibrary("BiocParallel")
        # Leave one core free but never request zero workers (single-core host).
        workers = max(1, multiprocessing.cpu_count() - 1)
        multicore.register(multicore.MulticoreParam(workers))
        # Create the R conditions and counts data
        r_counts = pandas2ri.py2ri(counts)
        cond = robjects.DataFrame({"conditions": robjects.StrVector(conds)})
        design = r('formula(~ conditions)')
        dds = r.DESeqDataSetFromMatrix(countData=r_counts, colData=cond, design=design)
        if size_factors is None:
            dds = r.DESeq(dds, parallel=True, useT=True,
                          minmu=1e-6, minReplicatesForReplace=np.inf)
        else:
            assign_sf = r["sizeFactors<-"]
            dds = assign_sf(object=dds, value=robjects.FloatVector(size_factors))
            dds = r.estimateDispersions(dds)
            dds = r.nbinomWaldTest(dds)
        # Perform the comparisons and store results in list
        for A, B in comparisons:
            result = r.results(dds, contrast=r.c("conditions", A, B),
                               alpha=alpha, parallel=True)
            result = r['as.data.frame'](result)
            genes = r['rownames'](result)
            result = pandas2ri.ri2py_dataframe(result)
            # There seems to be a problem parsing the rownames from R to pandas
            # so we do it manually
            result.index = genes
            results.append(result)
    finally:
        # Undo the global conversion hook even when the R calls fail.
        pandas2ri.deactivate()
    return results
def apply_correlation_metric(r_stream1, r_stream2, metric_function_name, *args, **kwargs):
    """
    Invoke a named "correlation" R metric and convert the R dataframe result into
    a Pandas dataframe.
    :param r_stream1: an r_stream object
    :param r_stream2: an r_stream object
    :param metric_function_name: the name of the set of metrics; the R function
        called is ``IRISMustangMetrics::<metric_function_name>Metric``
    :param args: extra positional arguments forwarded to the R function
    :param kwargs: extra keyword arguments forwarded to the R function
    :return: pandas DataFrame of metrics with ``starttime`` and ``endtime``
        converted to UTCDateTime
    """
    function = 'IRISMustangMetrics::' + metric_function_name + 'Metric'
    R_function = robjects.r(function)
    pandas2ri.activate()
    try:
        # args and kwargs shouldn't be needed in theory
        r_metriclist = R_function(r_stream1, r_stream2, *args, **kwargs)
    finally:
        # Do not leave the global conversion active if the R call fails.
        pandas2ri.deactivate()
    r_dataframe = _R_metricList2DF(r_metriclist)
    df = pandas2ri.ri2py_dataframe(r_dataframe)

    # Convert columns from R POSIXct to python UTCDateTime
    df.starttime = df.starttime.apply(UTCDateTime)
    df.endtime = df.endtime.apply(UTCDateTime)
    return df
def xml2df(url):
     """Parse the XML at *url* with R (XML + plyr) and return a pandas DataFrame.

     An R helper ``getXML`` is defined on the fly; it parses the document,
     converts it to a list and row-binds the flattened entries into a data
     frame, which is then converted to pandas.
     """
     from rpy2.robjects.packages import SignatureTranslatedAnonymousPackage
     from rpy2.robjects import pandas2ri

     # R source defining the getXML helper.
     r_source = """
     require(XML)
     require(plyr)

     getXML <- function(x) {
          xmlfile <- xmlTreeParse(x)
          temp = xmlToList(xmlfile, addAttributes = F)
          df <- ldply(temp, .fun=function(x) {data.frame(t(unlist(x)))})
          return(df)
     }
     """
     xml_pkg = SignatureTranslatedAnonymousPackage(r_source, "test")

     # Convert the R data frame into a pandas DataFrame.
     r_frame = xml_pkg.getXML(url)
     return pandas2ri.ri2py_dataframe(r_frame)
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given, and
    executes the phenotype-file task selected with ``--task``.  The input
    file is the last element of *argv*; results are written to
    ``options.stdout``.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--test", dest="test", type="string",
                      help="supply help")

    parser.add_option("--task", dest="task", type="choice",
                      choices=["set_factors", "dichotimise_phenotype",
                               "plink_format", "select_ethnicity",
                               "merge_covariates", "subset_phenotypes"],
                      help="task to execute on phenotype file(s)")

    parser.add_option("--R-script", dest="r_script", type="string",
                      help="R script for table reformatting")

    parser.add_option("--adjustment", dest="adjust", type="choice",
                      choices=["snp"],
                      help="adjustements to make pre- or post mergeing")

    parser.add_option("--pheno-id", dest="dichot_var", type="string",
                      help="column header of variable to be dichotimised")

    parser.add_option("--reference-variable", dest="ref_level", type="string",
                      help="level of variable to be dichotimised toi set to 1")

    parser.add_option("--missing-var-label", dest="missing_label", type="string",
                      help="missing/unobserved value labels")

    parser.add_option("--id-variable", dest="id_var", type="string",
                      help="ID variable column header")

    parser.add_option("--ethnicity-id", dest="ethnic_var", type="string",
                      help="column header for variable containing "
                      "ethnicity data")

    parser.add_option("--ethnicity-label", dest="ethnic", type="string",
                      help="ethnicity label to select samples on")

    parser.add_option("--covariate-file", dest="covar_file", type="string",
                      help="a comma-separated list of files to be merged, or "
                      "a single file")

    parser.add_option("--fam-file", dest="fam_file", type="string",
                      help="Plink .fam file that specifies which samples "
                      "to subset from the phenotypes file")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    infile = argv[-1]
    if options.task == "set_factors":
        pandas2ri.activate()
        R('''source("%s")''' % options.r_script)
        R('''format <- format_phenotypes("%s")''' % infile)
        pheno_df = pandas2ri.ri2py_dataframe(R["format"])
        pheno_df["IID"] = pheno_df["f.eid"]
        labels = ["FID" if re.search("f.eid", xc) else xc
                  for xc in pheno_df.columns.tolist()]
        # columns need to be FID, IID, ...
        # Reorder by position so every label stays attached to its own data;
        # assigning a reordered label list to .columns would only relabel
        # the columns and leave the data in the old order.
        fid_pos = labels.index("FID")
        iid_pos = labels.index("IID")
        order = [fid_pos, iid_pos] + [pos for pos in range(len(labels))
                                      if pos not in (fid_pos, iid_pos)]
        pheno_df = pheno_df.iloc[:, order]
        pheno_df.columns = [labels[pos] for pos in order]
        pheno_df.to_csv(options.stdout, sep="\t")

    elif options.task == "dichotimise_phenotype":
        # catch situation where delimiter is not tab: a single parsed
        # column means the tab split failed, so fall back to whitespace
        df = pd.read_table(infile, sep="\t", header=0, index_col=None)
        if len(df.columns) <= 1:
            df = pd.read_table(infile, sep=r"\s+", index_col=None)

        var = pd.Series(df[options.dichot_var].copy(), dtype=np.int64)
        ref = np.int64(options.ref_level)
        mask = var.isin([ref])
        # set NA or unobserved to missing, assume missing value is -9 (Plink standard)
        nas = np.isnan(var)
        var[~mask] = 1
        var[mask] = 2
        var[nas] = -9
        # there maybe multiple missing/unobserved data categories to deal with
        # NOTE(review): the labels are strings and are compared against the
        # already recoded integer values -- confirm this matches as intended.
        missing = options.missing_label.split(",")
        miss_mask = var.isin(missing)
        var[miss_mask] = -9
        # output in plink format
        p_df = df.loc[:, ("FID", "IID", options.dichot_var)]
        p_df[options.dichot_var] = pd.Series(var, dtype=np.int64)
        p_df.index = p_df["FID"]
        p_df.drop(labels="FID", axis=1, inplace=True)
        p_df.to_csv(options.stdout, sep="\t")

    elif options.task == "plink_format":
        # add IID and FID columns based on individual IDs
        pheno_df = pd.read_table(infile, sep="\t", header=0, index_col=None)
        pheno_df["IID"] = pheno_df[options.id_var]
        pheno_df["FID"] = pheno_df[options.id_var]
        cols = pheno_df.columns.tolist()
        cols = [xc for xc in cols if not re.search(options.id_var, xc)]
        # columns need to be FID, IID, ...
        cols.remove("FID")
        cols.remove("IID")
        new_cols = cols
        new_cols.insert(0, "FID")
        new_cols.insert(1, "IID")
        resort_df = pheno_df[new_cols]
        resort_df.index = resort_df["FID"]
        resort_df.drop(labels="FID", axis=1, inplace=True)
        resort_df.to_csv(options.stdout, sep="\t")

    elif options.task == "select_ethnicity":
        # select ethnicity
        pheno_df = pd.read_table(infile, sep="\t", header=0, index_col=None)
        ethnic_var = pheno_df.loc[:, options.ethnic_var].copy()
        ethnic_mask = ethnic_var == int(options.ethnic)
        select_indv = ethnic_var[ethnic_mask].index
        filter_df = pheno_df.loc[select_indv, :]
        filter_df.index = filter_df["FID"]
        filter_df.drop(labels="FID", axis=1, inplace=True)
        filter_df.to_csv(options.stdout, sep="\t")

    elif options.task == "merge_covariates":
        filelist = options.covar_file.split(",")
        if len(filelist) > 1:
            df = pd.read_table(filelist.pop(0), sep="\t",
                               index_col=None, header=0)
            if options.adjust == "snp":
                re_snp = re.compile(".raw")
                snp_file = [fil for fil in filelist if re.search(re_snp,
                                                             fil)][0]
                _df = pd.read_table(snp_file, sep="\t", header=0,
                                    index_col=None)

                # the first six columns are kept as they are; the remaining
                # SNP columns keep only the text before the first "_"
                cols = _df.columns[6:]
                real_cols = list(_df.columns[:6])
                snp_cols = [sc.split("_")[:-1][0] for sc in cols]

                # list methods work in place, don't assign as a new variable
                real_cols.extend(snp_cols)
                _df.columns = real_cols
                df = pd.merge(left=df, right=_df,
                              on=["FID", "IID"],
                              how='inner')
                try:
                    filelist.remove(snp_file)
                except ValueError:
                    # already absent from the list, nothing to remove
                    pass

            for fle in filelist:
                _df = pd.read_table(fle, sep="\t", header=0,
                                    index_col=None)
                df = pd.merge(left=df, right=_df,
                              on=["FID", "IID"],
                              how='inner')
            # python outputs NA as blank when writing to stdout,
            # plink expects values, use string NAs
            df = df.fillna("NA")
            df.index = df["FID"]
            df.drop(["FID"], inplace=True, axis=1)

            df.to_csv(options.stdout,
                      index_label="FID", sep="\t")
        else:
            E.warn("only a single covariates file provided."
                   "No merging possible, exiting")

    elif options.task == "subset_phenotypes":
        fam_df = pd.read_table(options.fam_file, sep=None,
                               index_col=None, header=None)
        fam_df.columns = ["FID", "IID", "PAT", "MAT", "SEX",
                          "PHENO"]

        pheno_df = pd.read_table(infile, sep=None,
                                 index_col=0, header=0)
        fam_ids = fam_df["FID"]
        sub_pheno = pheno_df.loc[fam_ids]

        sub_pheno.to_csv(options.stdout,
                         index_label="FID", sep="\t")

    # write footer and output benchmark information.
    E.Stop()
Beispiel #45
0
# Evaluate `bs_imp$pop.freq` in the R session and keep a handle to the result.
pop_freq = r('bs_imp$pop.freq')

# Names of the elements of `pop_freq`, converted to a Python object.
pop_freq_names = pandas2ri.ri2py(pop_freq.names)


# %%
def compute_He(elem):
    """Return 2 multiplied by every value in *elem* (2 for an empty iterable)."""
    scaled_product = 2
    for frequency in elem:
        scaled_product *= frequency
    return scaled_product

# Maps each name in pop_freq_names to a {column label: compute_He value} dict.
He_dict = {}

for i, name in enumerate(pop_freq_names):
    # Element `name` of the R list, converted to a pandas DataFrame.
    af = pandas2ri.ri2py_dataframe(pop_freq.rx2(name))
    # Shift the column labels up by one.
    af.columns = [x+1 for x in af.columns]
    # compute_He is applied to each column of the frame.
    He_dict[name] = af.apply(compute_He).to_dict()
    
    # Progress report every 10000 entries.
    if i % 10000 == 0:
        print("at %d" % i)

# %%
# One row per name, one column per (shifted) column label.
He = pd.DataFrame(He_dict).T

# %%
# Translate the column labels through popid_map (defined elsewhere).
He.columns = [popid_map[x] for x in He.columns]

# %%
# Join with Ho (defined elsewhere); overlapping columns get _Ho/_He suffixes.
Ho_He = Ho.join(He, lsuffix = "_Ho", rsuffix = "_He")
def testColoc(trait1, trait2, trait1_type, trait2_type,
              maf_table, gene_list=None,
              trait1_prev=None, trait2_prev=None,
              chromosome=None, start=None, end=None):
    '''
    Perform colocalization testing between two traits.

    R output is diverted to the file ``sink.text`` while the analysis
    runs and is restored afterwards, also when the analysis fails.

    Arguments
    ------
    trait1: pandas.core.dataframe
      A data frame containing the summary statistics for
      trait 1

    trait2: pandas.core.dataframe
      A data frame containing the summary statistics for
      trait 2

    trait1_type: string
      Either `cc` or `quant`, denoting the type of trait 1

    trait2_type: string
      Either `cc` or `quant`, denoting the type of trait 2

    maf_table: pandas.core.dataframe
      Data frame containing SNP IDs and MAF

    gene_list: list
      A list of genes to restrict analysis to.  Either trait 1
      or trait 2 must be a quantitative trait

    trait1_prev: float
      Prevalence of trait1 if binary

    trait2_prev: float
      Prevalence of trait2 if binary

    chromosome: int
      Chromosome to restrict the colocalisation analysis to
      (not used by this function body)

    start: int
      start co-ordinate to restrict analysis to.  Must also
      provide `chromosome`. 1-based index, closed [start, end]
      (not used by this function body)

    end: int
      end co-ordinate to restrict analysis to.  Must also
      provide `chromosome` and `start`. 1-based index, closed
      [start, end] (not used by this function body)

    Returns
    -------
    coloc_results: pandas.core.dataframe
      A data frame containing each region (e.g. genes) and
      the posterior probability in favour of each hypothesis:
      H0 - no association with trait1 or trait2, and no colocalisation
      H1 - association with trait 1, but no colocalisation
      H2 - association with trait2, but no colocalisation
      H3 - association with trait1 and 2, but no colocalisation
      H4 - association with trait1 and 2, and colocalised
    '''

    # divert R output to a file for the duration of the analysis
    R('''sink(file="sink.text")''')
    try:
        # push all elements into the R environment
        R('''suppressPackageStartupMessages(library(coloc))''')
        R('''source("/ifs/devel/projects/proj045/gwas_pipeline/R_scripts/coloQtl.R")''')

        E.info("Pushing results tables into R environment")
        py2ri.activate()
        r_trait1 = py2ri.py2ri_pandasdataframe(trait1)
        R.assign("r.trait1", r_trait1)

        r_trait2 = py2ri.py2ri_pandasdataframe(trait2)
        R.assign("r.trait2", r_trait2)

        r_maf = py2ri.py2ri_pandasdataframe(maf_table)
        R.assign("r.mafs", r_maf)

        if trait1_prev:
            R.assign("trait1.prev", trait1_prev)
        else:
            R('''trait1.prev <- NULL''')

        if trait2_prev:
            R.assign("trait2.prev", trait2_prev)
        else:
            R('''trait2.prev <- NULL''')

        # values substituted into the R call templates below
        trait_types = {"trait1_type": trait1_type,
                       "trait2_type": trait2_type}

        E.info("Checking for gene list")
        if gene_list:
            E.info("Gene list contains {} genes".format(len(set(gene_list))))
            r_genes = ro.StrVector([rx for rx in set(gene_list)])
            R.assign("gene.list", r_genes)

            E.info("Iterating over gene list")
            R('''res.df <- geneListSnpColocQtl(gene_list=gene.list,'''
              '''results_table=r.trait1, MAF_table=r.mafs, '''
              '''eqtl_table=r.trait2, trait_type="%(trait1_type)s", '''
              '''prev=trait1.prev)''' % trait_types)

            R('''genes <- rownames(res.df)''')
            genes = [gx for gx in R["genes"]]

        else:
            # The template must be formatted before it is sent to R,
            # otherwise the literal "%(trait1_type)s" is passed as the type.
            R('''res.df <- TwoTraitSnpColocQtl(trait1_table=r.trait1,'''
              '''trait2_table=r.trait2, MAF_table=r.mafs, '''
              '''trait1_type="%(trait1_type)s", trait2_type="%(trait2_type)s",'''
              '''prev1=trait1.prev, prev2=trait2.prev)''' % trait_types)

            # NOTE(review): this is the row count of res.df, not row names;
            # as an index it only fits a single-row result -- confirm.
            R('''genes <- dim(res.df)[1]''')
            genes = R["genes"]

        coloc_results = py2ri.ri2py_dataframe(R["res.df"])
        coloc_results.index = genes

        coloc_results.columns = ["nSNPs", "H0.PP", "H1.PP", "H2.PP", "H3.PP", "H4.PP"]
    finally:
        # always restore R output, even if the analysis fails
        R('''sink(file=NULL)''')

    return coloc_results
def pythonWrapper4Pet(dataframe, snps, covars,
                      trait1, trait2, model1,
                      model2, resamples=999):
    '''
    Python wrapper around the R code for the PET calculations.

    The data frame, SNP list and comma-separated covariate names are
    pushed into the R environment, values of -9 are recoded to NA and
    ``loopPET`` is run over all SNPs.  Returns a pandas data frame with
    the columns "PCC" and "pvalue".

    model1/model2 select the regression family per trait: "logistic"
    or "linear"; any other value leaves the R-side settings untouched.
    '''
    py2ri.activate()

    E.info("Checking regression models")
    # R statements configuring family and link for each (trait, model) pair
    r_model_setup = {
        ("trait1", "logistic"): ('''trait1.mod <- binomial''',
                                 '''trait1.link <- "logit" '''),
        ("trait1", "linear"): ('''trait1.mod <- gaussian''',
                               '''trait1.link <- "identity" '''),
        ("trait2", "logistic"): ('''trait2.mod <- binomial''',
                                 '''trait2.link <- "logit" '''),
        ("trait2", "linear"): ('''trait2.mod <- gaussian''',
                               '''trait2.link <- "identity" '''),
    }
    for setup_key in (("trait1", model1), ("trait2", model2)):
        for r_statement in r_model_setup.get(setup_key, ()):
            R(r_statement)

    E.info("Running {} regression for trait 1: {}".format(model1,
                                                          trait1))
    E.info("Running {} regression for trait 2: {}".format(model2,
                                                          trait2))

    R('''source("/ifs/devel/projects/proj045/gwas_pipeline/R_scripts/PET_functions.R")''')
    E.info("Pushing data objects into the R environment")
    # push everything into the R environment
    R.assign("data.df", py2ri.py2ri_pandasdataframe(dataframe))
    R.assign("snp.list", ro.StrVector(list(snps)))

    E.info("Parsing covariates")
    covariates = covars.split(",")
    R.assign("covar.list", ro.StrVector(list(covariates)))
    E.info("{} covariates found to adjust "
           "in regression  models".format(len(covariates)))

    # clean up, replacing "missing values" with NAs for R
    R('''data.df[data.df == -9] <- NA''')
    R('''pet_results <- list()''')

    # loop over all SNP, calculate PCC and p-value
    # this takes a long time <- need to think of speed ups
    # possible Python-pure implementation, i.e. with LIMIX?
    E.info("Iteratively calculating PCC for all SNPs")
    call_values = {"trait1": trait1, "trait2": trait2,
                   "resamples": resamples}
    R('''results <- loopPET(data.df=data.df, trait1="%(trait1)s", trait2="%(trait2)s", '''
      '''trait1.link=trait1.link, trait2.link=trait2.link, '''
      '''trait1.mod=trait1.mod, trait2.mod=trait2.mod, covars=covar.list,'''
      '''resamples=%(resamples)i, snp.list=snp.list)''' % call_values)

    R('''out.res <- data.frame(do.call(rbind, results))''')
    R('''colnames(out.res) <- c("PCC", "pvalue")''')
    return py2ri.ri2py_dataframe(R["out.res"])
def applymem(df, discarded_seasons=None, wdw_method=2, lower_bound=5.0):
    """Run R's ``memmodel`` on the season columns of *df*.

    :param df: pandas DataFrame with the columns 'UF' and 'epiweek' plus one
        column per season.  After sorting the season labels, the last one is
        left out of the model.
    :param discarded_seasons: optional iterable of season labels to exclude;
        None (the default) excludes nothing.
    :param wdw_method: value passed to memmodel as ``i.method``.
    :param lower_bound: minimum value used for the pre-epidemic threshold.
    :return: tuple ``(results, dropseasons)``.  ``results`` maps each name of
        the memmodel output to a Python object (string for 'call', converted
        vector or DataFrame otherwise); ``dropseasons`` is the set of seasons
        whose maximum is below the threshold.

    The model is fitted once on all seasons.  If some, but not all, seasons
    stay below the threshold, it is fitted again without them; the
    'typ.real.curve' entry always comes from the first fit.
    """
    if discarded_seasons is None:
        # set.difference(None) raises TypeError; treat None as "discard nothing"
        discarded_seasons = []

    rdf = pandas2ri.py2ri(df)
    seasons = sorted(list(df.columns.drop(['UF', 'epiweek'])))[:-1]
    # Discard 2009 season if present:
    seasons = sorted(set(seasons).difference(discarded_seasons))
    rseasons = ro.StrVector(seasons)

    ro.globalenv['df'] = rdf
    ro.globalenv['seasons'] = rseasons
    ro.globalenv['par.method'] = wdw_method
    ro.globalenv['par.type.curve'] = 2
    ro.globalenv['par.n.max'] = 20
    # Confidence levels for the modelled curve and the pre/post threshold
    ro.globalenv['par.level.curve'] = 0.95
    ro.globalenv['par.level.threshold'] = 0.95
    ro.globalenv['par.type.intensity'] = 6
    # Quantiles for intensity thresholds
    ro.globalenv['par.level.intensity'] = ro.FloatVector([0.40, 0.90, 0.975])
    epimemrslt = ro.r('memmodel(i.data=subset(df, select=seasons), i.type.curve=par.type.curve, i.method=par.method,' +
                      'i.n.max=par.n.max, i.level.curve=par.level.curve, i.level.threshold=par.level.threshold,' +
                      'i.type.intensity=par.type.intensity, i.level.intensity=par.level.intensity)')

    # Pre-epidemic threshold:
    epithreshold = max(lower_bound, pandas2ri.ri2py_dataframe(epimemrslt.rx2('pre.post.intervals')).loc[0, 2])
    typrealcurve = pandas2ri.ri2py_dataframe(epimemrslt.rx2('typ.real.curve'))

    # Check for seasons below threshold:
    dropseasons = set()
    for s in seasons:
        if df[s].max() < epithreshold:
            dropseasons.add(s)
    # Drop seasons below threshold and rerun algorithm:
    episeasons = list(seasons)
    if 0 < len(dropseasons) < len(seasons):
        episeasons = sorted(list(set(seasons).difference(dropseasons)))
        ro.globalenv['episeasons'] = ro.StrVector(episeasons)

        epimemrslt = ro.r('memmodel(i.data=subset(df, select=episeasons), i.type.curve=par.type.curve,' +
                          'i.method=par.method,' +
                          'i.n.max=par.n.max, i.level.curve=par.level.curve, i.level.threshold=par.level.threshold,' +
                          'i.type.intensity=par.type.intensity, i.level.intensity=par.level.intensity)')

    # Store results in python dictionary of objects
    pyepimemrslt = {}
    rovector = [ro.vectors.StrVector, ro.vectors.IntVector, ro.vectors.FloatVector, ro.vectors.Vector]
    for name in epimemrslt.names:
        rdata = epimemrslt.rx2(name)
        if name == 'call':
            pyepimemrslt.update({name: str(rdata)})
        elif type(rdata) in rovector:
            pyepimemrslt.update({name: pandas2ri.ri2py_vector(rdata)})
        else:
            pyepimemrslt.update({name: pandas2ri.ri2py_dataframe(rdata)})

    # typ.curve is the typical curve obtained from averaging over epidemic seasons with time rescaled
    # so that the start of the epidemic period coincides with mean.start
    # NOTE(review): here (and for typ.threshold.curve) a missing 'baixo'
    # becomes 0 because NaN >= 0 is False, whereas typ.real.curve below
    # fills a missing 'baixo' with 'mediano' -- confirm this is intended.
    typcurve = pyepimemrslt['typ.curve']
    typcurve.rename(columns={0: 'baixo', 1: 'mediano', 2: 'alto'}, inplace=True)
    typcurve['mediano'] = typcurve['mediano'].fillna(0)
    typcurve['baixo'] = typcurve['baixo'].where(typcurve['baixo'] >= 0, other=0)
    typcurve['baixo'] = typcurve['baixo'].where(typcurve['baixo'].notnull(), other=typcurve['mediano'])
    typcurve['alto'] = typcurve['alto'].where(typcurve['alto'].notnull(), other=typcurve['mediano'])

    thresholdcurve = pyepimemrslt['typ.threshold.curve']
    thresholdcurve.rename(columns={0: 'baixo', 1: 'mediano', 2: 'alto'}, inplace=True)
    thresholdcurve['mediano'] = thresholdcurve['mediano'].fillna(0)
    thresholdcurve['baixo'] = thresholdcurve['baixo'].where(thresholdcurve['baixo'] >= 0, other=0)
    thresholdcurve['baixo'] = thresholdcurve['baixo'].where(thresholdcurve['baixo'].notnull(),
                                                            other=thresholdcurve['mediano'])
    thresholdcurve['alto'] = thresholdcurve['alto'].where(thresholdcurve['alto'].notnull(),
                                                          other=thresholdcurve['mediano'])

    pyepimemrslt['pre.post.intervals'].rename(index={0: 'pre', 1: 'post'}, inplace=True)

    # typ.real.curve is the typical curve without time shift, that is, respecting the original weeks from data
    # this curve is better to keep all seasons, not only the epidemic ones.
    realcurve = typrealcurve.copy()
    pyepimemrslt['typ.real.curve'] = realcurve
    realcurve.rename(columns={0: 'baixo', 1: 'mediano', 2: 'alto'}, inplace=True)
    realcurve['mediano'] = realcurve['mediano'].fillna(0)
    realcurve.loc[realcurve['baixo'] < 0, 'baixo'] = 0
    realcurve['baixo'] = realcurve['baixo'].where(realcurve['baixo'].notnull(), other=realcurve['mediano'])
    realcurve['alto'] = realcurve['alto'].where(realcurve['alto'].notnull(), other=realcurve['mediano'])

    # Label the moving-epidemics columns with the seasons used in the fit.
    newcols = {}
    for k, v in enumerate(episeasons):
        newcols[k] = str(v) + ' transladado'
    pyepimemrslt['moving.epidemics'].rename(columns=newcols, inplace=True)

    return pyepimemrslt, dropseasons
# This data frame contains the following columns:
# 
# - type: Tumor DNA profile (1=Aneuploid Tumor, 2=Diploid Tumor) 
# - time: Time to death or on-study time, weeks
# - delta: Death indicator (0=alive, 1=dead)

# In[3]:

# Load in data
get_ipython().magic(u'R data(tongue)')
# Pull data into python kernel
get_ipython().magic(u'Rpull tongue')
# Convert into pandas dataframe
from rpy2.robjects import pandas2ri

tongue = pandas2ri.ri2py_dataframe(tongue)


# We can now refer to `tongue` using both R and python.

# In[4]:

get_ipython().run_cell_magic(u'R', u'', u'summary(tongue)')


# In[5]:

tongue.describe()


# We can even operate on R and Python within the same code cell.