Example #1
 def __init__(self,
              count_matrix,
              design_matrix,
              design_formula,
              gene_column='gene_id'):
     print("you need to have R installed with the DESeq2 library installed")
     try:
         assert gene_column == count_matrix.columns[0], \
             f'"{gene_column}" must be the name of the first column'
         gene_id = count_matrix[gene_column]
     except AttributeError:
         sys.exit('Wrong Pandas dataframe?')
     print(rpy2.__version__)
     self.deseq_result = None
     self.resLFC = None
     self.comparison = None
     self.normalized_count_matrix = None
     self.gene_column = gene_column
     self.gene_id = count_matrix[self.gene_column]
     with localconverter(ro.default_converter + pandas2ri.converter):
         self.count_matrix = pandas2ri.py2rpy(
             count_matrix.drop(gene_column, axis=1).astype(int))
         self.design_matrix = pandas2ri.py2rpy(design_matrix.astype(bool))
     self.design_formula = Formula(design_formula)
     self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix,
                                             colData=self.design_matrix,
                                             design=self.design_formula)
Example #2
def py2rpy_anndata(obj: AnnData) -> RS4:
    with localconverter(default_converter):
        s4v = importr("S4Vectors")
        sce = importr("SingleCellExperiment")
        # TODO: sparse
        x = {} if obj.X is None else dict(X=mat_converter.py2rpy(obj.X.T))
        layers = {k: mat_converter.py2rpy(v.T) for k, v in obj.layers.items()}
        assays = ListVector({**x, **layers})

        row_args = {k: pandas2ri.py2rpy(v) for k, v in obj.var.items()}
        if check_no_dupes(obj.var_names, "var_names"):
            row_args["row.names"] = pandas2ri.py2rpy(obj.var_names)
        row_data = s4v.DataFrame(**row_args)

        col_args = {k: pandas2ri.py2rpy(v) for k, v in obj.obs.items()}
        if check_no_dupes(obj.obs_names, "obs_names"):
            col_args["row.names"] = pandas2ri.py2rpy(obj.obs_names)
        col_data = s4v.DataFrame(**col_args)

        # Convert everything we know
        with localconverter(full_converter() + dict_converter):
            metadata = ListVector(obj.uns.items())

        rd_args = {conv_name.scanpy2sce(k): mat_converter.py2rpy(obj.obsm[k]) for k in obj.obsm.keys()}
        reduced_dims = s4v.SimpleList(**rd_args)

        return sce.SingleCellExperiment(
            assays=assays, rowData=row_data, colData=col_data, metadata=metadata, reducedDims=reduced_dims
        )
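A hypothetical usage sketch for the converter above; it assumes the module-level helpers it references (mat_converter, full_converter, dict_converter, conv_name, check_no_dupes) are defined in the same module, as in anndata2ri:

import anndata as ad
import numpy as np

# illustrative AnnData with a small dense counts matrix (obs x var)
adata = ad.AnnData(X=np.random.poisson(1.0, size=(50, 20)).astype(float))
sce = py2rpy_anndata(adata)  # returns an R SingleCellExperiment (RS4)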
Example #3
 def __init__(self,
              count_matrix,
              design_matrix,
              conditions,
              gene_column='id'):
     self.dds = None
     self.deseq_result = None
     self.resLFC = None
     self.comparison = None
     self.normalized_count_matrix = None
     self.gene_column = gene_column
     self.gene_id = count_matrix[self.gene_column]
     self.count_matrix = pandas2ri.py2rpy(
         count_matrix.drop(gene_column, axis=1))
     design_formula = "~ "
     for col in conditions:
         levels = design_matrix[col].unique()
         levels = robjects._convert_rpy2py_strvector(levels)
         as_factor = r["as.factor"]
         design_matrix[col] = FactorVector(design_matrix[col],
                                           levels=levels)
         design_matrix[col] = as_factor(design_matrix[col])
         design_formula = design_formula + col + " +"
     design_formula = design_formula[:-2]
     self.design_matrix = pandas2ri.py2rpy(design_matrix)
     self.design_formula = Formula(design_formula)
Example #4
    def _infer_network(self, data):
        """
        Infer the network.

        Args:
            data (pd.DataFrame): data to be used for the inference.
        """
        # activate implicit conversion from pandas to R objects
        pandas2ri.activate()
        genie3 = importr('GENIE3')
        importr('foreach')
        importr('doParallel')
        # transform the pandas DataFrame into GENIE3 input format:
        # convert it to an R data.frame, then to a matrix with `as.matrix`
        # so that colnames and rownames are preserved
        expr_matrix = as_matrix(pandas2ri.py2rpy(data.T))
        # run GENIE3
        values = genie3.GENIE3(
                expr_matrix, self.regulators, self.targets, self.tree_method,
                self.k, self.n_trees, self.n_cores, self.verbose
            )
        weight_matrix = pd.DataFrame(
            values, columns=data.columns, index=data.columns
        )
        self.graph = Graph(adjacency=weight_matrix)
        logger.debug('inferred with {}'.format(self.method))
Example #5
 def fit(self, dfx: pd.DataFrame, outcome_col, covariate_cols,
         teacher_id_col, **argv):
     covariate_cols_except_fixed = [
         x for x in covariate_cols if x not in self.fixed_effect_cols
     ]
     fixed_effect_cols_plus_tid = [teacher_id_col] + self.fixed_effect_cols
     dropna_subset_cols = ([outcome_col] + covariate_cols +
                           fixed_effect_cols_plus_tid)
     formula = create_felm_formula(outcome_col, covariate_cols_except_fixed,
                                   fixed_effect_cols_plus_tid,
                                   self.factor_cols)
     pandas2ri.activate()
     df_use = dfx.dropna(subset=dropna_subset_cols)
     _res1 = self.r.assign("r_df", pandas2ri.py2rpy(df_use))
     _res2 = self.r(
         "res <- lfe::felm({formula}, r_df)".format(formula=formula))
     bb = self.r("lfe::getfe(res)")
     self.effect = bb
     self.residuals_without_fixed = pd.Series(index=dfx.index, dtype=float)
     self.residuals_without_fixed.loc[df_use.index] = self.r(
         "res$r.residuals")[:, 0]
     self.residuals_with_fixed = pd.Series(index=dfx.index, dtype=float)
     self.residuals_with_fixed.loc[df_use.index] = self.r(
         "res$residuals")[:, 0]
     pandas2ri.deactivate()
Example #6
def fitdist(data: pd.Series, **kwargs):
    """fitdist
    See:
        https://cran.r-project.org/web/packages/fitdistrplus/fitdistrplus.pdf
    """
    rdf = pandas2ri.py2rpy(data)
    return fitdistrplus.fitdist(rdf, **kwargs)
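A minimal usage sketch for the wrapper above, assuming fitdistrplus was imported at module level via importr("fitdistrplus") as the snippet implies; the sample data is illustrative:

import numpy as np
import pandas as pd

samples = pd.Series(np.random.default_rng(0).normal(loc=5.0, scale=2.0, size=200))
fit = fitdist(samples, distr="norm")  # kwargs are forwarded to fitdistrplus::fitdist
print(fit.rx2("estimate"))            # fitted mean and sd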
Example #7
    def __init__(self, count_matrix, design_matrix, design_formula):

        self.dds = None
        self.deseq_result = None
        self.resLFC = None
        self.comparison = None
        self.normalized_count_df = None
        # self.gene_column = self.count_matrix.index
        self.gene_id = count_matrix.index
        self.samplenames = count_matrix.columns
        self.count_matrix = pandas2ri.py2rpy(count_matrix)
        self.design_matrix = pandas2ri.py2rpy(design_matrix)
        self.design_formula = Formula(design_formula)
        self.dds = deseq.DESeqDataSetFromMatrix(countData=self.count_matrix,
                                                colData=self.design_matrix,
                                                design=self.design_formula)
Example #8
def simpleNetworkx(G):

    ro.r('src = c()')
    ro.r('target =c()')
    ro.r('rdf=data.frame()')

    df = p.DataFrame(data=G.edges())

    df_r = pandas2ri.py2rpy(df)

    ro.globalenv['src'] = df_r[0]
    ro.globalenv['target'] = df_r[1]

    ro.r('rdf=data.frame(src,target)')

    utils = importr('utils')
    utils.chooseCRANmirror(ind=1)

    try:
        networkD3 = importr('networkD3')
    except:
        utils.install_packages('networkD3')
        networkD3 = importr('networkD3')

    try:
        magrittr = importr('magrittr')
    except:
        utils.install_packages('magrittr')
        magrittr = importr('magrittr')

    ro.r('''simpleNetwork(rdf) %>% saveNetwork(file = 'Net.html')''')
    return None
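A hypothetical call for the helper above; any NetworkX graph works, the karate-club graph is just an example:

import networkx as nx

G = nx.karate_club_graph()
simpleNetworkx(G)  # renders an interactive force-directed graph and saves it to Net.html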
Example #9
    def fit(
        self,
        x: Optional[np.ndarray] = None,
        y: Optional[np.ndarray] = None,
        w: Optional[np.ndarray] = None,
        **kwargs,
    ) -> "GamMGCVModel":
        """
        Fit the model.

        Parameters
        ----------
        x
            Independent variables.
        y
            Dependent variables.
        w
            Weights of :paramref:`x`.
        kwargs
            Keyword arguments.

        Returns
        -------
        :class:`cellrank.ul.models.GamMGCVModel`
            Return fitted self.
        """

        from rpy2 import robjects
        from rpy2.robjects import pandas2ri, Formula
        from rpy2.robjects.packages import importr

        super().fit(x, y, w, **kwargs)

        use_ixs = np.where(self.w > 0)[0]
        self._x = self.x[use_ixs]
        self._y = self.y[use_ixs]
        self._w = self.w[use_ixs]

        n_splines = kwargs.pop("n_splines", self._n_splines)

        mgcv = importr("mgcv")
        pandas2ri.activate()

        df = pandas2ri.py2rpy(
            pd.DataFrame(np.c_[self.x, self.y][use_ixs, :], columns=["x",
                                                                     "y"]))
        self._model = mgcv.gam(
            Formula(f'y ~ s(x, k={n_splines}, bs="cr")'),
            data=df,
            sp=self._sp,
            family=robjects.r.gaussian,
            weights=pd.Series(self.w[use_ixs]),
        )

        pandas2ri.deactivate()

        return self
Example #10
def generate_args(n_args=256, max_rows=100, lang="py"):
    """This will create multiple dataframes (n_args) based on a template (TEMPLATE_PATH)"""
    args = []
    df_template = pd.read_csv(TEMPLATE_PATH)
    for n in range(n_args):
        new_df = construct_df(df_template, max_rows)
        if lang == "r":
            new_df = pandas2ri.py2rpy(new_df)
        args.append(new_df)
    return args
Example #11
def _gam_fit_predict(x, y, weights=None, pred_x=None):

    import rpy2.robjects as robjects
    from rpy2.robjects import pandas2ri, Formula
    from rpy2.robjects.packages import importr

    pandas2ri.activate()

    # Weights
    if weights is None:
        weights = np.repeat(1.0, len(x))

    # Construct dataframe
    use_inds = np.where(weights > 0)[0]
    r_df = pandas2ri.py2rpy(
        pd.DataFrame(np.array([x, y]).T[use_inds, :], columns=["x", "y"]))

    # Fit the model
    rgam = importr("gam")
    model = rgam.gam(Formula("y~s(x)"),
                     data=r_df,
                     weights=pd.Series(weights[use_inds]))

    # Predictions
    if pred_x is None:
        pred_x = x
    y_pred = np.array(
        robjects.r.predict(model,
                           newdata=pandas2ri.py2rpy(
                               pd.DataFrame(pred_x, columns=["x"]))))

    # Standard deviations
    p = np.array(
        robjects.r.predict(model,
                           newdata=pandas2ri.py2rpy(
                               pd.DataFrame(x[use_inds], columns=["x"]))))
    n = len(use_inds)
    sigma = np.sqrt(((y[use_inds] - p)**2).sum() / (n - 2))
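    # Standard error of a prediction at pred_x, in the style of a simple
    # linear-regression prediction interval:
    #   sigma * sqrt(1 + 1/n + (x0 - mean(x))^2 / sum((x - mean(x))^2)),
    # halved here (a scaling choice made by this snippet, not a standard factor).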
    stds = (np.sqrt(1 + 1 / n + (pred_x - np.mean(x))**2 /
                    ((x - np.mean(x))**2).sum()) * sigma / 2)

    return y_pred, stds
Example #12
def art_2by2(df: pd.DataFrame, feature: str, group: str):
    feature_wide = df[["Unnamed: 0", feature, "all", "group"]]
    feature_long = feature_wide.melt(id_vars=["Unnamed: 0", "group"], value_vars=[feature, "all"])
    feature_long = feature_long.query(f"group in ('wt', '{group}')")
    feature_long.loc[feature_long["value"] < 0, "value"] = 0
    feature_long["group"] = feature_long["group"].astype("category")
    feature_long["variable"] = feature_long["variable"].astype("category")
    r_df = pandas2ri.py2rpy(feature_long)
    model = artool.art(robj.Formula("value ~ group * variable"), data=r_df)
    anova = robj.r["anova"]
    return anova(model)
Example #13
def query_log_source(source, time_filter, time_column):
    cutoff = f"DATEADD(day, -{time_filter}, CURRENT_TIMESTAMP())"
    query = f"SELECT * FROM {source} WHERE {time_column} > {cutoff};"
    try:
        data = list(db.fetch(query))
    except Exception as e:
        log.error("Failed to query log source: %s", e)
        raise
    f = pack(data)
    frame = pandas.DataFrame(f)
    pandas2ri.activate()
    r_dataframe = pandas2ri.py2rpy(frame)
    return r_dataframe
Example #14
def DESeq2(count_matrix, design_matrix, normalize, cores=1):
    # gene_column = ''
    to_dataframe = ro.r('function(x) data.frame(x)')
    count_matrix = round(count_matrix)
    count_matrix = pandas2ri.py2rpy(count_matrix)
    design_matrix = pandas2ri.py2rpy(design_matrix)
    design_formula = Formula(' ~ 1')

    dds0 = deseq.DESeqDataSetFromMatrix(countData=count_matrix,
                                        colData=design_matrix,
                                        design=design_formula)
    dds0 = BiocGenerics.estimateSizeFactors(dds0, type="poscounts")
    order_size_factor = list(dds0.do_slot('colData').do_slot('rownames'))
    if normalize is not None:
        logging.info("Enforcing custom normalisation in DESeq2")
        dds0.do_slot('colData').do_slot(
            'listData')[1] = ro.vectors.FloatVector(
                list(normalize.loc[
                    order_size_factor,
                    'libsize_75percent']))  # Enforce size factors
    else:
        logging.info("WARNING: default size factor of DESeq2 are used")
    dds = deseq.DESeq(
        dds0,
        parallel=True,
        BPPARAM=BiocParallel.MulticoreParam(cores),
        sfType="poscounts",    # 1. size factor estimation: estimateSizeFactors with "poscounts"
        fitType="parametric",  # 2. dispersion estimation: estimateDispersions with "parametric"
    )

    deseq_result = deseq.results(dds)
    fit_res = to_dataframe(deseq_result)
    disp = to_dataframe(deseq.dispersions(dds)).rename({'x': 'dispersion'},
                                                       axis=1)
    disp.index = fit_res.index
    fit_res = pd.concat([fit_res['baseMean'], disp], axis=1)
    return fit_res
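A hypothetical call for orientation only (not guaranteed to run as-is): the frames below and the libsize_75percent column referenced above are illustrative, and the function relies on module-level imports (deseq, BiocGenerics, BiocParallel, pandas2ri, ro) that the snippet does not show:

counts = pd.DataFrame({"s1": [10, 0, 5], "s2": [12, 1, 7], "s3": [9, 2, 4], "s4": [11, 0, 6]},
                      index=["gene_a", "gene_b", "gene_c"])
design = pd.DataFrame({"condition": ["A", "A", "B", "B"]}, index=["s1", "s2", "s3", "s4"])
fit = DESeq2(counts, design, normalize=None, cores=2)  # baseMean and per-gene dispersion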
Example #15
def main():
    import pandas as pd
    from rpy2.robjects import pandas2ri
    import rpy2.robjects as ro
    from teacher_va.estimate import TeacherValueAddedEstimator, StudentDataFrame

    def give_group_name(dfx, keys, group_name_col='name'):
        aa = dfx[keys].drop_duplicates().dropna()
        aa[group_name_col] = 1
        aa[group_name_col] = aa[group_name_col].cumsum()
        return (dfx.merge(aa, on=keys, how='left'))

    pd.set_option("display.max_columns", 101)
    df = (
        pd.read_csv('data/math_teacher.csv').pipe(
            give_group_name,
            keys=[
                'year_prime', 'school_id_prime', 'grade_prime', 'class_prime'
            ],
            group_name_col='name')[[
                'mst_id', 'name', 'math_level_prime', 'math_level',
                'teacher_id', 'year_prime'
            ]].dropna(subset=['math_level_prime', 'math_level', 'year_prime'])
        # .pipe(lambda dfx: pd.get_dummies(dfx, columns=['mst_id'], sparse=True, prefix='mstid'))
    )

    # Careful: calling something that returns an un-printable R object will fail
    pandas2ri.activate()
    r_df = ro.r.assign("r_df", pandas2ri.py2rpy(df))
    aa = ro.r(
        "res <- lfe::felm(math_level ~ math_level_prime | as.factor(mst_id) |0  |0, r_df)"
    )
    bb = ro.r("res$residuals")

    sdf = (
        StudentDataFrame.get_student_dataframe(
            data=df,
            covariate_cols=['math_level_prime'],
            outcome_col='math_level',
            class_name_col='name',  # 1 teacher: 1 class
            time_col='year_prime',
            teacher_id_col='teacher_id',
        ))
    sdf.fillna_teacher_id_from_class_cols()
    tvtva = TeacherValueAddedEstimator(effect_type='time_fixed')
    tvtva.fit(sdf=sdf, is_custom_predict=True, custom_resid=bb)
    teacher_effect = tvtva.teacher_effect
    tvtva = TeacherValueAddedEstimator(effect_type='time_varing')
    tvtva.fit(sdf=sdf, is_custom_predict=True, custom_resid=bb)
    teacher_effect2 = tvtva.teacher_effect
    """
Example #16
def to_trajr(trj):
    """Convert trajectory to R `trajr` object. Default fps is 30.

    Args:
        trj (:class:`~traja.TrajaDataFrame`): trajectory


    Returns:
        traj (:class:`rpy2.robjects.vectors.DataFrame`): column names are
            ['x', 'y', 'id', 'time', 'displacementTime', 'polar', 'displacement']

    .. doctest::

        >>> import traja; from traja import rutils
        >>> df = traja.TrajaDataFrame({'x':range(5),'y':range(5)})
        >>> trjr_df = rutils.to_trajr(df) # doctest: +SKIP
        >>> [x for x in trjr_df.names] # doctest: +SKIP
        ...
        ['x', 'y', 'id', 'time', 'displacementTime', 'polar', 'displacement']


    """
    from traja.trajectory import _get_time_col

    trajr = import_trajr()
    if "id" not in trj.__dict__.keys():
        trj["id"] = 0
    time_col = _get_time_col(trj)
    if time_col == "index":
        trj["time"] = trj.index
        time_col = "time"
    fps = trj.fps
    spatial_units = trj.spatial_units or "m"
    time_units = trj.time_units or "s"

    trj_rdf = rpandas.py2rpy(trj)

    trajr_trj = trajr.TrajFromCoords(
        trj_rdf,
        xCol="x",
        yCol="y",
        timeCol=time_col or rpy2.rinterface.NULL,
        fps=fps or 30,
        spatialUnits=spatial_units,
        timeUnits=time_units,
    )

    return trajr_trj
Example #17
def run(execute, globalenv=None, **kwargs):

    ## search inside analysis folder
    home = os.path.realpath(__file__)

    ## check if the command is a python file

    f = os.path.dirname(home) + '/' + execute + '.py'

    if os.path.isfile(f):
        execute = f

    if os.path.isfile(execute) and execute.endswith('.py'):
        module_spec = importlib.util.spec_from_file_location(
            'plugin_module', execute)
        module = importlib.util.module_from_spec(module_spec)
        module_spec.loader.exec_module(module)
        return module.main(**kwargs)

    ## assume script is R

    if globalenv:
        rpy2.robjects.globalenv = globalenv

    for name, value in kwargs.items():

        ## for debug conversion errors
        # print name
        # print type( value )

        if isinstance(value, dict):
            ## use pandas
            value = pandas.DataFrame.from_dict(value)
            rpy2.robjects.globalenv[name] = pandas2ri.py2rpy(value)
        else:
            rpy2.robjects.globalenv[name] = converter.py2rpy(value)

    f = os.path.dirname(home) + '/' + execute + '.r'

    if os.path.isfile(f):
        execute = open(f).read()

    if os.path.isfile(execute):
        execute = open(execute).read()

    robjects.r(execute)

    return robjects.r  ## return all computed things
Example #18
def KM(OS, Censored, as_group, data, ggsave=False, path="./", pvalue=0):
    # split samples into two groups
    surv_data = data[[OS, Censored, as_group]].sort_values(by=[as_group])
    surv_data["group"] = "L"
    surv_data.iloc[surv_data.shape[0] // 2:, surv_data.columns.get_loc("group")] = "H"
    # Kaplan-Meier survival curve
    with localconverter(ro.default_converter + pandas2ri.converter):
        robjects.globalenv["surv_data"] = pandas2ri.py2rpy(surv_data)
    robjects.globalenv["surv_diff"] = r(
        f"survdiff(Surv({OS}, {Censored})~group,surv_data,rho = 0)")
    Pvalue = r("1 - pchisq(surv_diff$chisq, length(surv_diff$n) -1)")[0]
    if ggsave and Pvalue < pvalue:
        r.ggsave(r(
            f"autoplot(survfit(Surv({OS}, {Censored})~group,surv_data), xlab = 'Time', ylab = 'Survival')+ggtitle('Pvalue = {Pvalue}')"
        ),
                 file=f"{path}/{as_group}.pdf")
    return Pvalue
Example #19
def estimation_fixed_effect(outcome_col, time_col, teacher_id_col,
                            class_name_col, covariate_cols, fixed_effect_cols,
                            **argv):
    def create_formula(target, covariate_cols, fixed_effect_cols):
        template = '{target} ~ {covariate_str} | {fixed_str} | 0 | 0 '
        covariate_str = get_add_str_from_str_list(
            covariate_cols) if len(covariate_cols) > 0 else ' 0 '
        fixed_str = get_add_str_from_str_list(
            fixed_effect_cols) if len(fixed_effect_cols) > 0 else ' 0 '
        return template.format(target=target,
                               covariate_str=covariate_str,
                               fixed_str=fixed_str)

    use_cols = [outcome_col, time_col, teacher_id_col, class_name_col
                ] + covariate_cols + fixed_effect_cols
    dropna_subset_cols = [outcome_col, time_col, class_name_col
                          ] + covariate_cols + fixed_effect_cols
    fixed_effect_cols_plus_tid = [teacher_id_col] + fixed_effect_cols
    formula = create_formula(outcome_col, covariate_cols,
                             fixed_effect_cols_plus_tid)
    # start
    pd.set_option("display.max_columns", 101)
    df_res = (
        pd.read_csv('./notebook/toda_teacher/df.csv')
        # estimate for elementary schools only
        .pipe(lambda dfx: dfx.loc[dfx['year_prime'] >= 2015]).pipe(
            lambda dfx: dfx.loc[dfx['school_id_prime'] < 30000])
        [use_cols].dropna(subset=dropna_subset_cols)
        # .pipe(lambda dfx: pd.get_dummies(dfx, columns=['mst_id'], sparse=True, prefix='mstid'))
    )

    pandas2ri.activate()
    _res1 = ro.r.assign("r_df", pandas2ri.py2rpy(df_res))
    _res2 = ro.r("res <- lfe::felm({formula}, r_df)".format(formula=formula))
    bb = ro.r("lfe::getfe(res)")
    pandas2ri.deactivate()

    effect = (bb.reset_index().pipe(lambda dfx: dfx.loc[dfx[
        'fe'] == teacher_id_col, ['index', 'effect']]).assign(
            **{
                teacher_id_col:
                lambda dfx: dfx['index'].str.extract(r'{0}\.(.+)'.format(
                    teacher_id_col)).astype(df_res[teacher_id_col].dtype)
            })[[teacher_id_col, 'effect']].rename(columns={'effect': 'tva'}))
    return effect
Example #20
def list_to_vector(l):

    if isinstance(l, types.GeneratorType):
        l = list(l)
    if isinstance(l, map):
        l = list(l)

    if len(l) == 0:
        return rpy2.rinterface.NA_Real

    if isinstance(l[0], str):
        return rpy2.rinterface.StrSexpVector(l)
    # check bool before int: bool is a subclass of int in Python
    if isinstance(l[0], bool):
        return rpy2.rinterface.BoolSexpVector(l)
    if isinstance(l[0], int):
        return rpy2.rinterface.IntSexpVector(l)
    if isinstance(l[0], float):
        return rpy2.rinterface.FloatSexpVector(l)

    if isinstance(l[0], dict):  ## need to convert to data frame

        ## assume the keys are the same for every dict in the list
        keys = l[0].keys()

        ## init new dict where values are collected
        dataframe = {}

        for key in keys:
            dataframe[key] = []

        for row in l:
            for key in keys:
                if key not in row:
                    value = None
                else:
                    value = row[key]

                dataframe[key].append(value)

        dataframe = pandas.DataFrame.from_dict(dataframe)
        return pandas2ri.py2rpy(dataframe)

    ## default to NA just in case
    return rpy2.rinterface.NA_Real
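A small usage sketch for the dict branch above; keys and values are illustrative:

rows = [{"gene": "TP53", "count": 7}, {"gene": "BRCA1", "count": 3}]
rdf = list_to_vector(rows)  # collected into a pandas DataFrame, then converted to an R data.frame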
Example #21
    def predict(self,
                x_test: Optional[np.ndarray] = None,
                key_added: str = "_x_test",
                **kwargs) -> np.ndarray:
        """
        Run the prediction.

        Parameters
        ----------
        x_test
            Features used for prediction.
        key_added
            Attribute name where to save the independent variables.
            If `None`, don't save them.
        kwargs
            Keyword arguments.

        Returns
        -------
        :class:`numpy.ndarray`
            The predicted values.
        """

        from rpy2 import robjects
        from rpy2.robjects import pandas2ri

        if self.model is None:
            raise RuntimeError(
                "Trying to call an uninitialized model. To initialize it, run `.fit()` first."
            )
        self._check(key_added, x_test)

        pandas2ri.activate()
        self._y_test = (np.array(
            robjects.r.predict(
                self.model,
                newdata=pandas2ri.py2rpy(
                    pd.DataFrame(self.x_test, columns=["x"])),
            )).squeeze().astype(self._dtype))
        pandas2ri.deactivate()

        return self.y_test
Example #22
def linear_model(data, Input, Output, Condition):
    try:
        stats = importr('stats')
        base = importr('base')
        pandas2ri.activate()
        r_df = pandas2ri.py2rpy(data)
        pandas2ri.deactivate()
        formula = '{y}~{x}*{condition}'.format(y=Output,
                                               x=Input,
                                               condition=Condition)
        lm = stats.lm(formula, r_df)
        summary = (base.summary(lm))
        results = summary.rx2('coefficients')
        results_df = base.as_data_frame_matrix(results)
        py_results_df = pd.DataFrame(results_df).transpose()
        py_results_df.columns = results_df.colnames
        py_results_df.index = results_df.rownames
        return (py_results_df)
    except Exception:
        return pd.DataFrame({})
Example #23
def dml_iivm_pyvsr_fixture(generate_data_iivm, idx, score, dml_procedure):
    boot_methods = ['normal']
    n_folds = 2

    # collect data
    data = generate_data_iivm[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & gg
    learner_classif = LogisticRegression(penalty='none', solver='newton-cg')
    learner_reg = LinearRegression()
    ml_g = clone(learner_reg)
    ml_m = clone(learner_classif)
    ml_r = clone(learner_classif)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols, 'z')
    dml_iivm_obj = dml.DoubleMLIIVM(obj_dml_data,
                                    ml_g,
                                    ml_m,
                                    ml_r,
                                    n_folds,
                                    dml_procedure=dml_procedure)

    np.random.seed(3141)
    dml_iivm_obj.fit()

    # fit the DML model in R
    all_train, all_test = export_smpl_split_to_r(dml_iivm_obj.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_IIVM(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_iivm_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_iivm_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
Example #24
def dml_irm_pyvsr_fixture(generate_data_irm, idx, score, dml_procedure):
    n_folds = 2

    # collect data
    (X, y, d) = generate_data_irm[idx]
    x_cols = [f'X{i + 1}' for i in np.arange(X.shape[1])]
    data = pd.DataFrame(np.column_stack((X, y, d)),
                        columns=x_cols + ['y', 'd'])

    # Set machine learning methods for m & g
    learner_classif = LogisticRegression(penalty='none', solver='newton-cg')
    learner_reg = LinearRegression()
    ml_g = clone(learner_reg)
    ml_m = clone(learner_classif)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], x_cols)
    dml_irm_obj = dml.DoubleMLIRM(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    np.random.seed(3141)
    dml_irm_obj.fit()

    # fit the DML model in R
    all_train, all_test = export_smpl_split_to_r(dml_irm_obj.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_IRM(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_irm_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_irm_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
Example #25
def gsea_metrics(source, target, file, meta_file):
    """
    Calculates GSEA score for validation on CFM

    Args:
        source (str): Source cell type
        target (str): Target cell type
        file (str): Path to the file with TopoCMap results table
        meta_file (str): Path to the file with drugs metadata

    Returns:
        float: Normalized enrichment score

    """
    drug_meta = pd.read_csv(meta_file)
    df = pd.read_csv(file)
    df = df.drop_duplicates()
    print(type(df))
    cids_cur = stand_chems(source, target)
    pert_cur = []
    for ind, chem in enumerate(drug_meta['pubchem_cid']):
        for chem_1 in cids_cur:
            try:
                if int(chem) == int(chem_1):
                    pert_cur.append(drug_meta['pert_id'].loc[ind])
            except ValueError:
                continue
    # Defining the R script and loading the instance in Python
    r = robjects.r
    r['source']('~/Downloads/fgsea-tutorial.R')
    # Loading the function we have defined in R.
    filter_country_function_r = robjects.globalenv['fgsea_analysis']
    # converting it into an R object for passing into the R function
    df_r = pandas2ri.py2rpy(df)
    pert_cur_r = robjects.vectors.FactorVector(pert_cur)
    # Invoking the R function and getting the result
    df_result_r = filter_country_function_r(df_r, pert_cur_r)
    print(df_result_r)
    # Converting it back to a pandas dataframe.
    return df_result_r["NES"]
Example #26
def auto_arima(endog, exog=None, freq=None):
    if freq is None:
        freq = 1
    # endog_r = r.ts(pandas2ri.py2ri(endog), freq=freq)
    # if using more recent version of rpy2, py2ri was renamed to py2rpy
    # see reference: https://stackoverflow.com/questions/55990529/module-rpy2-robjects-pandas2ri-has-no-attribute-ri2py
    endog_r = r.ts(pandas2ri.py2rpy(endog), freq=freq)
    autoarima_args = {
        "seasonal": True,
        "stationary": False,
        "trace": True,
        "max.order": 20,
        "max.p": 20,
        "max.q": 20,
        "max.P": 20,
        "max.Q": 20,
        "max.D": 20,
        "max.d": 20,
        "start.p": 1,
        "start.q": 1,
        "start.P": 1,
        "start.Q": 1
    }
    if exog is not None:
        # add noise to avoid rank-deficient error for exog
        scale = np.std(exog.values)
        z = scale * 1e-4 * np.random.randn(*exog.shape)
        exog_r = r.matrix(exog.values + z,
                          nrow=exog.shape[0],
                          ncol=exog.shape[1],
                          dimnames=[[], exog.columns.tolist()])
        fit_r = forecast.auto_arima(y=endog_r, xreg=exog_r, **autoarima_args)
    else:
        fit_r = forecast.auto_arima(y=endog_r, **autoarima_args)
    fit_dict = dict(fit_r.items())
    # for proof of this order see last comment:
    # https://stats.stackexchange.com/questions/178577/how-to-read-p-d-and-q-of-auto-arima
    p, q, P, Q, s, d, D = list(fit_dict["arma"])
    return (p, d, q), (P, D, Q, s)
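A hypothetical call, assuming the module-level names this snippet relies on (r, forecast = importr("forecast"), pandas2ri) are set up and that monthly_sales is an illustrative pandas Series of monthly observations:

(p, d, q), (P, D, Q, s) = auto_arima(monthly_sales, freq=12)
print("non-seasonal order:", (p, d, q), "seasonal order:", (P, D, Q, s))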
Example #27
    def predict(self,
                x_test: Optional[np.ndarray] = None,
                key_added: str = "_x_test",
                **kwargs) -> np.ndarray:
        """
        %(base_model_predict.full_desc)s

        Parameters
        ----------
        %(base_model_predict.parameters)s

        Returns
        -------
        %(base_model_predict.returns)s
        """  # noqa

        from rpy2 import robjects
        from rpy2.robjects import pandas2ri

        if self.model is None:
            raise RuntimeError(
                "Trying to call an uninitialized model. To initialize it, run `.fit()` first."
            )
        if self._lib is None:
            raise RuntimeError(
                f"Unable to fit the model, R package `{self._lib_name!r}` is not imported."
            )

        x_test = self._check(key_added, x_test)

        pandas2ri.activate()
        self._y_test = (np.array(
            robjects.r.predict(
                self.model,
                newdata=pandas2ri.py2rpy(pd.DataFrame(x_test, columns=["x"])),
            )).squeeze().astype(self._dtype))
        pandas2ri.deactivate()

        return self.y_test
Example #28
def dml_plr_pyvsr_fixture(generate_data1, idx, score, dml_procedure):
    n_folds = 2
    n_rep_boot = 483

    # collect data
    data = generate_data1[idx]
    X_cols = data.columns[data.columns.str.startswith('X')].tolist()

    # Set machine learning methods for m & g
    learner = LinearRegression()
    ml_g = clone(learner)
    ml_m = clone(learner)

    obj_dml_data = dml.DoubleMLData(data, 'y', ['d'], X_cols)
    dml_plr_obj = dml.DoubleMLPLR(obj_dml_data,
                                  ml_g,
                                  ml_m,
                                  n_folds,
                                  score=score,
                                  dml_procedure=dml_procedure)

    #np.random.seed(3141)
    dml_plr_obj.fit()

    # fit the DML model in R
    all_train, all_test = export_smpl_split_to_r(dml_plr_obj.smpls[0])

    r_dataframe = pandas2ri.py2rpy(data)
    res_r = r_MLPLR(r_dataframe, score, dml_procedure, all_train, all_test)

    res_dict = {
        'coef_py': dml_plr_obj.coef,
        'coef_r': res_r[0],
        'se_py': dml_plr_obj.se,
        'se_r': res_r[1]
    }

    return res_dict
Example #29
    def fit(
        self,
        x: Optional[np.ndarray] = None,
        y: Optional[np.ndarray] = None,
        w: Optional[np.ndarray] = None,
        **kwargs,
    ) -> "GamMGCVModel":

        from rpy2 import robjects
        from rpy2.robjects import pandas2ri, Formula
        from rpy2.robjects.packages import importr

        super().fit(x, y, w, **kwargs)

        use_ixs = np.where(self.w > 0)[0]
        self._x = self.x[use_ixs]
        self._y = self.y[use_ixs]
        self._w = self.w[use_ixs]

        n_splines = kwargs.pop("n_splines", self._n_splines)

        mgcv = importr("mgcv")
        pandas2ri.activate()

        df = pandas2ri.py2rpy(
            pd.DataFrame(np.c_[self.x, self.y][use_ixs, :], columns=["x",
                                                                     "y"]))
        self._model = mgcv.gam(
            Formula(f'y ~ s(x, k={n_splines}, bs="cr")'),
            data=df,
            sp=self._sp,
            family=robjects.r.gaussian,
            weights=pd.Series(self.w[use_ixs]),
        )

        pandas2ri.deactivate()

        return self
Example #30
    def predict(self,
                x_test: Optional[np.ndarray] = None,
                key_added: str = "_x_test",
                **kwargs) -> np.ndarray:
        from rpy2 import robjects
        from rpy2.robjects import pandas2ri

        if self.model is None:
            raise RuntimeError(
                f"Trying to call an uninitialized model. To initialize it, run `.fit()` first."
            )
        self._check(key_added, x_test)

        pandas2ri.activate()
        self._y_test = (np.array(
            robjects.r.predict(
                self.model,
                newdata=pandas2ri.py2rpy(
                    pd.DataFrame(self.x_test, columns=["x"])),
            )).squeeze().astype(self._dtype))
        pandas2ri.deactivate()

        return self.y_test