Ejemplo n.º 1
0
def rpy2py_vector(v):
    """
    Convert an rpy2 vector to a Python object, keeping NA in integer vectors.

    Background on the integer NA problem:
    https://github.com/rpy2/rpy2/issues/376

    * Anything that is not an ``Sexp`` is returned untouched.
    * An ``IntSexpVector`` whose R class contains "factor" is converted with
      ``pandas2ri.rpy2py``; any other ``IntSexpVector`` becomes a nullable
      ``Int32`` pandas array. In both cases the positions R reports as NA
      (via ``is.na``) are then overwritten with ``pd.NA``.
    * Every other ``Sexp`` is handed to ``pandas2ri.rpy2py``.
    """
    # Plain Python values need no conversion.
    if not isinstance(v, Sexp):
        return v

    # Only integer vectors need the explicit NA handling below.
    if not isinstance(v, IntSexpVector):
        return pandas2ri.rpy2py(v)

    assert v._R_SIZEOF_ELT == 4, "R integer size changed away from 32 bit"

    if "factor" in v.rclass:
        converted = pandas2ri.rpy2py(v)
    else:
        converted = pd.array(v, dtype=pd.Int32Dtype())

    # Ask R itself which entries are NA and blank them out on the Python side.
    na_mask = np.array(baseenv["is.na"](v), dtype=bool)
    converted[na_mask] = pd.NA
    return converted
Ejemplo n.º 2
0
    def data_frame_to_string(r_data_frame, add_rownames=False) -> str:
        """
        Render an R data.frame as tab-delimited text using rpy2.

        :param r_data_frame: The R data.frame object
        :param add_rownames: If set, each line starts with the row label and
                             the header line starts with an empty field
        :returns: Header line followed by one line per row, fields separated
                  by tabs, every line (including the last) ending in a newline.
        """
        # convert the R object; the result is used as a pandas DataFrame below
        frame = pandas2ri.rpy2py(r_data_frame)

        # header: column names, shifted right by one field when row labels are shown
        header = "\t".join(frame.columns)
        if add_rownames:
            header = "\t" + header
        lines = [header]

        # one output line per row, every field passed through str()
        for label, values in frame.iterrows():
            fields = [label, *values] if add_rownames else list(values)
            lines.append("\t".join(str(field) for field in fields))

        return "\n".join(lines) + "\n"
Ejemplo n.º 3
0
def pcor(var1, var2, covariate, method='spearman'):
    '''Run the partial correlation test (``pcor.test``) from R's ppcor package.

    Key arguments:
        var1, var2, covariate: float or int numpy array
        method: str, forwarded unchanged as the ``method`` argument of
            ``pcor.test`` (e.g. 'spearman' or 'pearson')

    Returns:
        The ``pcor.test`` result after conversion with ``pandas2ri.rpy2py``.

    '''
    # Load ppcor; importr exposes R's `pcor.test` as the attribute `pcor_test`.
    ppcor = importr('ppcor')

    # Wrap the inputs as R numeric vectors.
    x = FloatVector(var1)
    y = FloatVector(var2)
    z = FloatVector(covariate)

    # Call the R function directly instead of assigning globals and evaluating
    # an R source string: `method` is passed as data (no quoting problems) and
    # nothing is left behind in R's global environment.
    pcor_out = ppcor.pcor_test(x, y, z, method=method)

    return pandas2ri.rpy2py(pcor_out)
Ejemplo n.º 4
0
def limma_camera(matrix,
                 design,
                 weights,
                 limma_stats,
                 groups,
                 coef,
                 group_name='group'):
    """Run limma's ``camera`` on ``groups`` and append per-group summaries.

    For every group the column means of the matching ``limma_stats`` rows and
    the values returned by ``empirical_ci`` are collected; both tables are
    joined onto the converted ``camera`` result.

    Args:
        matrix, design, weights: passed to ``to_r_matrix_design_and_weights``
            to obtain the R inputs for ``camera``.
        limma_stats: table indexed by the same labels as ``matrix`` (asserted).
        groups: mapping of group name -> collection of index labels.
        coef: forwarded to ``camera`` as ``contrast``.
        group_name: name given to the index of the returned table.

    Returns:
        The ``camera`` result (converted with ``pandas2ri.rpy2py``) joined with
        the ``mean_*`` columns and the empirical median / CI columns.
    """
    assert set(matrix.index) == set(limma_stats.index)

    empirical_labels = [
        'empirical_median', 'empirical_median_ci_left',
        'empirical_median_ci_right'
    ]
    mean_rows = {}
    empirical_rows = {}

    for group_key, members in groups.items():
        group_stats = limma_stats.loc[members]

        # Column means of the group's rows, plus the member list as a string.
        group_means = group_stats.mean()
        group_means['proteins'] = '/'.join(sorted(members))

        # First element of the empirical_ci result is not used here.
        _ignored, median, ci_left, ci_right = empirical_ci(
            group_stats, random_state=RANDOM_STATE)

        mean_rows[group_key] = group_means
        empirical_rows[group_key] = pd.Series([median, ci_left, ci_right],
                                              index=empirical_labels)

    # One row per group, columns prefixed with "mean_".
    mean_table = pd.DataFrame(mean_rows).T
    mean_table.index.name = group_name
    mean_table.columns = [f'mean_{column}' for column in mean_table.columns]

    empirical_table = pd.DataFrame(empirical_rows).T
    empirical_table.index.name = group_name

    # Build the R-side inputs.
    r_groups = _to_r_listvector_of_string(groups)
    r_matrix, r_design, r_weights = to_r_matrix_design_and_weights(
        matrix, design, weights)

    # 'use.ranks' is not a valid Python identifier, hence the dict expansion.
    camera_kwargs = {
        'contrast': coef,
        'index': r_groups,
        'design': r_design,
        'weights': r_as_matrix(r_weights),
        'use.ranks': False,
    }
    r_result = r_limma.camera(r_matrix, **camera_kwargs)

    result = pandas2ri.rpy2py(r_result)
    result.index.name = group_name
    return result.join(mean_table).join(empirical_table)
Ejemplo n.º 5
0
def calc_knn_sale(user, password, host, database, port):
    """Source the for-sale KNN R script and run its ``main_forsale_knn`` function.

    The five arguments are passed positionally to the R function in the order
    (user, password, host, database, port).

    Returns the R result converted with ``pandas2ri.rpy2py``, or ``None`` (after
    printing the error) if looking up, calling or converting fails. Errors
    raised while sourcing the script are not caught.
    """
    # Load the R script so that its functions land in R's global environment.
    robjects.r['source']('for_sale_KNN_dynamic_script.R')
    try:
        knn_main = robjects.globalenv['main_forsale_knn']
        r_result = knn_main(user, password, host, database, port)
        return pandas2ri.rpy2py(r_result)
    except Exception as e:
        print("KNN Sale Building model failed:", e)
        return None
Ejemplo n.º 6
0
def calc_neuralnet_census(user, password, host, database, port):
    """Source the census neural-net R script and run its ``mainfunction.all``.

    Returns the R result converted with ``pandas2ri.rpy2py``, or ``None`` (after
    printing the error) if looking up, calling or converting fails. Errors
    raised while sourcing the script are not caught.
    """
    # Load the R script so that its functions land in R's global environment.
    robjects.r['source']('nn_census_script.R')
    try:
        census_main = robjects.globalenv['mainfunction.all']
        # NOTE(review): the R function is called with host first, i.e.
        # (host, user, password, database, port), unlike this function's own
        # parameter order - confirm against the R script's signature.
        r_result = census_main(host, user, password, database, port)
        return pandas2ri.rpy2py(r_result)
    except Exception as e:
        print("NeuralNet Census model failed:", e)
        return None
Ejemplo n.º 7
0
    def _varimax(self, factor_df, **kwargs):
        '''
        Apply R's varimax rotation to a factor matrix.

        Args:
            factor_df: factor matrix as pd.DataFrame with shape
                       (# features, # principal components)
            **kwargs: accepted but not used by this method

        Return:
            The 'loadings' element of the varimax result, converted with
            pandas2ri.rpy2py
        '''
        # Convert to an R matrix first, then look up and call R's varimax.
        r_factor_matrix = self._df2mtr(factor_df)
        r_varimax = robjects.r['varimax']
        rotation = r_varimax(r_factor_matrix)

        loadings = rotation.rx2('loadings')
        return pandas2ri.rpy2py(loadings)
Ejemplo n.º 8
0
def load_data(filename):
    """
    Read an RDS file (e.g. a model data set from bnlearn.com) and convert it.

    Parameters
    ----------
    filename :
        RDS file path, e.g. './asia.rds'

    Returns
    -------
    data :
        The object read by R's ``readRDS`` after conversion with
        ``pandas2ri.rpy2py``
    """
    r_object = robjects.r['readRDS'](filename)
    return pandas2ri.rpy2py(r_object)
Ejemplo n.º 9
0
def fit_lme(formula,
            df,
            family='gaussian',
            optimizer='nloptwrap',
            random_effect=True,
            **fit_kwargs):
    """Fit a (generalised) linear (mixed) model in R and return its tables.

    Model selection:
      * ``family == 'gaussian'``: ``lmerTest::lmer`` when ``random_effect`` is
        true, otherwise ``stats::lm``.
      * ``family in ('binomial', 'poisson')``: ``lme4::glmer`` when
        ``random_effect`` is true, otherwise ``stats::glm``.
      * any other ``family``: ``lme4::glmer.nb`` when ``random_effect`` is
        true, otherwise ``MASS::glm.nb``.

    ``optimizer == 'nloptwrap'`` selects the control settings that name the
    optimizer and enable ``calc.derivs``; for ``glmer.nb`` any other optimizer
    means no control object is passed at all.

    Args:
        formula: model formula string, wrapped in an rpy2 ``Formula``.
        df: data passed as the second argument of the fitting function.
        family: see above.
        optimizer: see above.
        random_effect: whether to fit the mixed-model variant.
        **fit_kwargs: forwarded to the fitting function.

    Returns:
        ``(coef_df, anova_df)``: the coefficient table of ``summary(fit)``
        converted with ``pandas2ri.rpy2py``, and the result of
        ``stats::anova(fit)`` as produced inside the pandas converter context.
    """
    model_formula = Formula(formula)

    lme4 = importr('lme4')
    lmer_test = importr('lmerTest')  # overloads lmer function from lme4 package
    base = importr('base')
    stats = importr('stats')

    def singular_check():
        # Setting shared by every control object built below.
        return r('lme4::.makeCC')(action="ignore", tol=1e-4)

    def glmer_control(with_nloptwrap):
        # Option names contain dots, so they are collected in a dict.
        options = {}
        if with_nloptwrap:
            options['optimizer'] = 'nloptwrap'
            options['calc.derivs'] = True
        options['check.rankX'] = 'silent.drop.cols'
        options['check.conv.singular'] = singular_check()
        return lme4.glmerControl(**options)

    with localconverter(ro.default_converter + pandas2ri.converter):
        if family == 'gaussian':
            if random_effect:
                control = lme4.lmerControl(
                    **{
                        'calc.derivs': True,
                        'check.rankX': 'silent.drop.cols',
                        'check.conv.singular': singular_check(),
                    })
                fit = lmer_test.lmer(model_formula,
                                     df,
                                     control=control,
                                     **fit_kwargs)
            else:
                fit = stats.lm(model_formula, df, **fit_kwargs)
        elif family in ('binomial', 'poisson'):
            if random_effect:
                control = glmer_control(optimizer == 'nloptwrap')
                fit = lme4.glmer(model_formula,
                                 df,
                                 control=control,
                                 family=family,
                                 **fit_kwargs)
            else:
                fit = stats.glm(model_formula, df, family=family, **fit_kwargs)
        elif not random_effect:
            fit = r('MASS::glm.nb')(model_formula, df, **fit_kwargs)
        elif optimizer == 'nloptwrap':
            control = glmer_control(True)
            fit = r('lme4::glmer.nb')(model_formula, df, **{
                'nb.control': control
            }, **fit_kwargs)
        else:
            fit = r('lme4::glmer.nb')(model_formula, df, **fit_kwargs)

        anova_df = stats.anova(fit)

    # Outside the converter context, so the table is converted explicitly.
    r_coef_table = r['as.data.frame'](stats.coef(base.summary(fit)))
    coef_df = pandas2ri.rpy2py(r_coef_table)

    return coef_df, anova_df
Ejemplo n.º 10
0
def call_fitter(
    site_inputs_training,
    y_training,
    site_inputs_validation,
    hprm,
):
    """Fit a model through an external R script and return its predictions.

    Steps:
      1. Flatten the training / validation inputs into plain data frames whose
         column names come from ``simplify_inpt_name``; the training frame also
         gets a ``'target'`` column holding ``y_training``.
      2. Convert both frames to R and ``save`` them under
         ``<paths.outputs>/R_files/``.
      3. Build the formula with ``make_gam_formula`` and launch
         ``load_fit_predict_savePredictions.R`` with ``Rscript``, passing the
         formula and the directory as arguments.
      4. Read the two prediction files from that directory with R's
         ``read.table``.

    Args:
        site_inputs_training, site_inputs_validation: tables that iterate as
            ``(inpt, trsfm, prm, location)`` tuples and can be indexed by them
            (presumably DataFrames with 4-level column labels - confirm
            against the caller).
        y_training: one-dimensional target (asserted via ``ndim``), exposing
            ``.values``.
        hprm: hyper-parameters forwarded to ``make_gam_formula``.

    Returns:
        ``(y_hat_training, y_hat_validation)``: the ``.values`` of the two
        prediction tables.

    Raises:
        subprocess.CalledProcessError: if the R script exits with a non-zero
            status.
    """
    assert y_training.ndim == 1
    path_R_files = os.path.join(
        paths.outputs,
        'R_files/',
    )
    os.makedirs(
        path_R_files,
        exist_ok=True,
    )

    ### Data
    data_training = {
        **{
            simplify_inpt_name(inpt, trsfm, prm, location): site_inputs_training[inpt, trsfm, prm, location].values
            for inpt, trsfm, prm, location in site_inputs_training
        },
        'target': y_training.values,
    }

    data_validation = {
        simplify_inpt_name(inpt, trsfm, prm, location):
        site_inputs_validation[inpt, trsfm, prm, location].values
        for inpt, trsfm, prm, location in site_inputs_validation
    }

    # Convert arrays. The global pandas conversion is switched off again even
    # if a conversion raises, so a failure here cannot leave it activated.
    pandas2ri.activate()
    try:
        df_train = pandas2ri.py2rpy(pd.DataFrame.from_dict(data_training))
        df_test = pandas2ri.py2rpy(pd.DataFrame.from_dict(data_validation))
    finally:
        pandas2ri.deactivate()

    # Save converted files
    r.assign("data_train", df_train)
    r("save(data_train, file='{0}/temp_dat_for_r_train.gzip', compress=TRUE)".
      format(path_R_files))
    r.assign("data_test", df_test)
    r("save(data_test,  file='{0}/temp_dat_for_r_test.gzip',  compress=TRUE)".
      format(path_R_files))

    # Number of distinct values per training input, used to build the formula.
    nb_unique = {k: len(np.unique(v)) for k, v in site_inputs_training.items()}

    string_formula = make_gam_formula(
        site_inputs_training.columns,
        nb_unique,
        hprm,
    )

    ### Launch the R script
    path2script = os.path.join(
        os.path.dirname(__file__),
        'load_fit_predict_savePredictions.R',
    )
    args = [string_formula, path_R_files]
    cmd = ['Rscript', path2script] + args
    # Python will quote what must be quoted in subprocess.check_output

    print('launch Rscript')
    x = subprocess.check_output(cmd, universal_newlines=True)
    print(x)

    # Read back the prediction files from the shared directory.
    y_hat_training = r['read.table'](
        "{0}/predictions_from_r_train.gzip".format(path_R_files))
    y_hat_training = pandas2ri.rpy2py(y_hat_training)
    y_hat_training = y_hat_training.values

    y_hat_validation = r['read.table'](
        "{0}/predictions_from_r_test.gzip".format(path_R_files))
    y_hat_validation = pandas2ri.rpy2py(y_hat_validation)
    y_hat_validation = y_hat_validation.values

    return y_hat_training, y_hat_validation
Ejemplo n.º 11
0
import rpy2.robjects.pandas2ri as pandas2ri

# One row per file in dmr_calls_dir; the population name is parsed from the
# file name.
pw_dmr_calls = pd.DataFrame(
    {"rds_path": dmr_calls_dir + "/" + pd.Series(os.listdir(dmr_calls_dir))})
pw_dmr_calls["pop"] = pw_dmr_calls["rds_path"].str.extract(
    r".*_dmrs_hsc_vs_([\w-]+)_0.01", expand=False)
pw_dmr_calls

# Gain/Loss counts per population; -1 marks rows that have not been filled yet.
gain_loss_counts = pd.DataFrame(-1,
                                columns=["Gain", "Loss"],
                                index=pw_dmr_calls["pop"])
for _unused, row_ser in pw_dmr_calls.iterrows():
    # chr start end  length   nCG  meanMethy1  meanMethy2 diff.Methy  areaStat
    dmr_df = pandas2ri.rpy2py(base.readRDS(row_ser["rds_path"]))
    methylation_delta = dmr_df.eval("meanMethy2 - meanMethy1")
    # Count each direction explicitly. Labelling the sorted sign counts by
    # position fails when a file has only gains, only losses, or rows with a
    # zero difference, and can silently mislabel the counts in the latter case.
    gain_loss_counts.loc[row_ser["pop"]] = pd.Series({
        "Gain": int((methylation_delta > 0).sum()),
        "Loss": int((methylation_delta < 0).sum()),
    })
gain_loss_counts = gain_loss_counts.sort_values("Loss")

gain_loss_counts.head()

# Long format: one row per (population, direction) pair.
pw_counts_plot_df = (gain_loss_counts.stack().reset_index().set_axis(
    ["Population", "Direction", "No. of DMRs"], axis=1))
pw_counts_plot_df

# Figure size is given in centimetres and converted to inches.
fig, ax = plt.subplots(1,
                       1,
                       dpi=300,
                       constrained_layout=True,
                       figsize=(4 / 2.54, 3 / 2.54))
Ejemplo n.º 12
0
def to_dataframe(x):
    """Coerce ``x`` with ``r_as_dataframe`` and convert the result via ``pandas2ri.rpy2py``."""
    r_frame = r_as_dataframe(x)
    return pandas2ri.rpy2py(r_frame)