Example #1
0
def ancova(lm1, lm2, names=('lm1', 'lm2')):
    """
    Test two single-variable linear models for a difference in slope and
    in intercept.

    Both models must expose their data through `x` and `y` attributes.  The
    observations of the two models are pooled and tagged with a two-level
    factor built from `names`; R's `aov()` is then run on the pooled data,
    once with the `x`-by-factor interaction and once without it.

    Returns (pval of slope difference, pval of intercept difference).

    Keep in mind that when the slopes differ significantly, the intercept
    p-value cannot really be interpreted.

    """
    # Recipe (R code) from the blog post at
    # http://r-eco-evo.blogspot.com/2011/08/
    #           comparing-two-regression-slopes-by.html
    #
    #   model1 = aov(y~x*factor, data=df)
    #       3rd line of summary(model1): the interaction term
    #   model2 = aov(y~x+factor, data=df)
    #       2nd line of summary(model2): "factor", i.e. the intercept diff
    #   anova(model1, model2)
    #       does removing the interaction term affect the fit?

    # One group label per observation, model 1 first, then model 2.
    group_labels = [names[0] for _ in lm1.x] + [names[1] for _ in lm2.x]
    labels = r.factor(np.array(group_labels))
    xi = np.concatenate((lm1.x, lm2.x))
    yi = np.concatenate((lm1.y, lm2.y))

    def aov_pvalues(formula_text):
        # Bind the variables named in the formula inside the formula's
        # environment, run aov() and return its 'Pr(>F)' column.
        fmla = robjects.Formula(formula_text)
        for var_name, value in (('xi', xi), ('yi', yi), ('labels', labels)):
            fmla.environment[var_name] = value
        fitted = r('aov(%s)' % fmla.r_repr())
        return r.summary(fitted)[0].rx2('Pr(>F)')

    # With the interaction term: third table row is the interaction.
    interaction_pval = aov_pvalues('yi~xi*labels')[2]

    # Without the interaction term: second table row is the factor.
    intercept_pval = aov_pvalues('yi~xi+labels')[1]

    # TODO: anova(result1, result2)?

    return interaction_pval, intercept_pval
Example #2
0
def through_the_origin(x, y):
    """Fit ``y ~ 0 + x`` with R's ``lm`` and return selected summary values.

    `x` and `y` are each wrapped in a ``FloatVector`` and placed in an R
    data frame as columns ``x`` and ``y``.

    Returns a dict with:
      * ``'coefficient'`` -- element 0 of the summary's ``coefficients``
      * ``'stderr'``      -- element 1 of the summary's ``coefficients``
      * ``'r.squared'``   -- element 0 of the summary's ``r.squared``
    """
    frame = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    fitted = r.lm('y ~ 0 + x', frame)
    fit_summary = r.summary(fitted)

    coefficients = fit_summary.rx2('coefficients')
    result = {}
    result['coefficient'] = coefficients[0]
    result['stderr'] = coefficients[1]
    result['r.squared'] = fit_summary.rx2('r.squared')[0]
    return result
Example #3
0
def dirichletreg_df(prop_df,
                    covar_df,
                    formula,
                    onevsrest_category=None,
                    return_reg_input=False):
    """Fit a Dirichlet regression with R's ``DirichletReg`` and tabulate it.

    Parameters
    ----------
    prop_df : DataFrame
        Response columns.  Converted with ``py2r``, wrapped by ``DR_data``
        and bound to the name ``y`` in the formula's environment.
    covar_df : DataFrame
        Covariate columns.  Concatenated column-wise with ``prop_df`` to
        form the data frame handed to ``DirichReg``.
    formula : str
        R formula text.  It should refer to the response as ``y``, since
        that is the name bound in the formula's environment.
    onevsrest_category : optional
        A column of ``prop_df``.  When given, the model is fitted with
        ``model='alternative'`` and ``sub.comp`` set to the 1-based
        position of that column.
    return_reg_input : bool
        When true, also return the concatenated input data frame.

    Returns
    -------
    ``coef_df``, or ``(dr_df, coef_df)`` when ``return_reg_input`` is true.
    ``coef_df`` has the columns ``coefficient``, ``se``, ``zval``, ``pval``,
    ``compartment``, ``variable`` and ``significance``, plus ``coef_type``
    (``'mean'`` / ``'precision'``) when ``onevsrest_category`` is given.
    """
    from rpy2.robjects import r, Formula
    from rpy2.robjects.packages import importr
    from rpy2.rinterface_lib.callbacks import logger as rpy2_logger

    dr = importr('DirichletReg')
    dr_df = pd.concat([prop_df, covar_df], axis=1)

    f = Formula(formula)

    # Silence rpy2 warnings (errors are still shown) while building the
    # response, then put the logger back exactly as it was -- even if
    # DR_data fails -- instead of forcing it to WARNING.
    previous_level = rpy2_logger.level
    rpy2_logger.setLevel(logging.ERROR)
    try:
        f.environment['y'] = dr.DR_data(py2r(prop_df))
    finally:
        rpy2_logger.setLevel(previous_level)

    if onevsrest_category is None:
        fit = dr.DirichReg(f, py2r(dr_df))
    else:
        assert onevsrest_category in prop_df.columns, (
            'onevsrest_category must be a column of prop_df')
        # R indexes from 1.
        cat_index = prop_df.columns.tolist().index(onevsrest_category) + 1
        fit = dr.DirichReg(f,
                           py2r(dr_df),
                           model='alternative',
                           **{'sub.comp': cat_index})

    # Divert R's output while summarising.  The diversion is undone in a
    # ``finally`` so a failing summary cannot leave R output discarded for
    # the rest of the session.
    r.sink(file='/dev/null')
    try:
        u = r.summary(fit)
    finally:
        r.sink()
        if r('sink.number')()[0] > 0:
            r.sink()

    if onevsrest_category is None:
        varnames = u.rx2('varnames')
    else:
        varnames = [onevsrest_category] * 2

    coef_mat = u.rx2('coef.mat')
    rows = r2py(r('rownames')(coef_mat))
    coef_df = r2py(r('as.data.frame')(coef_mat)).reset_index(drop=True)
    coef_df.columns = ['coefficient', 'se', 'zval', 'pval']

    # Repeat counts used to expand per-group labels to one label per row.
    n_vars = r2py(u.rx2('n.vars'))

    coef_df['compartment'] = np.repeat(varnames, n_vars)
    coef_df['variable'] = rows
    coef_df['significance'] = bin_pval(coef_df.pval)

    if onevsrest_category is not None:
        coef_df['coef_type'] = np.repeat(['mean', 'precision'], n_vars)

    if return_reg_input:
        return dr_df, coef_df
    return coef_df
def clusterize_r_em(*args):
    """Cluster each dataset with R's ``mclust`` (EM GMM) and plot the result.

    For every positional argument this fits ``Mclust``, prints the model and
    its summary, opens a ``quartz`` device, plots the model and then waits
    for a line of user input, which is echoed back.

    Exits the process with status -1 if rpy2 cannot be imported.  Any other
    failure propagates instead of being reported as a missing rpy2.
    """
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
    except ImportError:
        print("You need rpy2")
        sys.exit(-1)
    rpy2.robjects.numpy2ri.activate()

    # ``raw_input`` only exists on Python 2; Python 3 calls it ``input``.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    r.library("mclust")
    for arg in args:
        model = r.Mclust(arg)
        print(model)
        print(r.summary(model))
        # NOTE(review): ``quartz`` looks like a macOS-only R device -- confirm.
        r.quartz("plot")
        r.plot(model, arg)
        print(read_line("any key to pass"))
def clusterize_r_em(*args):
    """Cluster each dataset with R's ``mclust`` (EM GMM) and plot the result.

    For every positional argument this fits ``Mclust``, prints the model and
    its summary, opens a ``quartz`` device, plots the model and then waits
    for a line of user input, which is echoed back.

    Exits the process with status -1 if rpy2 cannot be imported.  Any other
    failure propagates instead of being reported as a missing rpy2.
    """
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
    except ImportError:
        print("You need rpy2")
        sys.exit(-1)
    rpy2.robjects.numpy2ri.activate()

    # ``raw_input`` only exists on Python 2; Python 3 calls it ``input``.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    r.library("mclust")
    for arg in args:
        model = r.Mclust(arg)
        print(model)
        print(r.summary(model))
        # NOTE(review): ``quartz`` looks like a macOS-only R device -- confirm.
        r.quartz("plot")
        r.plot(model, arg)
        print(read_line("any key to pass"))
    def km_plot_data(self, name, time, censor, values):
        """Build Kaplan-Meier plot data for samples split at the mean value.

        Samples whose value is at or above the mean of `values` form the
        "high" group, the rest the "low" group.  `self.surv` supplies the
        ``survdiff`` and ``survfit`` functions.

        Parameters:
            name: not used by this method.
            time: per-sample times.
            censor: per-sample censoring indicators (cast to int for R).
            values: per-sample values used for the high/low split.

        Returns:
            dict with
              * ``'high'`` / ``'low'``: lists of ``{'percent', 'time'}``
                dicts produced from ``self.make_plottable_kms``;
              * ``'p'``: p-value of the ``survdiff`` chi-square statistic
                with 1 degree of freedom, rounded to 4 significant digits.

        Side effect: assigns ``km``, ``times``, ``res`` and ``cols`` in R's
        global environment.
        """
        values_df = pd.DataFrame(
            {
                'time': time,
                'censor': censor,
                'value': values
            }, dtype=float)
        mean_value = values_df.value.mean()
        values_df['high'] = values_df.value >= mean_value

        data = {
            'time': robjects.FloatVector(values_df['time']),
            'censor': robjects.IntVector(values_df['censor']),
            'high': robjects.IntVector(values_df['high'])
        }
        df = robjects.DataFrame(data)

        # Same model for the group test and the curve estimate.
        formula = robjects.Formula('Surv(time, censor) ~ high')

        # p value
        km_diff = self.surv.survdiff(formula, data=df)
        chisq_ind = list(km_diff.names).index('chisq')
        pvalue = chi2.sf(km_diff[chisq_ind][0], 1)

        km = self.surv.survfit(formula, data=df)

        # Evaluate the fitted curves at every observed time on the R side
        # and collect the selected summary components into a data frame.
        r.assign('km', km)
        r.assign('times', data['time'])
        r.assign('res', r('summary(km, times=times)'))
        cols = r('lapply(c(2:6, 8:11), function(x) res[x])')
        r.assign('cols', cols)
        km_results = r('do.call(data.frame, cols)')
        km_results = pd.DataFrame(km_results)

        low_km = km_results[km_results['strata'] == 'high=0']
        high_km = km_results[km_results['strata'] == 'high=1']

        high_time, high_percent = self.make_plottable_kms(
            high_km['time'], high_km['surv'])
        low_time, low_percent = self.make_plottable_kms(
            low_km['time'], low_km['surv'])

        high = [{'percent': percent, 'time': when}
                for percent, when in zip(high_percent, high_time)]
        low = [{'percent': percent, 'time': when}
               for percent, when in zip(low_percent, low_time)]

        return {'high': high, 'low': low, 'p': float('%.4g' % pvalue)}
Example #7
0
def clusterize_r_em(*args, **kwargs):
    """Cluster each dataset with R's ``mclust`` (EM GMM) and plot the result.

    For every positional argument this fits ``Mclust``, prints the model and
    its summary, opens a ``quartz`` device, plots the model and then waits
    for a line of user input, which is echoed back.

    Keyword arguments:
        clf_on_pca: when true, each dataset is first projected onto its
            first two principal components (scikit-learn ``PCA(2)``) and
            the clustering runs on the projection.  Defaults to False.

    Exits the process with status -1 if rpy2 or scikit-learn cannot be
    imported.  Any other failure propagates.
    """
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
        from sklearn.decomposition import PCA
    except ImportError:
        print("You need rpy2")
        sys.exit(-1)
    rpy2.robjects.numpy2ri.activate()

    # ``raw_input`` only exists on Python 2; Python 3 calls it ``input``.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    project_first = kwargs.get('clf_on_pca', False)

    r.library("mclust")
    for arg in args:
        if project_first:
            pca = PCA(2)
            arg = pca.fit(arg).transform(arg)
        model = r.Mclust(arg)
        print(model)
        print(r.summary(model))
        # NOTE(review): ``quartz`` looks like a macOS-only R device -- confirm.
        r.quartz("plot")
        r.plot(model, arg)
        print(read_line("press any key to pass"))
Example #8
0
 def _r_tobit(self, data, xvars, rbar):
     """ Fit a tobit model through R's ``vglm`` and return its estimates.

     `data` is converted to an R data frame and assigned to ``data`` in R.
     ``OverallRank`` is regressed on the sum of the names in `xvars`, with
     the tobit family's ``Upper`` set to `rbar` and ``Lower`` set to 1.
     The R summary is printed when ``self.opts['verbose']`` is truthy.

     Returns a dict with ``'beta'``, column 0 of the coefficient matrix as
     a Series indexed by ``'const'`` followed by `xvars`, and ``'sigma'``,
     the entry at row 0, column 1 of that matrix.
     """
     r.assign('data', com.convert_to_r_dataframe(data))
     rhs = '+'.join(xvars)
     model = r("vglm(OverallRank ~ "+ rhs +", \
                       family=tobit(Upper=" + str(rbar) + ", Lower=1), \
                       data=data, crit='coeff')")
     if self.opts['verbose']:
         print(r.summary(model))
     coef_matrix = np.array(r.coef(model, matrix=True))
     # Row labels: the constant first, then a private copy of `xvars`.
     row_labels = ['const'] + deepcopy(xvars)
     return {
         'beta': pd.Series(coef_matrix[:, 0], index=row_labels),
         'sigma': coef_matrix[0, 1],
     }
Example #9
0
def method_spline(rvar, train, test):
    """ Fit a B-spline model with treatment interaction and report its fit.

    `rvar` names the response column.  The model is fitted on `train` with
    R's ``lm``; its R-squared is printed and ``analytics`` is called on the
    training predictions.  Predictions on `test` are also passed to
    ``analytics`` unless `rvar` is ``"UndergraduatemedianGPA"``.  A ``year``
    term is added to the formula when `rvar` is ``'Tuition'``.
    """
    print("Splines")

    right_hand_side = (' ~ bs(OverallRank, df=6) + treat + '
                       'treat:bs(OverallRank, df=6) - 1')
    if rvar == 'Tuition':
        right_hand_side += ' + year'
    formula = rvar + right_hand_side

    model = r.lm(formula, data=train)
    fit_summary = r.summary(model)
    print(fit_summary.rx2('r.squared'))

    train_observed = train[rvar]
    train_predicted = np.array(r.predict(model))
    analytics(rvar, 'Training', train_observed, train_predicted)

    if rvar != "UndergraduatemedianGPA":
        test_observed = test[rvar]
        test_predicted = np.array(r.predict(model, newdata=test))
        analytics(rvar, 'Testing', test_observed, test_predicted)

    print()
Example #10
0
def fit(data, outpath=None, verbosity=0, **kwargs):
    """Estimate a multilevel model with R's ``stan_glmer`` (rstanarm).

    Args:
        data: data set, converted to an R object with ``pandas2ri.py2ri``
            and passed to ``stan_glmer`` as ``data``.
        outpath: directory in which to save the fitted model as
            ``mlm_<outcome>.rds``; nothing is saved when None.
        verbosity: 0 prints nothing; any truthy value prints the model and
            classification diagnostics (fitted values thresholded at 0.5);
            >1 additionally prints R's summary; >2 additionally calls
            ``inspect`` on the model.
        **kwargs: forwarded to ``stan_glmer``.  ``data`` and ``na.action``
            are overwritten here (``na.action`` is set to ``'na.omit'``).

    Returns:
        The fitted model object returned by ``stan_glmer``.

    Todos:

        TODO: before converting data to r_data, filter out columns not appearing
            in formula.
    """
    r_data = pandas2ri.py2ri(data)
    kwargs['data'] = r_data
    kwargs['na.action'] = 'na.omit'
    # Named ``model`` rather than ``fit`` so the function name is not shadowed.
    model = r.stan_glmer(**kwargs)  # TODO: select appropriate prior.
    if verbosity:
        print(model)
        # print(model.rx2('linear.predictors'))
        probs = np.array(model.rx2('fitted.values'))
        preds = (probs > 0.5).astype(int)
        y = np.array(model.rx2('y'))
        print('Number of observations: {0}'.format(y.shape[0]))
        # np.bincount only accepts integer input; cast first because the
        # array built from the R vector may well be floating point.
        print('Distribution of y:\n{0}'.format(np.bincount(y.astype(int))))
        print('Classification report:')
        print(classification_report(y, preds))
        print('R2 Score:\n', r2_score(y, probs))
        print('Accuracy:\n', accuracy_score(y, preds))
        # r.X11()
        # r.plot(model)
        # r.posterior_interval(model, prob=0.95, pars='urban')
        # model evaluation
        # y_draws = r.posterior_predict(model)
        # preds = r.predict(model)
        # np.array(r['as.matrix'](model.rx2('x'))).shape  # design matrix
        # np.array(r['as.matrix'](model, pars='urban')).shape  # posterior parameter draws
    if verbosity > 1:
        print(r.summary(model))
    if verbosity > 2:
        inspect(model)
    if outpath is not None:
        # Only the first term returned by get_terms (the outcome) is used,
        # to name the output file.
        outcome, _, _, _ = get_terms(model.rx2('formula')[0])
        fname = 'mlm_{0}.rds'.format(outcome)
        save(model, os.path.join(outpath, fname))
    return model

def plot_forecast(data, fcast):
    """Plot the observed series together with a forecast and its band.

    `data` must have a date-like index and a ``casos_est`` column.  Elements
    3, 4 and 5 of `fcast` are used as the point forecast and the lower and
    upper bounds of the shaded interval.  Forecast points are placed on a
    weekly (``'W'``) date range generated from the last date in
    ``data.index``, with the first generated date dropped.
    """
    point_forecast, lower_bound, upper_bound = fcast[3], fcast[4], fcast[5]

    # Generate one extra period and drop the first generated date.
    future_dates = pd.date_range(start=data.index.max(),
                                 periods=len(point_forecast) + 1,
                                 freq='W')[1:]

    forecast = pd.Series(point_forecast, index=future_dates)
    lowerpi = pd.Series(lower_bound, index=future_dates)
    upperpi = pd.Series(upper_bound, index=future_dates)

    plt.plot(data.index, data.casos_est, color='b', alpha=0.5)
    plt.plot(forecast.index, forecast.values, color='red')
    plt.fill_between(forecast.index, lowerpi.values, upperpi.values,
                     alpha=0.2, color='red')


if __name__ == "__main__":
    # Script entry point: load one alert table, fit a model on it, then
    # print R's summary of the model and draw R's plot for it.
    # NOTE(review): the argument is presumably a municipality geocode --
    # confirm against get_alerta_table.
    data = get_alerta_table(3304557)  # Nova Iguaçu: 3303609
    # Load the R package `tscount` and look up its `tsglm` function.
    # NOTE(review): neither name is used below; presumably `build_model`
    # reads them as module globals -- confirm before renaming or removing.
    tscount = importr('tscount')
    tsglm = r('tsglm')

    model = build_model(data)
    print(r.summary(model))
    r.plot(model)
    # Forecasting and plotting of the forecast are currently disabled:
    # fcast = forecast.forecast(model, h=5, level=95.0)
    # print(fcast[3], fcast[4], fcast[5])
    # plot_forecast(data=data, fcast=fcast)
    # plt.show()
Example #12
0
 def data_summary(self):
     """Print R's ``summary()`` of ``self.rdata``."""
     # Parenthesised single-argument form works on Python 2 and Python 3.
     print(r.summary(self.rdata))
Example #13
0
 def summary(self, equation=None):
     """Print R's ``summary()`` of ``self._estimate``.

     `equation` is forwarded unchanged as the ``equation`` argument of the
     R call.
     """
     # Parenthesised single-argument form works on Python 2 and Python 3.
     print(r.summary(self._estimate, equation=equation))
Example #14
0
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects import r as R
import pandas as pd

# Turn on rpy2's pandas conversion so the pandas DataFrame below can be
# passed to the R call directly.
pandas2ri.activate()
# Alternative way to obtain the R interface (unused):
#R = ro.r



# Small test DataFrame with three numeric columns
data = {'a' : [1, 2, 3, 4, 5, 6, 7, 8, 9],
        'b' : [11, 12, 13, 14, 15, 16, 17, 18, 19],
        'c' : [21, 22, 23, 24, 25, 26, 26, 28, 29]
}

test = pd.DataFrame(data)

print(test.head())

# Fit a linear model of column `a` on column `b` with R's lm()
M = R.lm('a ~ b', data=test)


# Print the `coefficients` element of R's summary of the fit
print(R.summary(M).rx2('coefficients'))



Example #15
0
    def __init__(self, formula, **kwargs):
        """
        Fit a linear regression in R and keep the results on the instance.

        Every keyword argument is a data variable: its value is converted
        to a NumPy array, placed in R's global environment under the
        keyword's name, and also stored on the instance under that name.

        So the kwarg `x=[1,2,3,4]` creates a variable `x` holding those four
        numbers in R's global env, which `formula` can then refer to.

        `formula` is a string handed unchanged to R's `lm()` function.

        Example usage::

            >>> x = [1, 2, 3, 4]
            >>> y = [1.2, 3, 7, 10]
            >>> m = LinearRegression(x=x, y=y, formula='y~x')
            >>> m.slope
            3.0399999999999996

            >>> m.intercept
            -2.299999999999998

            >>> m.adj_r_squared
            0.97221750212404412

            >>> m.slope_pval(0)
            0.0093041159117684229

            >>> m.intercept_pval(0)
            0.10459053583417365

            >>> # Variables accessible as NumPy arrays
            >>> m.x
            array([1, 2, 3, 4])

        Cross-check with scipy.stats.linregress::

            >>> from scipy.stats import linregress as scipy_linregress
            >>> results = scipy_linregress(x, y)
            >>> eps = 1e-15
            >>> assert abs(results[0] - m.slope) < eps
            >>> eps = 1e-10
            >>> assert abs(results[1] - m.intercept) < eps
            >>> eps = 1e-15
            >>> assert abs(results[2] ** 2 - m.r_squared) < eps
            >>> eps = 1e-15
            >>> assert abs(results[3] - m.slope_pval(0)) < eps


        TODO:
            - support for more complex models (requires examining the coeffs
              matrix to see what's included)

        """

        # Publish each data variable to R and mirror it on the instance.
        for var_name, raw_values in kwargs.items():
            as_array = np.array(raw_values)
            robjects.globalenv[var_name] = as_array
            setattr(self, var_name, as_array)

        # Set after the loop, so kwargs named `lm` or `summary` are
        # overwritten on the instance by the fit results.
        self.lm = r.lm(formula)
        self.summary = r.summary(self.lm)

        # Entries 6 and 7 of the summary's coefficients are stored as the
        # intercept and slope p-values respectively.
        coefficient_table = self.summary.rx2('coefficients')
        self._intercept_p, self._slope_p = (coefficient_table[6],
                                            coefficient_table[7])
Example #16
0
 def summary(self, equation=None):
     """Print R's ``summary()`` of ``self._estimate``.

     `equation` is forwarded unchanged as the ``equation`` argument of the
     R call.
     """
     # Parenthesised single-argument form works on Python 2 and Python 3.
     print(r.summary(self._estimate, equation=equation))
Example #17
0
 def data_summary(self):
     """Print R's ``summary()`` of ``self.rdata``."""
     # Parenthesised single-argument form works on Python 2 and Python 3.
     print(r.summary(self.rdata))
Example #18
0
    path = "/install/git/Bioinformatics_paper/胶质母细胞瘤微环境预后相关基因的TCGA数据库挖掘/"
    r.setwd(path)
    # 读取处理好的数据
    sample = pd.read_csv(f"{path}sample.txt", sep="\t", index_col=0)
    sample_Group = sample["Stromal_Group"]
    # 读取处理好的基因表达数据
    HT_HG_U133A_sample = pd.read_csv(f"{path}HT_HG_U133A_sample.txt",
                                     sep="\t").dropna()

    ################# 方差分析(ANOVA) GeneExp_Subtype #################
    # https://www.bioinfo-scrounger.com/archives/588/
    with localconverter(ro.default_converter + pandas2ri.converter):
        ANOVA_data_R = ro.conversion.py2rpy(
            sample[["Stromal_score", "GeneExp_Subtype"]])
        print(
            r.summary(
                r.aov(r("Stromal_score~GeneExp_Subtype"), data=ANOVA_data_R)))

    ################# t检验 IDH1 #################
    r('''suppressMessages(library(MASS))''')
    with localconverter(ro.default_converter + pandas2ri.converter):
        Ttest_data_R = ro.conversion.py2rpy(
            sample[["Stromal_score", "IDH1"]].query("IDH1==1 or IDH1==0"))
        print(r["t.test"](r("Stromal_score~IDH1"), data=Ttest_data_R))

    ################# 生存分析 #################
    # https://www.jianshu.com/p/4ad9ba730719
    # r('''suppressMessages(library(survival))''')
    importr("survival")
    importr("ggfortify")
    with localconverter(ro.default_converter + pandas2ri.converter):
        # 构建生存对象