def ancova(lm1, lm2, names=('lm1', 'lm2')):
    """
    Compare the slopes and intercepts of two single-variable linear models.

    Both models must expose their data through `x` and `y` attributes; more
    complex models are not supported.

    Returns a tuple (pval of slope difference, pval of intercept difference).
    Recall that if the slope difference is significant, the intercept
    p-value can't really be interpreted.
    """
    # Follows the R recipe from the extremely useful blog:
    #   http://r-eco-evo.blogspot.com/2011/08/
    #   comparing-two-regression-slopes-by.html
    #
    #   model1 = aov(y~x*factor, data=df)
    #       (interaction term is on the 3rd line of summary(model1))
    #   model2 = aov(y~x+factor, data=df)
    #       (2nd line, "factor", of summary(model2) is the significance of
    #       the intercept difference)
    #   anova(model1, model2)
    #       (does removing the interaction term affect the model fit?)

    # Pool both data sets, tagging every observation with its model's name.
    tags_1 = [names[0] for _ in lm1.x]
    tags_2 = [names[1] for _ in lm2.x]
    labels = r.factor(np.array(tags_1 + tags_2))
    xi = np.concatenate((lm1.x, lm2.x))
    yi = np.concatenate((lm1.y, lm2.y))

    def _aov_pvalues(formula_text):
        # The workflow is to populate the formula's own environment with the
        # variables, run aov() on it and read the 'Pr(>F)' column of the
        # first summary table.
        fmla = robjects.Formula(formula_text)
        for key, value in (('xi', xi), ('yi', yi), ('labels', labels)):
            fmla.environment[key] = value
        fitted = r('aov(%s)' % fmla.r_repr())
        return r.summary(fitted)[0].rx2('Pr(>F)')

    # With the interaction term: third row is the xi:labels interaction.
    interaction_pval = _aov_pvalues('yi~xi*labels')[2]

    # Without the interaction term: second row is the labels factor.
    intercept_pval = _aov_pvalues('yi~xi+labels')[1]

    # TODO: anova(result1, result2)?
    return interaction_pval, intercept_pval
def through_the_origin(x, y):
    """Fit ``y ~ 0 + x`` (no intercept) with R's ``lm`` and summarise it.

    `x` and `y` are wrapped in R float vectors and placed in an R data frame.

    Returns a dict with the keys 'coefficient' and 'stderr' (elements 0 and 1
    of the summary's coefficient table) and 'r.squared' (element 0 of the
    summary's r.squared entry).
    """
    frame = DataFrame({'x': FloatVector(x), 'y': FloatVector(y)})
    fit_summary = r.summary(r.lm('y ~ 0 + x', frame))

    coef_table = fit_summary.rx2('coefficients')
    result = {}
    result['coefficient'] = coef_table[0]
    result['stderr'] = coef_table[1]
    result['r.squared'] = fit_summary.rx2('r.squared')[0]
    return result
def dirichletreg_df(prop_df, covar_df, formula, onevsrest_category=None,
                    return_reg_input=False):
    """Fit a Dirichlet regression with R's DirichletReg and tabulate it.

    Parameters
    ----------
    prop_df
        DataFrame of proportions; converted with ``DR_data`` and bound to the
        name ``y`` in the formula's environment, so `formula` should refer to
        the response as ``y``.
    covar_df
        DataFrame of covariates, concatenated column-wise with `prop_df` to
        form the regression input.
    formula
        R formula string handed to ``DirichReg``.
    onevsrest_category
        Optional column name of `prop_df`. When given, the model is fitted
        with ``model='alternative'`` and ``sub.comp`` set to the (1-based)
        position of that column.
    return_reg_input
        When true, also return the concatenated input frame.

    Returns
    -------
    ``coef_df`` or ``(dr_df, coef_df)``. ``coef_df`` has the columns
    'coefficient', 'se', 'zval', 'pval', 'compartment', 'variable',
    'significance' and, in the one-vs-rest case, 'coef_type'.

    Notes
    -----
    R output produced by ``summary`` is diverted to '/dev/null', so this
    assumes a platform where that path exists.
    """
    from rpy2.robjects import r, Formula
    from rpy2.robjects.packages import importr
    from rpy2.rinterface_lib.callbacks import logger as rpy2_logger

    dr = importr('DirichletReg')
    dr_df = pd.concat([prop_df, covar_df], axis=1)

    f = Formula(formula)

    # Silence rpy2 warnings (errors are still shown) while converting the
    # proportions, and always put the logger back the way it was, even if
    # the conversion raises.
    previous_level = rpy2_logger.level
    rpy2_logger.setLevel(logging.ERROR)
    try:
        f.environment['y'] = dr.DR_data(py2r(prop_df))
    finally:
        rpy2_logger.setLevel(previous_level)

    if onevsrest_category is None:
        fit = dr.DirichReg(f, py2r(dr_df))
    else:
        assert onevsrest_category in prop_df.columns, (
            'onevsrest_category must be a column of prop_df')
        # R indexes from 1.
        cat_index = prop_df.columns.tolist().index(onevsrest_category) + 1
        # 'sub.comp' is not a valid Python identifier, hence the dict.
        fit = dr.DirichReg(f, py2r(dr_df), model='alternative',
                           **{'sub.comp': cat_index})

    # summary() prints to the R console; divert that output and make sure
    # the diversion is removed again whatever happens.
    r.sink(file='/dev/null')
    try:
        u = r.summary(fit)
    finally:
        r.sink()
        if r('sink.number')()[0] > 0:
            r.sink()

    if onevsrest_category is None:
        varnames = u.rx2('varnames')
    else:
        varnames = [onevsrest_category] * 2

    n_vars = r2py(u.rx2('n.vars'))
    coef_mat = u.rx2('coef.mat')
    rows = r2py(r('rownames')(coef_mat))
    coef_df = r2py(r('as.data.frame')(coef_mat)).reset_index(drop=True)
    coef_df.columns = ['coefficient', 'se', 'zval', 'pval']
    coef_df['compartment'] = np.repeat(varnames, n_vars)
    coef_df['variable'] = rows
    coef_df['significance'] = bin_pval(coef_df.pval)

    if onevsrest_category is not None:
        coef_df['coef_type'] = np.repeat(['mean', 'precision'], n_vars)

    if return_reg_input:
        return dr_df, coef_df
    return coef_df
def clusterize_r_em(*args):
    """ Clustering and plotting with EM GMM

    Every positional argument is handed to R's ``Mclust`` (package mclust).
    For each one the model and its summary are printed, the model is plotted
    on a device opened with ``quartz("plot")`` and the function waits for a
    line of user input before moving on.

    Prints "You need rpy2" and exits the process with status -1 when rpy2
    cannot be imported.

    NOTE(review): quartz is presumably a macOS-only device - confirm before
    running this elsewhere.
    """
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
        rpy2.robjects.numpy2ri.activate()
    except ImportError:
        # Only a missing package is reported this way; other failures
        # propagate with their own traceback.
        print("You need rpy2")
        sys.exit(-1)

    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    r.library("mclust")
    for arg in args:
        model = r.Mclust(arg)
        print(model)
        print(r.summary(model))
        r.quartz("plot")
        r.plot(model, arg)
        print(read_line("any key to pass"))
def km_plot_data(self, name, time, censor, values):
    """Build Kaplan-Meier plot data for a high/low split of `values`.

    Samples are split at the mean of `values`: a sample is "high" when its
    value is greater than or equal to the mean. A survdiff test and a
    survfit curve are computed with ``self.surv`` on
    ``Surv(time, censor) ~ high``.

    `name` is accepted but not used by this method.

    Returns a dict with
      'high', 'low': lists of {'percent': ..., 'time': ...} points produced
                     by ``self.make_plottable_kms`` for each group;
      'p':           the survdiff p-value (chi-square survival function
                     with 1 degree of freedom), rounded to 4 significant
                     digits.

    Side effect: assigns 'km', 'times', 'res' and 'cols' in R's global
    environment.
    """
    values_df = pd.DataFrame(
        {'time': time, 'censor': censor, 'value': values}, dtype=float)
    values_df['high'] = values_df.value >= values_df.value.mean()

    data = {
        'time': robjects.FloatVector(values_df['time']),
        'censor': robjects.IntVector(values_df['censor']),
        'high': robjects.IntVector(values_df['high']),
    }
    df = robjects.DataFrame(data)
    surv_formula = robjects.Formula('Surv(time, censor) ~ high')

    # p value
    km_diff = self.surv.survdiff(surv_formula, data=df)
    pvalue = chi2.sf(km_diff.rx2('chisq')[0], 1)

    km = self.surv.survfit(surv_formula, data=df)

    # Evaluate the fitted curves at every observed time, on the R side.
    r.assign('km', km)
    r.assign('times', data['time'])
    r.assign('res', r('summary(km, times=times)'))
    # Elements 2-6 and 8-11 of the summary are combined into a data frame
    # (presumably these include the time, surv and strata entries used
    # below - TODO confirm against the survival package's summary.survfit).
    cols = r('lapply(c(2:6, 8:11), function(x) res[x])')
    r.assign('cols', cols)
    km_results = pd.DataFrame(r('do.call(data.frame, cols)'))

    def _curve(stratum):
        # Points of one group's curve, in the shape the caller expects.
        rows = km_results[km_results['strata'] == stratum]
        times, percents = self.make_plottable_kms(rows['time'], rows['surv'])
        return [{'percent': percent, 'time': when}
                for percent, when in zip(percents, times)]

    high = _curve('high=1')
    low = _curve('high=0')

    return {'high': high, 'low': low, 'p': float('%.4g' % pvalue)}
def clusterize_r_em(*args, **kwargs):
    """ Clustering and plotting with EM GMM

    Every positional argument is handed to R's ``Mclust`` (package mclust).
    For each one the model and its summary are printed, the model is plotted
    on a device opened with ``quartz("plot")`` and the function waits for a
    line of user input before moving on.

    Keyword arguments:
      clf_on_pca -- when true, each argument is first projected onto its
                    first 2 principal components (scikit-learn ``PCA``) and
                    the projection is what gets clustered and plotted.

    Prints a message and exits the process with status -1 when rpy2 (or,
    if `clf_on_pca` is requested, scikit-learn) cannot be imported.

    NOTE(review): quartz is presumably a macOS-only device - confirm before
    running this elsewhere.
    """
    try:
        from rpy2.robjects import r
        import rpy2.robjects.numpy2ri
        rpy2.robjects.numpy2ri.activate()
    except ImportError:
        # Only a missing package is reported this way; other failures
        # propagate with their own traceback.
        print("You need rpy2")
        sys.exit(-1)

    use_pca = kwargs.get('clf_on_pca', False)
    if use_pca:
        # scikit-learn is only needed for the optional PCA step, and a
        # missing scikit-learn should not be reported as a missing rpy2.
        try:
            from sklearn.decomposition import PCA
        except ImportError:
            print("You need scikit-learn to use clf_on_pca")
            sys.exit(-1)

    # raw_input only exists on Python 2; input is its Python 3 equivalent.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    r.library("mclust")
    for arg in args:
        if use_pca:
            pca = PCA(2)
            arg = pca.fit(arg).transform(arg)
        model = r.Mclust(arg)
        print(model)
        print(r.summary(model))
        r.quartz("plot")
        r.plot(model, arg)
        print(read_line("press any key to pass"))
def _r_tobit(self, data, xvars, rbar):
    """ Estimate tobit with function from r

    `data` is converted to an R data frame and assigned to the name 'data'
    in R. A ``vglm`` model of ``OverallRank`` on the variables in `xvars`
    is then fitted with a tobit family censored at ``Lower=1`` and
    ``Upper=rbar``.

    Returns a dict with
      'beta':  pandas Series holding the first column of the coefficient
               matrix, indexed by 'const' followed by `xvars`;
      'sigma': the element in row 0, column 1 of the coefficient matrix.

    The R summary is printed when ``self.opts['verbose']`` is true.
    """
    r.assign('data', com.convert_to_r_dataframe(data))
    rhs = '+'.join(xvars)
    # Assemble the R call from adjacent literals so that no source-code
    # indentation ends up inside the command text.
    command = (
        "vglm(OverallRank ~ " + rhs + ", "
        "family=tobit(Upper=" + str(rbar) + ", Lower=1), "
        "data=data, crit='coeff')"
    )
    model = r(command)
    if self.opts['verbose']:
        print(r.summary(model))
    out = np.array(r.coef(model, matrix=True))
    # New list, so the caller's xvars is left untouched.
    index = ['const'] + list(xvars)
    beta = pd.Series(out[:, 0], index=index)
    return {'beta': beta, 'sigma': out[0, 1]}
def method_spline(rvar, train, test):
    """ B-splines with interaction

    Fits ``lm`` in R on `train` with `rvar` as the response, a 6-df B-spline
    of OverallRank, `treat`, their interaction and no intercept (plus `year`
    when `rvar` is 'Tuition'). Prints the model's r.squared and passes the
    training predictions to ``analytics``; test-set predictions are also
    passed on unless `rvar` is "UndergraduatemedianGPA".
    """
    print("Splines")

    spline_terms = (' ~ bs(OverallRank, df=6) + treat + '
                    'treat:bs(OverallRank, df=6) - 1')
    formula = rvar + spline_terms
    if rvar == 'Tuition':
        formula += ' + year'

    model = r.lm(formula, data=train)
    model_summary = r.summary(model)
    print(model_summary.rx2('r.squared'))

    observed_train = train[rvar]
    fitted_train = np.array(r.predict(model))
    analytics(rvar, 'Training', observed_train, fitted_train)

    if rvar != "UndergraduatemedianGPA":
        observed_test = test[rvar]
        fitted_test = np.array(r.predict(model, newdata=test))
        analytics(rvar, 'Testing', observed_test, fitted_test)
    print()
def fit(data, outpath=None, verbosity=0, **kwargs):
    """Estimate a multilevel model with R's ``stan_glmer`` (rstanarm).

    `data` is converted to an R object and passed as the ``data`` argument;
    ``na.action`` is set to 'na.omit'. All remaining keyword arguments are
    forwarded to ``stan_glmer`` unchanged.

    Args:
        data: input data, converted with ``pandas2ri.py2ri``.
        outpath: when not None, the fitted model is saved in this directory
            as ``mlm_<outcome>.rds``, where the outcome name is taken from
            the fitted model's formula.
        verbosity: truthy prints the fit and classification diagnostics,
            > 1 also prints the R summary, > 2 also calls ``inspect``.
        **kwargs: forwarded to ``stan_glmer``.

    Returns:
        The fitted R model object.

    Todos:
        TODO: before converting data to r_data, filter out columns not
            appearing in formula.
    """
    # NOTE(review): the original indentation of this function was lost; the
    # diagnostics below are assumed to sit inside `if verbosity:` - confirm.
    r_data = pandas2ri.py2ri(data)
    kwargs['data'] = r_data
    kwargs['na.action'] = 'na.omit'
    fit = r.stan_glmer(**kwargs)  # TODO: select appropriate prior.
    if verbosity:
        print(fit)
        # print(fit.rx2('linear.predictors'))
        # Fitted values are thresholded at 0.5 to get class predictions.
        probs = np.array(fit.rx2('fitted.values'))
        preds = (probs > 0.5).astype(int)
        y = np.array(fit.rx2('y'))
        print('Number of observations: {0}'.format(y.shape[0]))
        print('Distribution of y:\n{0}'.format(np.bincount(y)))
        print('Classification report:')
        print(classification_report(y, preds))
        print('R2 Score:\n', r2_score(y, probs))
        print('Accuracy:\n', accuracy_score(y, preds))
        # r.X11()
        # r.plot(fit)
        # r.posterior_interval(fit, prob=0.95, pars='urban')
        # model evaluation
        # y_draws = r.posterior_predict(fit)
        # preds = r.predict(fit)
        # np.array(r['as.matrix'](fit.rx2('x'))).shape  # design matrix
        # np.array(r['as.matrix'](fit, pars='urban')).shape  # posterior parameter draws
    if verbosity > 1:
        print(r.summary(fit))
    if verbosity > 2:
        inspect(fit)
    if outpath is not None:
        # Only the first value returned by get_terms (the outcome) is used.
        outcome, _, _, _ = get_terms(fit.rx2('formula')[0])
        fname = 'mlm_{0}.rds'.format(outcome)
        save(fit, os.path.join(outpath, fname))
    return fit
def plot_forecast(data, fcast):
    """Plot the observed series together with a forecast and its interval.

    `fcast[3]`, `fcast[4]` and `fcast[5]` are used as the point forecast and
    the lower and upper interval bounds. They are placed on a weekly date
    index generated from the last date in `data.index`, with the first
    generated date dropped. `data.casos_est` is drawn as the observed series.
    """
    horizon = len(fcast[3])
    future = pd.date_range(start=data.index.max(), periods=horizon + 1,
                           freq='W')[1:]
    point, lower, upper = (pd.Series(fcast[pos], index=future)
                           for pos in (3, 4, 5))

    plt.plot(data.index, data.casos_est, color='b', alpha=0.5)
    plt.plot(point.index, point.values, color='red')
    plt.fill_between(point.index, lower.values, upper.values,
                     alpha=0.2, color='red')


if __name__ == "__main__":
    # Nova Iguaçu: 3303609
    data = get_alerta_table(3304557)
    # Load the tscount package and look up its tsglm function.
    tscount = importr('tscount')
    tsglm = r('tsglm')
    model = build_model(data)
    print(r.summary(model))
    r.plot(model)
    # fcast = forecast.forecast(model, h=5, level=95.0)
    # print(fcast[3], fcast[4], fcast[5])
    # plot_forecast(data=data, fcast=fcast)
    # plt.show()
def data_summary(self):
    """Print R's ``summary()`` of ``self.rdata``."""
    # Function-call form of print: valid on Python 3 and, with a single
    # argument, prints the same thing on Python 2.
    print(r.summary(self.rdata))
def summary(self, equation=None):
    """Print R's ``summary()`` of ``self._estimate``.

    `equation` is forwarded to R's ``summary`` as the ``equation`` argument.
    """
    # Function-call form of print: valid on Python 3 and, with a single
    # argument, prints the same thing on Python 2.
    print(r.summary(self._estimate, equation=equation))
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects import r as R
import pandas as pd

# Switch on the pandas conversion so DataFrames can be passed to R calls.
pandas2ri.activate()
#R = ro.r

# Small test DataFrame with three integer columns of nine rows each.
data = {
    'a': list(range(1, 10)),
    'b': list(range(11, 20)),
    # NOTE(review): 26 appears twice here; kept exactly as in the original.
    'c': [21, 22, 23, 24, 25, 26, 26, 28, 29],
}
test = pd.DataFrame(data)
print(test.head())

# Regress a on b in R and show the coefficient table of the summary.
M = R.lm('a ~ b', data=test)
coefficients = R.summary(M).rx2('coefficients')
print(coefficients)
def __init__(self, formula, **kwargs):
    """
    Class for managing linear regression in R.

    Data are given as keyword arguments. Each one is converted to a NumPy
    array, stored in R's global environment under the keyword's name and
    also set as an attribute of this object. For example, the kwarg
    `x=[1,2,3,4]` adds those four numbers to R's global env as `x`, which
    can then be referenced from the formula.

    `formula` is a string passed verbatim to R's `lm()` function.

    Example usage::

        >>> x = [1, 2, 3, 4]
        >>> y = [1.2, 3, 7, 10]
        >>> m = LinearRegression(x=x, y=y, formula='y~x')
        >>> m.slope
        3.0399999999999996

        >>> m.intercept
        -2.299999999999998

        >>> m.adj_r_squared
        0.97221750212404412

        >>> m.slope_pval(0)
        0.0093041159117684229

        >>> m.intercept_pval(0)
        0.10459053583417365

        >>> # Variables accessible as NumPy arrays
        >>> m.x
        array([1, 2, 3, 4])

    Cross-check with scipy.stats.linregress::

        >>> from scipy.stats import linregress as scipy_linregress
        >>> results = scipy_linregress(x, y)
        >>> eps = 1e-15
        >>> assert abs(results[0] - m.slope) < eps
        >>> eps = 1e-10
        >>> assert abs(results[1] - m.intercept) < eps
        >>> eps = 1e-15
        >>> assert abs(results[2] ** 2 - m.r_squared) < eps
        >>> eps = 1e-15
        >>> assert abs(results[3] - m.slope_pval(0)) < eps

    TODO:
        - support for more complex models (requires examining the coeffs
          matrix to see what's included)
    """
    for name, values in kwargs.items():
        as_array = np.array(values)
        robjects.globalenv[name] = as_array
        setattr(self, name, as_array)

    self.lm = r.lm(formula)
    self.summary = r.summary(self.lm)

    # Elements 6 and 7 of the coefficient table are kept as the intercept
    # and slope p-values.
    coeffs = self.summary.rx2('coefficients')
    p_values = coeffs[6], coeffs[7]
    self._intercept_p, self._slope_p = p_values
path = "/install/git/Bioinformatics_paper/胶质母细胞瘤微环境预后相关基因的TCGA数据库挖掘/" r.setwd(path) # 读取处理好的数据 sample = pd.read_csv(f"{path}sample.txt", sep="\t", index_col=0) sample_Group = sample["Stromal_Group"] # 读取处理好的基因表达数据 HT_HG_U133A_sample = pd.read_csv(f"{path}HT_HG_U133A_sample.txt", sep="\t").dropna() ################# 方差分析(ANOVA) GeneExp_Subtype ################# # https://www.bioinfo-scrounger.com/archives/588/ with localconverter(ro.default_converter + pandas2ri.converter): ANOVA_data_R = ro.conversion.py2rpy( sample[["Stromal_score", "GeneExp_Subtype"]]) print( r.summary( r.aov(r("Stromal_score~GeneExp_Subtype"), data=ANOVA_data_R))) ################# t检验 IDH1 ################# r('''suppressMessages(library(MASS))''') with localconverter(ro.default_converter + pandas2ri.converter): Ttest_data_R = ro.conversion.py2rpy( sample[["Stromal_score", "IDH1"]].query("IDH1==1 or IDH1==0")) print(r["t.test"](r("Stromal_score~IDH1"), data=Ttest_data_R)) ################# 生存分析 ################# # https://www.jianshu.com/p/4ad9ba730719 # r('''suppressMessages(library(survival))''') importr("survival") importr("ggfortify") with localconverter(ro.default_converter + pandas2ri.converter): # 构建生存对象