def test_zero_penalty():
    """GLMGam with all-zero ``alpha`` must predict like a GLM on ``poly.basis``."""
    _x, y, poly = multivariate_sample_data()

    # Additive model fitted with every penalty weight set to zero.
    zero_alphas = [0, 0]
    gam_model = GLMGam(y, smoother=poly, alpha=zero_alphas)
    gam_fit = gam_model.fit()
    gam_predictions = gam_fit.predict()

    # Reference fit: a plain GLM on the smoother's basis matrix.
    glm_fit = GLM(y, poly.basis).fit()
    glm_predictions = glm_fit.predict()

    assert_allclose(glm_predictions, gam_predictions)
def fit_logistic(X_hold, Y_hold, Firth=False, resBase=None, LRtest=True):
    """
    Fit a logistic regression model, either standard or with Firth's method.

    Parameters
    ----------
    X_hold : 2-D array
        Design matrix. The likelihood-ratio null model keeps only column 0,
        so column 0 is presumably the intercept -- NOTE(review): confirm
        against callers.
    Y_hold : array
        Response passed to ``GLM`` / ``fit_firth``.
    Firth : bool
        If False, fit a binomial ``GLM``. If True, fit with ``fit_firth`` and
        overwrite the statistics stored on ``resBase``.
    resBase : GLMResultsWrapper or None
        Result of a previous (non-Firth) call; required when ``Firth`` is
        True because it is reused as the container for the Firth results.
    LRtest : bool
        If True (Firth only), also fit the column-0-only null model and store
        ``llnull``, ``lrstat`` and ``lrt_pval`` on the result.

    Returns
    -------
    res : GLMResultsWrapper
        For the standard fit, the GLM result with ``aicc`` and ``bic`` set
        from ``df_model + 1`` parameters. For the Firth fit, ``resBase`` with
        ``params``, ``bse``, ``llf``, ``aicc``, ``aic``, ``bic``, ``pvalues``
        and ``predict`` replaced by the Firth values.

    Notes
    -----
    Invalid ``resBase`` terminates the process via ``sys.exit`` (status 1 if
    missing, 2 if of the wrong type) after writing a message to stderr.
    """
    eval_measures = statsmodels.tools.eval_measures

    if not Firth:
        # XXX Confirm this with logistic using older XXXX
        res = GLM(Y_hold, X_hold, family=families.Binomial()).fit()
        n_params = res.df_model + 1
        # AICc adjustment
        res.aicc = eval_measures.aicc(res.llf, nobs=res.nobs, df_modelwc=n_params)
        # Correct BIC
        res.bic = eval_measures.bic(res.llf, nobs=res.nobs, df_modelwc=n_params)
        return res

    if resBase is None:
        sys.stderr.write('resBase must be provided to do Firth regression\n')
        sys.exit(1)
    if not isinstance(
            resBase,
            statsmodels.genmod.generalized_linear_model.GLMResultsWrapper):
        sys.stderr.write('resBase must be type statsmodels.genmod.generalized_linear_model.GLMResultsWrapper\n')
        sys.exit(2)
    res = resBase

    n_obs = len(Y_hold)
    n_cols = np.shape(X_hold)[1]

    # Do Firth's logistic regression
    (rint, rbeta, rbse, rfitll, pi) = fit_firth(Y_hold, X_hold, start_vec=None)

    if LRtest:
        # Null model: drop every column except column 0.
        null_X = np.delete(arr=X_hold, obj=np.arange(1, n_cols), axis=1)
        (_, _, _, null_fitll, _) = fit_firth(Y_hold, null_X, start_vec=None)
        lrstat = -2. * (null_fitll - rfitll)

        # The test has one degree of freedom per column removed from the
        # full model, not a fixed df of 1.
        n_dropped = n_cols - 1
        lrt_pvalue = 1.
        # A non-positive statistic indicates non-convergence; keep p = 1.
        if lrstat > 0. and n_dropped > 0:
            lrt_pvalue = stats.chi2.sf(lrstat, n_dropped)

        res.llnull = null_fitll
        res.lrstat = lrstat
        res.lrt_pval = lrt_pvalue

    # Information criteria for the Firth model (AICc, AIC, BIC)
    aicc = eval_measures.aicc(rfitll, nobs=n_obs, df_modelwc=n_cols)
    aic = eval_measures.aic(rfitll, nobs=n_obs, df_modelwc=n_cols)
    bic = eval_measures.bic(rfitll, nobs=n_obs, df_modelwc=n_cols)

    # Store parameters, standard errors, likelihoods, and statistics
    res.params = np.concatenate([np.array([rint]), np.array(rbeta)])
    res.bse = rbse
    res.llf = rfitll
    res.aicc = aicc
    res.aic = aic
    res.bic = bic

    # Wald p-values: the survival function avoids the cancellation in
    # ``1 - cdf`` for very small p-values.
    res.pvalues = stats.chi2.sf((res.params / res.bse) ** 2, df=1)

    # Add predicted y. Note this replaces the ``predict`` attribute of the
    # result object with the value returned by ``fit_firth``.
    res.predict = pi
    return res
def test_zero_penalty():
    """Compare GLMGam fitted with ``alpha=[0, 0]`` against GLM on ``poly.basis``."""
    sample = multivariate_sample_data()
    y, poly = sample[1], sample[2]

    # Penalized-spline model with both penalty weights set to zero.
    gam_result = GLMGam(y, smoother=poly, alpha=[0, 0]).fit()
    predicted_by_gam = gam_result.predict()

    # Plain GLM on the same basis columns serves as the reference.
    glm_result = GLM(y, poly.basis).fit()
    predicted_by_glm = glm_result.predict()

    assert_allclose(predicted_by_glm, predicted_by_gam)
def test_predict(self):
    """Exercise ``predict`` with exposure and offset on Poisson GLM fits."""
    np.random.seed(382304)

    # Training data (random draws happen in a fixed order for the seed).
    counts = np.random.randint(0, 10, 100)
    design = np.random.normal(size=(100, 3))
    train_exposure = np.random.uniform(1, 2, 100)
    poisson_family = sm.families.Poisson()
    exposure_fit = GLM(counts, design, family=poisson_family,
                       exposure=train_exposure).fit()

    # New observations for out-of-sample prediction.
    new_design = np.random.normal(size=(10, 3))
    new_exposure = np.random.uniform(1, 2, 10)

    # Doubling exposure time should double expected response
    base_pred = exposure_fit.predict(exog=new_design, exposure=new_exposure)
    doubled_pred = exposure_fit.predict(exog=new_design,
                                        exposure=2 * new_exposure)
    assert_almost_equal(doubled_pred, 2 * base_pred)

    # Check exposure defaults
    implicit = exposure_fit.predict()
    with_exposure = exposure_fit.predict(exposure=train_exposure)
    fully_explicit = exposure_fit.predict(exog=design,
                                          exposure=train_exposure)
    assert_almost_equal(implicit, with_exposure)
    assert_almost_equal(with_exposure, fully_explicit)

    # Check offset defaults
    train_offset = np.random.uniform(1, 2, 100)
    offset_fit = GLM(counts, design, offset=train_offset,
                     family=sm.families.Poisson()).fit()
    implicit = offset_fit.predict()
    with_offset = offset_fit.predict(offset=train_offset)
    fully_explicit = offset_fit.predict(exog=design, offset=train_offset)
    assert_almost_equal(implicit, with_offset)
    assert_almost_equal(with_offset, fully_explicit)

    # Check that offset shifts the linear predictor
    plain_fit = GLM(counts, design, family=sm.families.Poisson()).fit()
    new_offset = np.random.uniform(1, 2, 10)
    linear_once = plain_fit.predict(exog=new_design, offset=new_offset,
                                    linear=True)
    linear_twice = plain_fit.predict(exog=new_design, offset=2 * new_offset,
                                     linear=True)
    assert_almost_equal(linear_twice, linear_once + new_offset)
def test_predict(self):
    """Verify exposure/offset behaviour of ``predict`` for Poisson GLMs."""
    np.random.seed(382304)
    n_fit, n_new, n_features = 100, 10, 3

    # Simulated fitting data; the draw order below is fixed by the seed.
    y_obs = np.random.randint(0, 10, n_fit)
    x_obs = np.random.normal(size=(n_fit, n_features))
    expo_obs = np.random.uniform(1, 2, n_fit)
    res_exposure = GLM(
        y_obs, x_obs, family=sm.families.Poisson(), exposure=expo_obs
    ).fit()

    x_new = np.random.normal(size=(n_new, n_features))
    expo_new = np.random.uniform(1, 2, n_new)

    # Doubling exposure time should double expected response
    single = res_exposure.predict(exog=x_new, exposure=expo_new)
    double = res_exposure.predict(exog=x_new, exposure=2 * expo_new)
    assert_almost_equal(double, 2 * single)

    # Check exposure defaults
    default_call = res_exposure.predict()
    exposure_only = res_exposure.predict(exposure=expo_obs)
    exog_and_exposure = res_exposure.predict(exog=x_obs, exposure=expo_obs)
    assert_almost_equal(default_call, exposure_only)
    assert_almost_equal(exposure_only, exog_and_exposure)

    # Check offset defaults
    off_obs = np.random.uniform(1, 2, n_fit)
    res_offset = GLM(
        y_obs, x_obs, offset=off_obs, family=sm.families.Poisson()
    ).fit()
    default_call = res_offset.predict()
    offset_only = res_offset.predict(offset=off_obs)
    exog_and_offset = res_offset.predict(exog=x_obs, offset=off_obs)
    assert_almost_equal(default_call, offset_only)
    assert_almost_equal(offset_only, exog_and_offset)

    # Check that offset shifts the linear predictor
    res_plain = GLM(y_obs, x_obs, family=sm.families.Poisson()).fit()
    off_new = np.random.uniform(1, 2, n_new)
    eta_one = res_plain.predict(exog=x_new, offset=off_new, linear=True)
    eta_two = res_plain.predict(exog=x_new, offset=2 * off_new, linear=True)
    assert_almost_equal(eta_two, eta_one + off_new)
def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7,
              nbins=30, alpha=0):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array_like
        A vector of Z-scores. It is converted to a float ndarray, so lists
        and integer arrays are accepted.
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : int
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : int
        The number of equally spaced bin edges between the smallest and
        largest Z-score (hence ``nbins - 1`` bins) used for estimating the
        marginal density of Z-scores.
    alpha : float
        If positive, use Poisson ridge regression with parameter alpha to
        estimate the marginal density of Z-scores; otherwise the Poisson
        regression is fitted without regularization.

    Returns
    -------
    fdr : array_like
        A vector of FDR values, clipped to the interval [0, 1]

    References
    ----------
    B Efron (2008).  Microarrays, Empirical Bayes, and the Two-Groups
    Model.  Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> null = EmpiricalNull(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Work on a float ndarray: a plain list would fail at ``zscores**2`` and
    # an integer array would fail at the in-place division of the
    # Vandermonde matrix below.
    zscores = np.asarray(zscores, dtype=float)

    # Bins for Poisson modeling of the marginal Z-score density
    minz = zscores.min()
    maxz = zscores.max()
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Rescale the design matrix; columns with (near-)zero spread, such as
    # the constant column, are left unscaled.
    sd = dmat.std(0)
    ii = sd > 1e-8
    dmat[:, ii] /= sd[ii]

    # Starting values from a least-squares fit to the log counts
    start = OLS(np.log(1 + zhist), dmat).fit().params

    # Poisson regression
    model = GLM(zhist, dmat, family=families.Poisson())
    if alpha > 0:
        md = model.fit_regularized(L1_wt=0, alpha=alpha, start_params=start)
    else:
        md = model.fit(start_params=start)

    # The design matrix for all Z-scores, scaled like the bin-center matrix
    dmat_full = np.vander(zscores, deg + 1)
    dmat_full[:, ii] /= sd[ii]

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr
def interactplot(x1, x2, y, data=None, filled=False, cmap="RdBu_r",
                 colorbar=True, levels=30, logistic=False,
                 contour_kws=None, scatter_kws=None, ax=None, **kwargs):
    """Visualize a continuous two-way interaction with a contour plot.

    Parameters
    ----------
    x1, x2, y, strings or array-like
        Either the two independent variables and the dependent variable,
        or keys to extract them from `data`
    data : DataFrame
        Pandas DataFrame with the data in the columns.
    filled : bool
        Whether to plot with filled or unfilled contours
    cmap : matplotlib colormap
        Colormap to represent yhat in the contour plot.
    colorbar : bool
        Whether to draw the colorbar for interpreting the color values.
    levels : int or sequence
        Number or position of contour plot levels.
    logistic : bool
        Fit a logistic regression model instead of linear regression.
    contour_kws : dictionary
        Keyword arguments for contour[f](). The dictionary is copied, not
        modified.
    scatter_kws : dictionary
        Keyword arguments for plot(). The dictionary is copied, not
        modified.
    ax : matplotlib axis
        Axis to draw plot in.
    **kwargs
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    ax : Matplotlib axis
        Axis with the contour plot.

    Raises
    ------
    ImportError
        If statsmodels is not available.

    """
    msg = (
        "The `interactplot` function has been deprecated and will be removed "
        "in a future version."
    )
    warnings.warn(msg, UserWarning)

    if not _has_statsmodels:
        raise ImportError("The `interactplot` function requires statsmodels")

    from statsmodels.regression.linear_model import OLS
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.families import Binomial

    # Handle the form of the data
    if data is not None:
        x1 = data[x1]
        x2 = data[x2]
        y = data[y]

    # Use the inputs' ``name`` attributes, when present, as labels
    xlabel = getattr(x1, "name", None)
    ylabel = getattr(x2, "name", None)
    clabel = getattr(y, "name", None)

    x1 = np.asarray(x1)
    x2 = np.asarray(x2)
    y = np.asarray(y)

    # Initialize the scatter keyword dictionary (copy so the caller's
    # dictionary is never mutated)
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    if not ("color" in scatter_kws or "c" in scatter_kws):
        scatter_kws["color"] = "#222222"
    if "alpha" not in scatter_kws:
        scatter_kws["alpha"] = 0.75

    # Initialize the contour keyword dictionary (also copied)
    contour_kws = {} if contour_kws is None else dict(contour_kws)

    # Initialize the axis
    if ax is None:
        ax = plt.gca()

    # Plot once to let matplotlib sort out the axis limits
    ax.plot(x1, x2, "o", **scatter_kws)

    # Find the plot limits
    x1min, x1max = ax.get_xlim()
    x2min, x2max = ax.get_ylim()

    # Make the grid for the contour plot
    x1_points = np.linspace(x1min, x1max, 100)
    x2_points = np.linspace(x2min, x2max, 100)
    xx1, xx2 = np.meshgrid(x1_points, x2_points)

    # Fit the model with an interaction
    X = np.c_[np.ones(x1.size), x1, x2, x1 * x2]
    if logistic:
        lm = GLM(y, X, family=Binomial()).fit()
    else:
        lm = OLS(y, X).fit()

    # Evaluate the model on the grid with a single predict call on the
    # grid's design matrix instead of one call per grid point
    grid_x1 = xx1.ravel()
    grid_x2 = xx2.ravel()
    grid_X = np.c_[np.ones(grid_x1.size), grid_x1, grid_x2, grid_x1 * grid_x2]
    yhat = np.asarray(lm.predict(grid_X)).reshape(xx1.shape)

    # Default color limits put the midpoint at mean(y): both limits are
    # placed the same distance ``delta`` from the mean
    y_bar = y.mean()
    c_min = min(np.percentile(y, 2), yhat.min())
    c_max = max(np.percentile(y, 98), yhat.max())
    delta = max(c_max - y_bar, y_bar - c_min)
    c_min, c_max = y_bar - delta, y_bar + delta

    contour_kws.setdefault("vmin", c_min)
    contour_kws.setdefault("vmax", c_max)

    # Draw the contour plot
    func_name = "contourf" if filled else "contour"
    contour = getattr(ax, func_name)
    c = contour(xx1, xx2, yhat, levels, cmap=cmap, **contour_kws)

    # Draw the scatter again so it's visible
    ax.plot(x1, x2, "o", **scatter_kws)

    # Draw a colorbar, maybe
    if colorbar:
        bar = plt.colorbar(c)

    # Label the axes
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if clabel is not None and colorbar:
        clabel = "P(%s)" % clabel if logistic else clabel
        bar.set_label(clabel, labelpad=15, rotation=270)

    return ax
def interactplot(x1, x2, y, data=None, filled=False, cmap="RdBu_r",
                 colorbar=True, levels=30, logistic=False,
                 contour_kws=None, scatter_kws=None, ax=None, **kwargs):
    """Visualize a continuous two-way interaction with a contour plot.

    Parameters
    ----------
    x1, x2, y, strings or array-like
        Either the two independent variables and the dependent variable,
        or keys to extract them from `data`
    data : DataFrame
        Pandas DataFrame with the data in the columns.
    filled : bool
        Whether to plot with filled or unfilled contours
    cmap : matplotlib colormap
        Colormap to represent yhat in the contour plot.
    colorbar : bool
        Whether to draw the colorbar for interpreting the color values.
    levels : int or sequence
        Number or position of contour plot levels.
    logistic : bool
        Fit a logistic regression model instead of linear regression.
    contour_kws : dictionary
        Keyword arguments for contour[f](). The dictionary is copied, not
        modified.
    scatter_kws : dictionary
        Keyword arguments for plot(). The dictionary is copied, not
        modified.
    ax : matplotlib axis
        Axis to draw plot in.
    **kwargs
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    ax : Matplotlib axis
        Axis with the contour plot.

    Raises
    ------
    ImportError
        If statsmodels is not available.

    """
    msg = (
        "The `interactplot` function has been deprecated and will be removed "
        "in a future version.")
    warnings.warn(msg, UserWarning)

    if not _has_statsmodels:
        raise ImportError("The `interactplot` function requires statsmodels")

    from statsmodels.regression.linear_model import OLS
    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.families import Binomial

    # Handle the form of the data
    if data is not None:
        x1 = data[x1]
        x2 = data[x2]
        y = data[y]

    # Use the inputs' ``name`` attributes, when present, as labels
    xlabel = getattr(x1, "name", None)
    ylabel = getattr(x2, "name", None)
    clabel = getattr(y, "name", None)

    x1 = np.asarray(x1)
    x2 = np.asarray(x2)
    y = np.asarray(y)

    # Initialize the scatter keyword dictionary (copy so the caller's
    # dictionary is never mutated)
    scatter_kws = {} if scatter_kws is None else dict(scatter_kws)
    if not ("color" in scatter_kws or "c" in scatter_kws):
        scatter_kws["color"] = "#222222"
    if "alpha" not in scatter_kws:
        scatter_kws["alpha"] = 0.75

    # Initialize the contour keyword dictionary (also copied)
    contour_kws = {} if contour_kws is None else dict(contour_kws)

    # Initialize the axis
    if ax is None:
        ax = plt.gca()

    # Plot once to let matplotlib sort out the axis limits
    ax.plot(x1, x2, "o", **scatter_kws)

    # Find the plot limits
    x1min, x1max = ax.get_xlim()
    x2min, x2max = ax.get_ylim()

    # Make the grid for the contour plot
    x1_points = np.linspace(x1min, x1max, 100)
    x2_points = np.linspace(x2min, x2max, 100)
    xx1, xx2 = np.meshgrid(x1_points, x2_points)

    # Fit the model with an interaction
    X = np.c_[np.ones(x1.size), x1, x2, x1 * x2]
    if logistic:
        lm = GLM(y, X, family=Binomial()).fit()
    else:
        lm = OLS(y, X).fit()

    # Evaluate the model on the grid with a single predict call on the
    # grid's design matrix instead of one call per grid point
    grid_x1 = xx1.ravel()
    grid_x2 = xx2.ravel()
    grid_X = np.c_[np.ones(grid_x1.size), grid_x1, grid_x2, grid_x1 * grid_x2]
    yhat = np.asarray(lm.predict(grid_X)).reshape(xx1.shape)

    # Default color limits put the midpoint at mean(y): both limits are
    # placed the same distance ``delta`` from the mean
    y_bar = y.mean()
    c_min = min(np.percentile(y, 2), yhat.min())
    c_max = max(np.percentile(y, 98), yhat.max())
    delta = max(c_max - y_bar, y_bar - c_min)
    c_min, c_max = y_bar - delta, y_bar + delta

    contour_kws.setdefault("vmin", c_min)
    contour_kws.setdefault("vmax", c_max)

    # Draw the contour plot
    func_name = "contourf" if filled else "contour"
    contour = getattr(ax, func_name)
    c = contour(xx1, xx2, yhat, levels, cmap=cmap, **contour_kws)

    # Draw the scatter again so it's visible
    ax.plot(x1, x2, "o", **scatter_kws)

    # Draw a colorbar, maybe
    if colorbar:
        bar = plt.colorbar(c)

    # Label the axes
    if xlabel is not None:
        ax.set_xlabel(xlabel)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    if clabel is not None and colorbar:
        clabel = "P(%s)" % clabel if logistic else clabel
        bar.set_label(clabel, labelpad=15, rotation=270)

    return ax