def __init__(self):
    # generate artificial data
    np.random.seed(98765678)
    nobs = 200
    rvs = np.random.randn(nobs, 6)
    data_exog = rvs
    data_exog = sm.add_constant(data_exog)
    xbeta = 1 + 0.1 * rvs.sum(1)
    data_endog = np.random.poisson(np.exp(xbeta))

    # estimate discretemod.Poisson as benchmark
    self.res_discrete = Poisson(data_endog, data_exog).fit(disp=0)

    mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
    self.res_glm = mod_glm.fit()

    # estimate generic MLE
    #self.mod = PoissonGMLE(data_endog, data_exog)
    #res = self.mod.fit()
    offset = self.res_discrete.params[0] * data_exog[:, 0]  #1d ???
    #self.res = PoissonOffsetGMLE(data_endog, data_exog[:, 1:],
    #                             offset=offset).fit(start_params=np.ones(6)/2.,
    #                                                method='nm')
    modo = PoissonOffsetGMLE(data_endog, data_exog[:, 1:], offset=offset)
    self.res = modo.fit(start_params=0.9 * self.res_discrete.params[1:],
                        method='nm', disp=0)
def checkOLS(self, exog, endog, x, y):
    try:
        import scikits.statsmodels.api as sm
    except ImportError:
        import scikits.statsmodels as sm

    reference = sm.OLS(endog, sm.add_constant(exog)).fit()
    result = ols(y=y, x=x)

    assert_almost_equal(reference.params, result._beta_raw)
    assert_almost_equal(reference.df_model, result._df_model_raw)
    assert_almost_equal(reference.df_resid, result._df_resid_raw)
    assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
    assert_almost_equal(reference.pvalues, result._p_value_raw)
    assert_almost_equal(reference.rsquared, result._r2_raw)
    assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
    assert_almost_equal(reference.resid, result._resid_raw)
    assert_almost_equal(reference.bse, result._std_err_raw)
    assert_almost_equal(reference.t(), result._t_stat_raw)
    assert_almost_equal(reference.cov_params(), result._var_beta_raw)
    assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

    _check_non_raw_results(result)
def setupClass(cls):
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog)
    res2 = Spector()
    res2.probit()
    cls.res2 = res2
    cls.res1 = Probit(data.endog, data.exog).fit(method="ncg", disp=0,
                                                 avextol=1e-8)
def linmod(y, x, weights=None, sigma=None, add_const=True,
           filter_missing=True, **kwds):
    '''get linear model with extra options for entry

    dispatches to regular model class and does not wrap the output

    If several options are exclusive, for example sigma and weights, then
    the chosen class depends on the implementation sequence.
    '''
    if filter_missing:
        y, x = remove_nanrows(y, x)
        #do the same for masked arrays

    if add_const:
        x = sm.add_constant(x, prepend=True)

    if sigma is not None:
        return GLS(y, x, sigma=sigma, **kwds)
    elif weights is not None:
        return WLS(y, x, weights=weights, **kwds)
    else:
        return OLS(y, x, **kwds)
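# Usage sketch for linmod with hypothetical data; assumes the module-level
# imports of OLS/WLS/GLS and the remove_nanrows helper referenced above are
# available. Passing one of the exclusive options selects the model class.
def example_linmod():
    import numpy as np
    y = np.random.randn(50)
    x = np.random.randn(50, 2)
    res_ols = linmod(y, x).fit()                       # no options -> OLS
    res_wls = linmod(y, x, weights=np.ones(50)).fit()  # weights -> WLS
    print(res_ols.params)
    print(res_wls.params)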
def __init__(self):
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog)
    #mod = sm.Probit(data.endog, data.exog)
    self.mod = sm.Logit(data.endog, data.exog)
    #res = mod.fit(method="newton")
    self.params = [np.array([1, 0.25, 1.4, -7])]
def setupClass(cls):
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog)
    cls.res1 = Logit(data.endog, data.exog).fit(method="newton", disp=0)
    res2 = Spector()
    res2.logit()
    cls.res2 = res2
class TestRlm(CheckRlmResults):
    from scikits.statsmodels.datasets.stackloss import load
    data = load()   # class attributes for subclasses
    data.exog = sm.add_constant(data.exog)

    def __init__(self):
        # Test precisions
        self.decimal_standarderrors = DECIMAL_1
        self.decimal_scale = DECIMAL_3

        results = RLM(self.data.endog, self.data.exog,
                      M=sm.robust.norms.HuberT()).fit()   # default M
        h2 = RLM(self.data.endog, self.data.exog,
                 M=sm.robust.norms.HuberT()).fit(cov="H2").bcov_scaled
        h3 = RLM(self.data.endog, self.data.exog,
                 M=sm.robust.norms.HuberT()).fit(cov="H3").bcov_scaled
        self.res1 = results
        self.res1.h2 = h2
        self.res1.h3 = h3

    def setup(self):
        #    r.library('MASS')
        #    self.res2 = RModel(self.data.endog, self.data.exog,
        #                       r.rlm, psi="psi.huber")
        from results.results_rlm import Huber
        self.res2 = Huber()
def age_design(indices):
    tmp = np.hstack((sm.categorical(hrdat['sex'][indices])[:, 2:],
                     sm.categorical(hrdat['educ'][indices])[:, 2:],
                     sm.categorical(hrdat['PTFT'][indices])[:, 2:],
                     hrdat['age'].reshape(n, 1)[indices, :],
                     (hrdat['age']**2).reshape(n, 1)[indices, :]))
    return sm.add_constant(tmp, prepend=True)
def test_HC_use():
    np.random.seed(0)
    nsample = 100
    x = np.linspace(0, 10, 100)
    X = sm.add_constant(np.column_stack((x, x**2)), prepend=False)
    beta = np.array([1, 0.1, 10])
    y = np.dot(X, beta) + np.random.normal(size=nsample)

    results = sm.OLS(y, X).fit()

    #test cov_params
    idx = np.array([1, 2])
    #need to call HC0_se to have cov_HC0 available
    results.HC0_se
    cov12 = results.cov_params(column=[1, 2], cov_p=results.cov_HC0)
    assert_almost_equal(cov12, results.cov_HC0[idx[:, None], idx], decimal=15)

    #test t_test
    tvals = results.params / results.HC0_se
    ttest = results.t_test(np.eye(3), cov_p=results.cov_HC0)
    assert_almost_equal(ttest.tvalue, tvals, decimal=14)
    assert_almost_equal(ttest.sd, results.HC0_se, decimal=14)

    #test f_test
    ftest = results.f_test(np.eye(3)[:-1], cov_p=results.cov_HC0)
    slopes = results.params[:-1]
    idx = np.array([0, 1])
    cov_slopes = results.cov_HC0[idx[:, None], idx]
    fval = np.dot(slopes, np.linalg.inv(cov_slopes).dot(slopes)) / len(idx)
    assert_almost_equal(ftest.fvalue, fval, decimal=12)
def test_qqplot(self):
    #just test that it runs
    data = sm.datasets.longley.load()
    data.exog = sm.add_constant(data.exog)
    mod_fit = sm.OLS(data.endog, data.exog).fit()
    res = mod_fit.resid
    fig = sm.qqplot(res)
    plt.close(fig)
def setupClass(cls):
    from results.results_discrete import RandHIE
    data = sm.datasets.randhie.load()
    exog = sm.add_constant(data.exog.view((float, 9)))
    cls.res1 = Poisson(data.endog, exog).fit(method='newton', disp=0)
    res2 = RandHIE()
    res2.poisson()
    cls.res2 = res2
def run_WLS():
    import scikits.statsmodels.api as sm
    res = sm.WLS(y, sm.add_constant(x, prepend=True),
                 weights=1. / sigma ** 2).fit()
    print('statsmodels.api.WLS')
    print('popt: {0}'.format(res.params))
    print('perr: {0}'.format(res.bse))
    return res
def setupClass(cls):
    if iswindows:   # does this work with classmethod?
        raise SkipTest("fmin_cg sometimes fails to converge on windows")
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog)
    res2 = Spector()
    res2.probit()
    cls.res2 = res2
    cls.res1 = Probit(data.endog, data.exog).fit(method="cg", disp=0,
                                                 maxiter=250)
def quadratic_term(list_of_mean, list_of_var):
    """Regress log(variance) on log(mean) and its square, and return the
    p-value of the quadratic term."""
    # Remove records with 0 variance
    log_var = [np.log(x) for x in list_of_var if x > 0]
    log_mean = [np.log(list_of_mean[i]) for i in range(len(list_of_mean))
                if list_of_var[i] > 0]
    log_mean_quad = [x ** 2 for x in log_mean]
    indep_var = np.column_stack((log_mean, log_mean_quad))
    indep_var = sm.add_constant(indep_var, prepend=True)
    quad_res = sm.OLS(log_var, indep_var).fit()
    return quad_res.pvalues[2]
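# Usage sketch for quadratic_term with hypothetical data: if the variance is
# a pure power of the mean (a straight line in log-log space), the quadratic
# term should be insignificant, so the returned p-value is typically large.
def example_quadratic_term():
    np.random.seed(42)
    means = np.random.uniform(1., 100., 50)
    variances = 2. * means ** 1.5 * np.exp(0.1 * np.random.randn(50))
    print(quadratic_term(list(means), list(variances)))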
def explain_rseq_by_rfreq_and_copy():
    r_rseqs = [motif_ic(getattr(Escherichia_coli, tf))
               for tf in Escherichia_coli.tfs if tf in copy_numbers]
    r_rfreqs = [log2(4.6 * 10**6 / len(getattr(Escherichia_coli, tf)))
                for tf in Escherichia_coli.tfs if tf in copy_numbers]
    copies = [copy_numbers[tf] for tf in Escherichia_coli.tfs
              if tf in copy_numbers]
    log_copies = map(log2, copies)
    X = sm.add_constant(np.column_stack((r_rfreqs, log_copies)), prepend=True)
    res = sm.OLS(r_rseqs, X).fit()
    print res.summary()
def setupClass(cls):
    from results.results_discrete import Anes
    data = sm.datasets.anes96.load()
    exog = data.exog
    exog[:, 0] = np.log(exog[:, 0] + .1)
    exog = np.column_stack((exog[:, 0], exog[:, 2], exog[:, 5:8]))
    exog = sm.add_constant(exog)
    cls.res1 = MNLogit(data.endog, exog).fit(method="newton", disp=0)
    res2 = Anes()
    res2.mnlogit_basezero()
    cls.res2 = res2
def cm_test(X):
    """
    Conditional moment test.  X is a flat numpy array.
    """
    betahat, alphahat, shat = ar1_functions.fit(X)
    n = len(X)
    xL = X[:(n - 1)]   # All but the last one
    xF = X[1:]         # All but the first one
    Z = (xF - betahat - alphahat * xL)**2
    XX = sm.add_constant(xL)
    out = sm.OLS(Z, XX).fit()
    return np.abs(out.tvalues[0]) > 1.96
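# Usage sketch for cm_test with simulated data; assumes the ar1_functions
# module used above is importable. Under a homoskedastic AR(1) null the
# squared residuals are unrelated to the lagged state, so the test should
# return True only at roughly the nominal 5% rate.
def example_cm_test():
    np.random.seed(1234)
    n = 500
    X = np.empty(n)
    X[0] = 0.
    for t in range(1, n):
        X[t] = 1. + 0.5 * X[t - 1] + np.random.randn()
    print(cm_test(X))   # usually False under the null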
def test_perfect_prediction():
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    iris_dir = os.path.join(cur_dir, "..", "..", "genmod", "tests", "results")
    iris_dir = os.path.abspath(iris_dir)
    iris = np.genfromtxt(os.path.join(iris_dir, "iris.csv"), delimiter=",",
                         skip_header=1)
    y = iris[:, -1]
    X = iris[:, :-1]
    X = X[y != 2]
    y = y[y != 2]
    X = sm.add_constant(X, prepend=True)
    mod = Logit(y, X)
    assert_raises(PerfectSeparationError, mod.fit)
def setup(self):
    nsample = 100
    sig = 0.5
    x1 = np.linspace(0, 20, nsample)
    x2 = 5 + 3 * np.random.randn(nsample)
    X = np.c_[x1, x2, np.sin(0.5 * x1), (x2 - 5)**2, np.ones(nsample)]
    beta = [0.5, 0.5, 1, -0.04, 5.]
    y_true = np.dot(X, beta)
    y = y_true + sig * np.random.normal(size=nsample)
    exog0 = sm.add_constant(np.c_[x1, x2], prepend=False)
    res = sm.OLS(y, exog0).fit()
    self.res = res
def __init__(self):
    #from results.results_discrete import Anes
    data = sm.datasets.anes96.load()
    exog = data.exog
    exog[:, 0] = np.log(exog[:, 0] + .1)
    exog = np.column_stack((exog[:, 0], exog[:, 2], exog[:, 5:8]))
    exog = sm.add_constant(exog)
    self.mod = sm.MNLogit(data.endog, exog)

    def loglikeflat(self, params):
        #reshapes flattened params
        return self.loglike(params.reshape(6, 6))
    #need instance method: bind explicitly, since a plain function assigned
    #to an instance attribute would not receive `self`
    import types
    self.mod.loglike = types.MethodType(loglikeflat, self.mod)
    self.params = [np.ones((6, 6))]
def setupClass(cls):
    #import scipy
    #major, minor, micro = scipy.__version__.split('.')[:3]
    #if int(minor) < 9:
    #    raise SkipTest
    #Skip this unconditionally for release 0.3.0
    #since there are still problems with scipy 0.9.0 on some machines
    #Ralf on mailing list 2011-03-26
    raise SkipTest
    data = sm.datasets.spector.load()
    data.exog = sm.add_constant(data.exog)
    res2 = Spector()
    res2.logit()
    cls.res2 = res2
    cls.res1 = Logit(data.endog, data.exog).fit(method="bfgs", disp=0)
def linear_fit_robust(x, y, return_coef=False):
    """
    Fit a straight line by robust regression (M-estimate).

    If `return_coef=True`, returns the slope (m) and intercept (c).
    """
    import scikits.statsmodels.api as sm
    ind, = np.where((~np.isnan(x)) & (~np.isnan(y)))
    x, y = x[ind], y[ind]
    X = sm.add_constant(x, prepend=False)
    y_model = sm.RLM(y, X, M=sm.robust.norms.HuberT())
    y_fit = y_model.fit()
    if return_coef:
        if len(y_fit.params) < 2:
            return (y_fit.params[0], 0.)
        else:
            return y_fit.params[:]
    else:
        return (x, y_fit.fittedvalues)
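# Usage sketch for linear_fit_robust with hypothetical contaminated data;
# assumes numpy is imported as np, as in the function above. The Huber
# M-estimate should recover the slope and intercept despite the outliers.
def example_linear_fit_robust():
    np.random.seed(0)
    x = np.linspace(0., 10., 100)
    y = 2.5 * x + 1. + 0.3 * np.random.randn(100)
    y[::10] += 25.   # inject gross outliers
    m, c = linear_fit_robust(x, y, return_coef=True)
    print(m, c)      # close to (2.5, 1.0)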
def regression_analysis(play_arr, dataFunction1, dataFunction2):
    totalBefore = []
    totalAfter = []
    for weekNum in range(10, 15):
        Before, After = regression_weekly(play_arr, weekNum, dataFunction1,
                                          dataFunction2)
        totalBefore = np.concatenate([totalBefore, Before])
        totalAfter = np.concatenate([totalAfter, After])
    slope, intercept, r_value, p_value, err = stats.linregress(totalBefore,
                                                               totalAfter)
    results = sm.OLS(totalAfter, sm.add_constant(totalBefore)).fit()
    print results.summary()
    plt.plot(totalBefore, totalAfter, '.')
    X_plot = np.linspace(0, 1, 100)
    plt.plot(X_plot, X_plot * results.params[0] + results.params[1])
    plt.show()
def _check_wls(self, x, y, weights):
    result = ols(y=y, x=x, weights=1 / weights)

    combined = x.copy()
    combined['__y__'] = y
    combined['__weights__'] = weights
    combined = combined.dropna()

    endog = combined.pop('__y__').values
    aweights = combined.pop('__weights__').values
    exog = sm.add_constant(combined.values, prepend=False)

    sm_result = sm.WLS(endog, exog, weights=1 / aweights).fit()

    assert_almost_equal(sm_result.params, result._beta_raw)
    assert_almost_equal(sm_result.resid, result._resid_raw)

    self.checkMovingOLS('rolling', x, y, weights=weights)
    self.checkMovingOLS('expanding', x, y, weights=weights)
def __init__(self):
    # generate artificial data
    np.random.seed(98765678)
    nobs = 200
    rvs = np.random.randn(nobs, 6)
    data_exog = rvs
    data_exog = sm.add_constant(data_exog)
    xbeta = 0.1 + 0.1 * rvs.sum(1)
    data_endog = np.random.poisson(np.exp(xbeta))

    # estimate discretemod.Poisson as benchmark
    self.res_discrete = Poisson(data_endog, data_exog).fit(disp=0)

    mod_glm = sm.GLM(data_endog, data_exog, family=sm.families.Poisson())
    self.res_glm = mod_glm.fit()

    # estimate generic MLE
    self.mod = PoissonGMLE(data_endog, data_exog)
    self.res = self.mod.fit(start_params=0.9 * self.res_discrete.params,
                            method='nm', disp=0)
def calc_factors(self, x=None, keepdim=0, addconst=True):
    '''get factor decomposition of exogenous variables

    This uses principal component analysis to obtain the factors. The
    number of factors kept is the maximum that will be considered in the
    regression.
    '''
    if x is None:
        x = self.exog
    else:
        x = np.asarray(x)
    xred, fact, evals, evecs = pca(x, keepdim=keepdim, normalize=1)
    self.exog_reduced = xred
    #self.factors = fact
    if addconst:
        self.factors = sm.add_constant(fact, prepend=True)
        self.hasconst = 1  #needs to be int
    else:
        self.factors = fact
        self.hasconst = 0  #needs to be int
    self.evals = evals
    self.evecs = evecs
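# Minimal numpy sketch of the decomposition calc_factors relies on. The pca()
# helper above is assumed to come from the statsmodels sandbox and may scale
# or normalize differently; here keepdim=0 is taken to mean "keep everything".
def pca_sketch(x, keepdim=0):
    import numpy as np
    xc = x - x.mean(0)                      # center the columns
    u, s, vt = np.linalg.svd(xc, full_matrices=False)
    if keepdim:
        u, s, vt = u[:, :keepdim], s[:keepdim], vt[:keepdim]
    fact = u * s                            # factor scores
    xred = np.dot(fact, vt) + x.mean(0)     # reduced-rank reconstruction
    evals = s**2 / (len(x) - 1.)            # eigenvalues of the covariance
    return xred, fact, evals, vt.T          # eigenvectors as columns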
class TestRlmHuber(CheckRlmResults):
    from scikits.statsmodels.datasets.stackloss import load
    data = load()
    data.exog = sm.add_constant(data.exog)

    def __init__(self):
        results = RLM(self.data.endog, self.data.exog,
                      M=sm.robust.norms.HuberT()).fit(
                          scale_est=sm.robust.scale.HuberScale())
        h2 = RLM(self.data.endog, self.data.exog,
                 M=sm.robust.norms.HuberT()).fit(
                     cov="H2",
                     scale_est=sm.robust.scale.HuberScale()).bcov_scaled
        h3 = RLM(self.data.endog, self.data.exog,
                 M=sm.robust.norms.HuberT()).fit(
                     cov="H3",
                     scale_est=sm.robust.scale.HuberScale()).bcov_scaled
        self.res1 = results
        self.res1.h2 = h2
        self.res1.h3 = h3

    def setup(self):
        from results.results_rlm import HuberHuber
        self.res2 = HuberHuber()
def checkOLS(self, exog, endog, x, y):
    reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit()
    result = ols(y=y, x=x)

    # check that sparse version is the same
    sparse_result = ols(y=y.to_sparse(), x=x.to_sparse())
    _compare_ols_results(result, sparse_result)

    assert_almost_equal(reference.params, result._beta_raw)
    assert_almost_equal(reference.df_model, result._df_model_raw)
    assert_almost_equal(reference.df_resid, result._df_resid_raw)
    assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
    assert_almost_equal(reference.pvalues, result._p_value_raw)
    assert_almost_equal(reference.rsquared, result._r2_raw)
    assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
    assert_almost_equal(reference.resid, result._resid_raw)
    assert_almost_equal(reference.bse, result._std_err_raw)
    assert_almost_equal(reference.tvalues, result._t_stat_raw)
    assert_almost_equal(reference.cov_params(), result._var_beta_raw)
    assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

    _check_non_raw_results(result)
def setupClass(cls):
    cls.decimal_resids = DECIMAL_1  # working resids off a bit in outlier
    cls.decimal_fittedvalues = DECIMAL_3  # ditto
    data = sm.datasets.wfs.load()
    offset = np.log(data.exog[:, -1])
    exog = data.exog[:, :-1]
    # convert dur to dummy
    exog = sm.tools.categorical(exog, col=0, drop=True)
    # drop reference category
    # convert res to dummy
    exog = sm.tools.categorical(exog, col=0, drop=True)
    # convert edu to dummy
    exog = sm.tools.categorical(exog, col=0, drop=True)
    # drop reference categories and add intercept
    exog = sm.add_constant(exog[:, [1, 2, 3, 4, 5, 7, 8, 10, 11, 12]])
    endog = np.round(data.endog)
    cls.res1 = GLM(endog, exog, family=sm.families.Poisson(),
                   offset=offset).fit(tol=1e-12, maxiter=250)
    from results.results_glm import Wfs
    cls.res2 = Wfs()
nes = nes[nes[:, 0] == 1992]   # get only the data for 1992

# keep rows where presvote < 3 or is NaN (NaNs are dropped below)
nes = nes[(nes[:, 2] < 3) | numpy.isnan(nes[:, 2])]
nes[:, 2] -= 1   # convert pres vals into 0 or 1 for republicans, like gelman

# drop NaNs
nes = nes[numpy.isnan(nes[:, 2]) == False]
nes = nes[numpy.isnan(nes[:, 1]) == False]
exog = nes[:, 1]
endog = nes[:, 2]
#endog, exog = sm.tools.drop_missing(endog, exog)
exog = sm.add_constant(exog)

print exog.shape
print endog.shape
print exog
print endog

logit_mod = sm.Logit(endog, exog)
logit_res = logit_mod.fit()
print logit_res.params
print logit_res.bse
print logit_res.prsquared
print logit_res.margeff()
print logit_res.conf_int()
print logit_res.df_resid
#choose example
#--------------
example = ['null', 'smalldiff', 'mediumdiff', 'largediff'][1]
example_size = [20, 100][1]
example_groups = ['2', '2-2'][1]
#'2-2': 4 groups,
#       groups 0 and 1 and groups 2 and 3 have identical parameters in DGP

#generate example
#----------------
#np.random.seed(87654589)
nobs = example_size
x1 = 0.1 + np.random.randn(nobs)
y1 = 10 + 15 * x1 + 2 * np.random.randn(nobs)

x1 = sm.add_constant(x1)  #, prepend=True)

#assert_almost_equal(x1, np.vander(x1[:,0],2), 16)
#res1 = sm.OLS(y1, x1).fit()
#print res1.params
#print np.polyfit(x1[:,0], y1, 1)
#assert_almost_equal(res1.params, np.polyfit(x1[:,0], y1, 1), 14)
#print res1.summary(xname=['x1','const1'])

#regression 2
x2 = 0.1 + np.random.randn(nobs)

if example == 'null':
    y2 = 10 + 15 * x2 + 2 * np.random.randn(nobs)  # if H0 is true
elif example == 'smalldiff':
    y2 = 11 + 16 * x2 + 2 * np.random.randn(nobs)
elif example == 'mediumdiff':
    y2 = 12 + 16 * x2 + 2 * np.random.randn(nobs)
print len(indf)
print len(indm)

# With each of these models, typically do some commands to look more at the
# models, like summary(), anova for the model on its own or between two models
# to see how much additional explanatory power you get with the added
# variables, and plots to look at residuals, qqplot, and hist of residuals.
# Currently can't do anova or lowess in python, and the qqplots are annoying
# to make.

# Initial model, only look at log(hrwage)~sex
X1 = hrdat["sex"] == 2
X1 = sm.add_constant(X1, prepend=True)
model1 = sm.WLS(np.log(hrdat["hrwage"]), X1, weights=hrdat["A_ERNLWT"])
results1 = model1.fit()
print results1.summary()

# Pre-defining model matrix components for more complicated models
# dat_mat is DATa model MATrices
n = len(hrdat)
dat_mat = {}
dat_names = {}
factor_vars = ["sex", "educ", "PTFT", "ind", "occ", "marstat", "GEDIV",
               "race", "hispanic", "disabled"]
for name in factor_vars:
    dat_mat[name], dat_names[name] = sm.categorical(hrdat[name],
                                                    dictnames=True)
    dat_mat[name] = dat_mat[name][:, 2:]
def anova_ols(y, x):
    X = sm.add_constant(data2dummy(x))
    res = sm.OLS(y, X).fit()
    return res.fvalue, res.f_pvalue, res.rsquared, np.sqrt(res.mse_resid)
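# Usage sketch for anova_ols. data2dummy is assumed to be this module's helper
# that expands group labels into dummy columns (dropping one reference level so
# the design stays full rank once a constant is added); a minimal stand-in and
# a hypothetical three-group example:
def data2dummy_sketch(x):
    groups = np.unique(x)
    return np.column_stack([(x == g).astype(float) for g in groups[:-1]])

def example_anova_ols():
    np.random.seed(3)
    x = np.repeat([0, 1, 2], 30)   # three groups of 30
    y = np.repeat([0., 0.5, 1.], 30) + np.random.randn(90)
    X = sm.add_constant(data2dummy_sketch(x))
    res = sm.OLS(y, X).fit()
    print(res.fvalue, res.f_pvalue)   # one-way ANOVA via OLS on dummies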
        pred = np.dot(self.wexog, self.coeffs)
        eps = np.diag((self.wendog - pred) ** 2)
        sigmaSq = np.sum(eps)
        pinvX = np.dot(self.rnorm_cov_params, self.wexog.T)
        self._wncp = np.dot(np.dot(pinvX, eps), pinvX.T) * df / sigmaSq
        return self._wncp

    _coeffs = None

    @property
    def coeffs(self):
        """Estimated parameters"""
        if self._coeffs is None:
            betaLambda = np.dot(self.inv_rwexog, self.rwendog)
            self._coeffs = betaLambda[:self.ncoeffs]
        return self._coeffs

    def fit(self):
        rncp = self.wrnorm_cov_params
        lfit = RegressionResults(self, self.coeffs,
                                 normalized_cov_params=rncp)
        return lfit


if __name__ == "__main__":
    import scikits.statsmodels.api as sm
    dta = np.genfromtxt('./rlsdata.txt', names=True)
    design = np.column_stack((dta['Y'], dta['Y']**2,
                              dta[['NE', 'NC', 'W',
                                   'S']].view(float).reshape(dta.shape[0], -1)))
    design = sm.add_constant(design, prepend=True)
    rls_mod = RLS(dta['G'], design, constr=[0, 0, 0, 1, 1, 1, 1])
    rls_fit = rls_mod.fit()
    print rls_fit.params
import numpy
from matplotlib import pyplot as plt
from matplotlib import rc
import scikits.statsmodels.api as sm
from scipy import stats

import sim

data = numpy.loadtxt("../doc/gelman/ARM_Data/arsenic/wells.dat",
                     usecols=(1, 2, 3, 4, 5), skiprows=1)
exog = data[:, 2]
endog = data[:, 0]
exog = sm.add_constant(exog, prepend=True)

logit_mod = sm.Logit(endog, exog)
logit_res = logit_mod.fit()

[beta, sigma] = sim.sim_glm(logit_res, 1000)
print numpy.mean(beta[:, 0])
print numpy.mean(beta[:, 1])

plt.plot(beta[:, 0], beta[:, 1], '.')
plt.xlabel('beta_0')
plt.ylabel('beta_1')
plt.show()
pl.vlines([pooled], -.75, len(n), linewidth=1, linestyle='dashed', color='k')
pl.axis([-.01, 1.05, -.75, .25 + .5 * len(n)])
pl.text(-2 * xmax / 50, -.5, 'Pooled Estimate', ha='right', va='center')
pl.title('North Africa/Middle East')
pl.savefig('vzv_forest.pdf')


### @export 'OLS'
pl.figure()

import scikits.statsmodels.api as sm

Y = df['Parameter Value'].__array__()
X = .5 * (df['Age Start'] + df['Age End']).__array__()
pl.plot(X, Y, 'ks', label='Observed', mec='w', mew=1)

XX = sm.add_constant(X)
X_pred = pl.arange(65)
XX_pred = sm.add_constant(X_pred)

model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)
pl.plot(X_pred, Y_pred, 'k-', linewidth=2, label='Predicted by OLS')

Y = mc.logit(df['Parameter Value'].__array__())
model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)
pl.plot(X_pred,
""" Examples: statsmodels.models.RLM Notes ----- The syntax for the arguments will be shortened to accept string arguments in the future. """ import scikits.statsmodels.api as sm ### Example for using Huber's T norm with the default ### median absolute deviation scaling data = sm.datasets.stackloss.load() data.exog = sm.add_constant(data.exog) huber_t = sm.RLM(data.endog, data.exog, M=sm.robust.norms.HuberT()) hub_results = huber_t.fit() print hub_results.params print hub_results.bse ### or with the 'H2' covariance matrix hub_results2 = huber_t.fit(cov="H2") print hub_results2.params print hub_results2.bse ### Example for using Andrew's Wave norm with ### Huber's Proposal 2 scaling and 'H3' covariance matrix andrew_mod = sm.RLM(data.endog, data.exog, M=sm.robust.norms.AndrewWave()) andrew_results = andrew_mod.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
was created in a very ad hoc manner and due to the idiosyncrasies in R
it does not work for all types of R models.

There are also R scripts included with most of the datasets to run some
basic models for comparisons of results to statsmodels.
'''

from rpy import r
import numpy as np
import scikits.statsmodels.api as sm

examples = [1, 2]

if 1 in examples:
    data = sm.datasets.longley.load()
    y, x = data.endog, sm.add_constant(data.exog)
    des_cols = ['x.%d' % (i + 1) for i in range(x.shape[1])]
    formula = r('y~%s-1' % '+'.join(des_cols))
    frame = r.data_frame(y=y, x=x)
    results = r.lm(formula, data=frame)
    print results.keys()
    print results['coefficients']

if 2 in examples:
    data2 = sm.datasets.star98.load()
    y2, x2 = data2.endog, sm.add_constant(data2.exog)
    import rpy
    y2 = y2[:, 0] / y2.sum(axis=1)
    des_cols2 = ['x.%d' % (i + 1) for i in range(x2.shape[1])]
    formula2 = r('y~%s-1' % '+'.join(des_cols2))
    frame2 = r.data_frame(y=y2, x=x2)