def pacf_ols(x, nlags=40):
    '''Estimate partial autocorrelations with one OLS fit per lag

    Parameters
    ----------
    x : 1d array
        observations of time series for which pacf is calculated
    nlags : int
        Largest lag for which the pacf is estimated.

    Returns
    -------
    pacf : 1d array
        partial autocorrelations, nlags+1 elements. The first element is
        the fixed value 1.0 that stands for lag 0.

    Notes
    -----
    For every lag k in 1..nlags a separate regression is fitted on a
    constant plus the first k lag columns; the last fitted coefficient is
    stored as the pacf value for that lag.
    '''
    #TODO: add warnings for Yule-Walker
    #NOTE: demeaning and not using a constant gave incorrect answers?
    #JP: demeaning should have a better estimate of the constant
    #maybe we can compare small sample properties with a MonteCarlo
    lagged = lagmat(x, nlags)
    # column 0 is the regressand, the remaining columns are the regressors
    target = lagged[:, 0]
    design = sm.add_constant(lagged[:, 1:], prepend=True)
    # with the constant prepended, the first lag+1 columns are
    # constant + `lag` lag columns
    coefs = [sm.OLS(target[lag:], design[lag:, :lag + 1]).fit().params[-1]
             for lag in range(1, nlags + 1)]
    return np.array([1.] + coefs)
def __init__(self):
    """Fit a Newton Logit on the Spector data and load the reference results."""
    from results.results_discrete import Spector

    spector = sm.datasets.spector.load()
    # the dataset object itself is updated so self.data carries the constant
    spector.exog = sm.add_constant(spector.exog)
    self.data = spector

    logit_model = Logit(spector.endog, spector.exog)
    self.res1 = logit_model.fit(method="newton", disp=0)

    # reference values to compare against
    expected = Spector()
    expected.logit()
    self.res2 = expected
def __init__(self):
    """Fit a Newton Poisson on the RandHIE data and load the reference results."""
    from results.results_discrete import RandHIE

    randhie = sm.datasets.randhie.load()
    n_rows = len(randhie.endog)
    # reinterpret exog as plain floats, one row per observation
    flat_exog = randhie.exog.view(float).reshape(n_rows, -1)
    design = sm.add_constant(flat_exog)
    self.res1 = Poisson(randhie.endog, design).fit(method='newton', disp=0)

    # reference values to compare against
    expected = RandHIE()
    expected.poisson()
    self.res2 = expected
def __init__(self):
    """Fit a Newton MNLogit on the Anes96 data and load the reference results."""
    from results.results_discrete import Anes

    anes = sm.datasets.anes96.load()
    raw = anes.exog
    # log-transform the first column in place (shifted by .1)
    raw[:, 0] = np.log(raw[:, 0] + .1)
    # keep columns 0, 2 and 5..7, then add the constant
    selected = np.column_stack((raw[:, 0], raw[:, 2], raw[:, 5:8]))
    design = sm.add_constant(selected)
    self.res1 = MNLogit(anes.endog, design).fit(method="newton", disp=0)

    # reference values to compare against
    expected = Anes()
    expected.mnlogit_basezero()
    self.res2 = expected
def checkOLS(self, exog, endog, x, y):
    """Compare the raw results of ``ols(y=y, x=x)`` with a statsmodels fit.

    The reference is ``sm.OLS`` on ``endog`` and ``exog`` plus a constant.
    Every comparison uses ``assert_almost_equal``; the non-raw results
    are checked at the end by ``_check_non_raw_results``.
    """
    expected = sm.OLS(endog, sm.add_constant(exog)).fit()
    actual = ols(y=y, x=x)

    # attribute of the reference fit -> raw attribute of the ols result
    for ref_attr, raw_attr in (('params', '_beta_raw'),
                               ('df_model', '_df_model_raw'),
                               ('df_resid', '_df_resid_raw')):
        assert_almost_equal(getattr(expected, ref_attr),
                            getattr(actual, raw_attr))

    # only the first entry of the raw F statistic is compared
    assert_almost_equal(expected.fvalue, actual._f_stat_raw[0])

    for ref_attr, raw_attr in (('pvalues', '_p_value_raw'),
                               ('rsquared', '_r2_raw'),
                               ('rsquared_adj', '_r2_adj_raw'),
                               ('resid', '_resid_raw'),
                               ('bse', '_std_err_raw')):
        assert_almost_equal(getattr(expected, ref_attr),
                            getattr(actual, raw_attr))

    # these two reference quantities are methods, not attributes
    assert_almost_equal(expected.t(), actual._t_stat_raw)
    assert_almost_equal(expected.cov_params(), actual._var_beta_raw)

    assert_almost_equal(expected.fittedvalues, actual._y_fitted_raw)

    _check_non_raw_results(actual)
def checkOLS(self, exog, endog, x, y):
    """Compare the raw results of ``ols(y=y, x=x)`` with a statsmodels fit.

    The reference is ``sm.OLS`` on ``endog`` and ``exog`` plus a constant.
    Every comparison uses ``assert_almost_equal``; the non-raw results
    are checked at the end by ``_check_non_raw_results``.
    """
    expected = sm.OLS(endog, sm.add_constant(exog)).fit()
    actual = ols(y=y, x=x)

    # attribute of the reference fit -> raw attribute of the ols result
    for ref_attr, raw_attr in (('params', '_beta_raw'),
                               ('df_model', '_df_model_raw'),
                               ('df_resid', '_df_resid_raw')):
        assert_almost_equal(getattr(expected, ref_attr),
                            getattr(actual, raw_attr))

    # only the first entry of the raw F statistic is compared
    assert_almost_equal(expected.fvalue, actual._f_stat_raw[0])

    for ref_attr, raw_attr in (('pvalues', '_p_value_raw'),
                               ('rsquared', '_r2_raw'),
                               ('rsquared_adj', '_r2_adj_raw'),
                               ('resid', '_resid_raw'),
                               ('bse', '_std_err_raw')):
        assert_almost_equal(getattr(expected, ref_attr),
                            getattr(actual, raw_attr))

    # these two reference quantities are methods, not attributes
    assert_almost_equal(expected.t(), actual._t_stat_raw)
    assert_almost_equal(expected.cov_params(), actual._var_beta_raw)

    assert_almost_equal(expected.fittedvalues, actual._y_fitted_raw)

    _check_non_raw_results(actual)
# Script: build panel-data inputs from the Grunfeld dataset and fit the
# PanelModel under several model/effects combinations.
import scikits.statsmodels as sm
import numpy.lib.recfunctions as nprf

data = sm.datasets.grunfeld.Load()
# Baltagi doesn't include American Steel
# (the last 20 rows are dropped from both endog and exog)
endog = data.endog[:-20]
fullexog = data.exog[:-20]
# fullexog.sort(order=['firm','year'])
# attach endog to the structured exog as a float field named 'investment'
panel_arr = nprf.append_fields(fullexog, 'investment', endog, float, usemask=False)
panel_panda = LongPanel.fromRecords(panel_arr, major_field='year', minor_field='firm')

# the most cumbersome way of doing it as far as preprocessing by hand
# take the two regressors as a plain (n, 2) float array and add a constant
exog = fullexog[['value','capital']].view(float).reshape(-1,2)
exog = sm.add_constant(exog)
panel = group(fullexog['firm'])
year = fullexog['year']
panel_mod = PanelModel(endog, exog, panel, year, xtnames=['firm','year'], equation='invest value capital')
# note that equation doesn't actually do anything but name the variables

# one fit per model / effects combination
panel_ols = panel_mod.fit(model='pooled')
panel_be = panel_mod.fit(model='between', effects='oneway')
panel_fe = panel_mod.fit(model='fixed', effects='oneway')
panel_bet = panel_mod.fit(model='between', effects='time')
panel_fet = panel_mod.fit(model='fixed', effects='time')
panel_fe2 = panel_mod.fit(model='fixed', effects='twoways')
# 3x4 coefficient matrix: maps the three columns of f0 onto four observed
# variables (f0 is defined before this fragment).
f2xcoef = np.array([[ 0.1, 3., 1., 0.], [ 0., 0., 1.5, 0.1], [ 3., 2., 1., 0.]])
# observed regressors = f0 times the coefficients plus normal noise (scale 0.1)
x0 = np.dot(f0, f2xcoef)
x0 += 0.1*np.random.normal(size=x0.shape)
# response = sum of the three f0 columns plus normal noise (scale 0.1)
ytrue = np.dot(f0,[1., 1., 1.])
y0 = ytrue + 0.1*np.random.normal(size=ytrue.shape)

xred, fact, eva, eve = pca(x0, keepdim=0)
print eve
print fact[:5]
print f0[:5]

import scikits.statsmodels as sm
# OLS of the response on the observed regressors plus a constant
res = sm.OLS(y0, sm.add_constant(x0)).fit()
print 'OLS on original data'
print res.params
print res.aic
print res.rsquared

# disabled: OLS on the PCA output for each value of keepdim
#print 'OLS on Factors'
#for k in range(x0.shape[1]):
#    xred, fact, eva, eve = pca(x0, keepdim=k, normalize=1)
#    fact_wconst = sm.add_constant(fact)
#    res = sm.OLS(y0, fact_wconst).fit()
#    print 'k =', k
#    print res.params
#    print 'aic: ', res.aic
#    print 'bic: ', res.bic
#    print 'llf: ', res.llf
# ndts=np.column_stack(dts[col] for col in dts.dtype.names) # ntda=ntds.swapaxis(1,0) # ntda is ntds returns false? # or now we just have detailed information about the different strings # would this approach ever be inappropriate for a string typed variable # other than dates? # descstats(ndts, [1]) # raw_input("Enter to try second part") # descstats(ndts, [1,20,3]) if __name__ == '__main__': import scikits.statsmodels as sm import os data = sm.datasets.longley.Load() data.exog = sm.add_constant(data.exog) sum1 = descstats(data.exog) sum1a = descstats(data.exog[:,:1]) # loc='http://eagle1.american.edu/~js2796a/data/handguns_data.csv' # dta=np.recfromcsv(loc) # summary2 = descstats(dta,['stpop']) # summary3 = descstats(dta,['stpop','avginc','vio']) #TODO: needs a by argument # summary4 = descstats(dta) this fails # this is a bug # p = dta[['stpop']] # p.view(dtype = np.float, type = np.ndarray) # this works # p.view(dtype = np.int, type = np.ndarray)
# Firms whose equations enter the system, in this order.
firms = ['General Motors', 'Chrysler', 'General Electric', 'Westinghouse', 'US Steel']
grun_exog = grun_data.exog
grun_endog = grun_data.endog

# Right now SUR takes a list of arrays
# The array alternates between the LHS of an equation and RHS side of an
# equation
# This is very likely to change
grun_sys = []
for i in firms:
    # rows that belong to firm i
    index = grun_exog['firm'] == i
    grun_sys.append(grun_endog[index])
    # the two regressors as a plain (n, 2) float array, constant prepended
    exog = grun_exog[index][['value','capital']].view(float).reshape(-1,2)
    exog = sm.add_constant(exog, prepend=True)
    grun_sys.append(exog)

# Note that the results in Greene (5th edition) uses a slightly different
# version of the Grunfeld data. To reproduce Table 14.1 the following changes
# are necessary.
# grun_sys[-2] and grun_sys[-1] are the endog and exog arrays appended for
# the last firm in `firms`.
grun_sys[-2][5] = 261.6
grun_sys[-2][-3] = 645.2
grun_sys[-1][11,2] = 232.6

grun_mod = SUR(grun_sys)
grun_res = grun_mod.fit()
print "Results for the 2-step GLS"
print "Compare to Greene Table 14.1, 5th edition"
print grun_res.params
# or you can do an iterative fit
""" Example: scikits.statsmodels.GLS """
import scikits.statsmodels as sm
import numpy as np

data = sm.datasets.longley.Load()
data.exog = sm.add_constant(data.exog)

# The Longley dataset is a time series dataset
# Let's assume that the data is heteroskedastic and that we know
# the nature of the heteroskedasticity. We can then define
# `sigma` and use it to give us a GLS model

# First we will obtain the residuals from an OLS fit
ols_resid = sm.OLS(data.endog, data.exog).fit().resid

# Assume that the error terms follow an AR(1) process with a trend
# resid[i] = beta_0 + rho*resid[i-1] + e[i]
# where e ~ N(0,some_sigma**2)
# and that rho is simply the correlation of the residuals
# a consistent estimator for rho is to regress the residuals
# on the lagged residuals
resid_fit = sm.OLS(ols_resid[1:], sm.add_constant(ols_resid[:-1])).fit()
# t statistic and p-value of the parameter at index 0
# NOTE(review): presumably the lagged-residual coefficient (rho), assuming
# add_constant appends the constant as the last column -- confirm
print resid_fit.t(0)
print resid_fit.pvalues[0]
# While we don't have strong evidence that the errors follow an AR(1)
# process we continue
Created on Thu Mar 25 22:56:45 2010 Author: josef-pktd """ import numpy as np from numpy.testing import assert_almost_equal import scikits.statsmodels as sm np.random.seed(87654589) nobs = 10 #100 x1 = np.random.randn(nobs) y1 = 10 + 15*x1 + 2*np.random.randn(nobs) x1 = sm.add_constant(x1) #, prepend=True) assert_almost_equal(x1, np.vander(x1[:,0],2), 16) res1 = sm.OLS(y1, x1).fit() print res1.params print np.polyfit(x1[:,0], y1, 1) assert_almost_equal(res1.params, np.polyfit(x1[:,0], y1, 1), 14) print res1.summary(xname=['x1','const1']) #regression 2 x2 = np.random.randn(nobs) y2 = 19 + 17*x2 + 2*np.random.randn(nobs) #y2 = 10 + 15*x2 + 2*np.random.randn(nobs) # if H0 is true x2 = sm.add_constant(x2) #, prepend=True) assert_almost_equal(x2, np.vander(x2[:,0],2), 16)
# Script: fit GLMs to the data in burglary.txt and print the results.
import numpy as np
import scikits.statsmodels as sm

# skip the header row and read file columns 1 and 2 only
data = np.loadtxt("burglary.txt", skiprows=1, usecols = (1,2))
# of the two loaded columns, the second is the regressor, the first the response
exog = data[:,1]
endog = data[:,0]

# first fit: Poisson family, restricted to observations with endog > 0
endog1 = endog[endog > 0]
exog1 = exog[endog > 0]
exog1 = sm.add_constant(exog1, prepend=True)
glm = sm.GLM(endog1, exog1, family=sm.family.Poisson())
res = glm.fit()
print "res.deviance=" + str(res.deviance)
print "res.scale=" + str(res.scale)
print "res.params=" + str(res.params)
print "res.pearson_chi2=" + str(res.pearson_chi2)
print "res.df_model=" + str(res.df_model)
print "res.null_deviance=" + str(res.null_deviance)
print "res.t()=" + str(res.t())
print "\n"

# second fit: NegativeBinomial family on all observations
exog = sm.add_constant(exog, prepend=True)
glm = sm.GLM(endog, exog, family=sm.family.NegativeBinomial())
res = glm.fit()
print "res.deviance=" + str(res.deviance)
# The proportion of low income families "LOWINC" # The proportions of minority students,"PERASIAN","PERBLACK","PERHISP" # The percentage of minority teachers "PERMINTE", # The median teacher salary including benefits in 1000s "AVSALK" # The mean teacher experience in years "AVYRSEXP", # The per-pupil expenditures in thousands "PERSPENK" # The parent-teacher ratio "PTRATIO" # The percent of students taking college credit courses "PCTAF", # The percentage of charter schools in the districut "PCTCHRT" # The percent of schools in the district operating year round "PCTYRRND" # The following are interaction terms "PERMINTE_AVYRSEXP","PERMINTE_AVSAL", # "AVYRSEXP_AVSAL","PERSPEN_PTRATIO","PERSPEN_PCTAF","PTRATIO_PCTAF", # "PERMINTE_AVYRSEXP_AVSAL","PERSPEN_PTRATIO_PCTAF" data = sm.datasets.star98.Load() data.exog = sm.add_constant(data.exog) print """The response variable is (success, failure). Eg., the first observation is """, data.endog[0] print"""Giving a total number of trials for this observation of """, data.endog[0].sum() glm_binom = sm.GLM(data.endog, data.exog, family=sm.family.Binomial()) ### In order to fit this model, you must (for now) specify the number of ### trials per observation ie., success + failure ### This is the only time the data_weights argument should be used. trials = data.endog.sum(axis=1) binom_results = glm_binom.fit(data_weights=trials) print """The fitted values are
"""Example: scikits.statsmodels.discretemod

Fits OLS, Logit and Probit models to the Spector data and prints the
estimated parameters of each.
"""
import numpy as np
import scikits.statsmodels as sm

# load the data from Spector and Mazzeo (1980)
# Examples follow Greene's Econometric Analysis Ch. 21 (5th Edition).
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog)

# Linear Probability Model using OLS
lpm_mod = sm.OLS(spector_data.endog,spector_data.exog)
lpm_res = lpm_mod.fit()

# Logit Model
logit_mod = sm.Logit(spector_data.endog, spector_data.exog)
logit_res = logit_mod.fit()

# Probit Model
probit_mod = sm.Probit(spector_data.endog, spector_data.exog)
probit_res = probit_mod.fit()

# print the parameter estimates of the three fits
print "This example is based on Greene Table 21.1 5th Edition"
print "Linear Model"
print lpm_res.params
print "Logit Model"
print logit_res.params
print "Probit Model"
print probit_res.params
#print "Typo in Greene for Weibull, replaced with logWeibull or Gumbel"
def grangercausalitytests(x, maxlag): '''four tests for granger causality of 2 timeseries this is a proof-of concept implementation not cleaned up, has some duplicate calculations, memory intensive - builds full lag array for variables prints results not verified with other packages, all four tests give similar results (1 and 4 identical) Parameters ---------- x : array, 2d, (nobs,2) data for test whether the time series in the second column Granger causes the time series in the first column maxlag : integer the Granger causality test results are calculated for all lags up to maxlag Returns ------- None : no returns all test results are currently printed Notes ----- TODO: convert to function that returns and compare with other packages ''' from scipy import stats # lazy import import scikits.statsmodels as sm # absolute import for now for mlg in range(1, maxlag+1): print '\nGranger Causality' print 'number of lags (no zero)', mlg mxlg = mlg + 1 # Note number of lags starting at zero in lagmat # create lagmat of both time series dta = lagmat2ds(x, mxlg, trim='both', dropex=1) #add constant dtaown = sm.add_constant(dta[:,1:mxlg]) dtajoint = sm.add_constant(dta[:,1:]) #run ols on both models without and with lags of second variable res2down = sm.OLS(dta[:,0], dtaown).fit() res2djoint = sm.OLS(dta[:,0], dtajoint).fit() #print results #for ssr based tests see: http://support.sas.com/rnd/app/examples/ets/granger/index.htm #the other tests are made-up # Granger Causality test using ssr (F statistic) fgc1 = (res2down.ssr-res2djoint.ssr)/res2djoint.ssr/(mxlg-1)*res2djoint.df_resid print 'ssr based F test: F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \ (fgc1, stats.f.sf(fgc1, mxlg-1, res2djoint.df_resid), res2djoint.df_resid, mxlg-1) # Granger Causality test using ssr (ch2 statistic) fgc2 = res2down.nobs*(res2down.ssr-res2djoint.ssr)/res2djoint.ssr print 'ssr based chi2 test: chi2=%-8.4f, p=%-8.4f, df=%d' % \ (fgc2, stats.chi2.sf(fgc2, mxlg-1), mxlg-1) #likelihood ratio 
test pvalue: lr = -2*(res2down.llf-res2djoint.llf) print 'likelihood ratio test: chi2=%-8.4f, p=%-8.4f, df=%d' % \ (lr, stats.chi2.sf(lr, mxlg-1), mxlg-1) # F test that all lag coefficients of exog are zero rconstr = np.column_stack((np.zeros((mxlg-1,mxlg-1)), np.eye(mxlg-1, mxlg-1),\ np.zeros((mxlg-1, 1)))) ftres = res2djoint.f_test(rconstr) print 'parameter F test: F=%-8.4f, p=%-8.4f, df_denom=%d, df_num=%d' % \ (ftres.fvalue, ftres.pvalue, ftres.df_denom, ftres.df_num)
def anova_ols(y, x):
    """Regress ``y`` on ``data2dummy(x)`` plus a constant with OLS.

    Returns
    -------
    tuple
        ``(fvalue, f_pvalue, rsquared, sqrt(mse_resid))`` of the fit.
    """
    design = sm.add_constant(data2dummy(x))
    fit = sm.OLS(y, design).fit()
    summary = (fit.fvalue, fit.f_pvalue, fit.rsquared,
               np.sqrt(fit.mse_resid))
    return summary
def add_trend(X, trend="c", prepend=False):
    """
    Adds a trend and/or constant to an array.

    Parameters
    ----------
    X : array-like
        Original array of data. May be a plain or a structured/record
        array.
    trend : str {"c","t","ct","ctt"}
        "c" add constant only
        "t" add trend only
        "ct" add constant and linear trend
        "ctt" add constant and linear and quadratic trend.
        The value is lower-cased before it is interpreted.
    prepend : bool
        If True, prepends the new data to the columns of X.

    Returns
    -------
    array
        X with the new columns added. For structured input the new fields
        are named 'trend_squared', 'trend' and 'const' (whichever apply),
        and a recarray is returned if X was a recarray.

    Raises
    ------
    ValueError
        If `trend` is not one of the recognised options.

    Notes
    -----
    The added columns are ordered highest power first, i.e.
    [trend squared, trend, constant], whenever applicable. The trend runs
    over 1, 2, ..., nobs. There is currently no checking for an existing
    constant or trend.

    See also
    --------
    scikits.statsmodels.add_constant
    """
    #TODO: could be generalized for trend of arbitrary order
    trend = trend.lower()
    if trend == "c":    # handles structured arrays
        return sm.add_constant(X, prepend=prepend)
    elif trend == "ct" or trend == "t":
        trendorder = 1
    elif trend == "ctt":
        trendorder = 2
    else:
        # format the message first, then raise it
        raise ValueError("trend %s not understood" % trend)

    X = np.asanyarray(X)
    nobs = len(X)
    # vander returns decreasing powers: [t**order, ..., t, 1]
    trendarr = np.vander(np.arange(1, nobs+1, dtype=float), trendorder+1)
    if trend == "t":
        # drop the constant column, keep the linear trend only
        trendarr = trendarr[:, 0]

    if not X.dtype.names:
        if not prepend:
            return np.column_stack((X, trendarr))
        return np.column_stack((trendarr, X))

    # structured / record array input
    return_rec = X.__class__ is np.recarray
    if trend == "t":
        trend_names = ['trend']
    elif trend == "ct":
        trend_names = ['trend', 'const']
    else:
        trend_names = ['trend_squared', 'trend', 'const']
    # one contiguous 1d float array per new field
    trend_cols = [np.ascontiguousarray(col)
                  for col in np.atleast_2d(trendarr.T)]

    if prepend:
        # build a structured base from the trend columns, then append X's
        # own fields after them
        base = np.empty(nobs, dtype=[(name, float) for name in trend_names])
        for name, col in zip(trend_names, trend_cols):
            base[name] = col
        return nprf.append_fields(base, list(X.dtype.names),
                                  [X[name] for name in X.dtype.names],
                                  usemask=False, asrecarray=return_rec)
    return nprf.append_fields(X, trend_names, trend_cols,
                              usemask=False, asrecarray=return_rec)
it does not work for all types of R models. There are also R scripts included with most of the datasets to run some basic models for comparisons of results to statsmodels. ''' from rpy import r import numpy as np import scikits.statsmodels as sm examples = [1, 2] if 1 in examples: data = sm.datasets.longley.load() y,x = data.endog, sm.add_constant(data.exog) des_cols = ['x.%d' % (i+1) for i in range(x.shape[1])] formula = r('y~%s-1' % '+'.join(des_cols)) frame = r.data_frame(y=y, x=x) results = r.lm(formula, data=frame) print results.keys() print results['coefficients'] if 2 in examples: data2 = sm.datasets.star98.load() y2,x2 = data2.endog, sm.add_constant(data2.exog) import rpy y2 = y2[:,0]/y2.sum(axis=1) des_cols2 = ['x.%d' % (i+1) for i in range(x2.shape[1])] formula2 = r('y~%s-1' % '+'.join(des_cols2)) frame2 = r.data_frame(y=y2, x=x2)
""" import numpy as np import numpy.testing as npt from scipy import signal import scikits.statsmodels as sm from scikits.statsmodels.regression import GLSAR, yule_walker examples_all = range(10) + ["test_copy"] examples = examples_all # [5] if 0 in examples: print "\n Example 0" X = np.arange(1, 8) X = sm.add_constant(X) Y = np.array((1, 3, 4, 5, 8, 10, 9)) rho = 2 model = GLSAR(Y, X, 2) for i in range(6): results = model.fit() print "AR coefficients:", model.rho rho, sigma = yule_walker(results.resid, order=model.order) model = GLSAR(Y, X, rho) par0 = results.params print par0 model0if = GLSAR(Y, X, 2) res = model0if.iterative_fit(6) print "iterativefit beta", res.params results.t() # is this correct? it does equal params/bse