def run(self, results_x, results_z, attach=True):
    '''see class docstring (for now)'''
    if not np.allclose(results_x.model.endog, results_z.model.endog):
        raise ValueError('endogenous variables in models are not the same')
    nobs = results_x.model.endog.shape[0]
    x = results_x.model.exog
    z = results_z.model.exog
    sigma2_x = results_x.ssr / nobs
    sigma2_z = results_z.ssr / nobs
    yhat_x = results_x.fittedvalues
    yhat_z = results_z.fittedvalues
    res_dx = sm.OLS(yhat_x, z).fit()
    err_zx = res_dx.resid
    res_xzx = sm.OLS(err_zx, x).fit()
    err_xzx = res_xzx.resid

    sigma2_zx = sigma2_x + np.dot(err_zx.T, err_zx) / nobs
    c01 = nobs / 2. * (np.log(sigma2_z) - np.log(sigma2_zx))
    v01 = sigma2_x * np.dot(err_xzx.T, err_xzx) / sigma2_zx**2
    q = c01 / np.sqrt(v01)
    pval = 2 * stats.norm.sf(np.abs(q))

    if attach:
        self.res_dx = res_dx
        self.res_xzx = res_xzx
        self.c01 = c01
        self.v01 = v01
        self.q = q
        self.pvalue = pval
        self.dist = stats.norm

    return q, pval
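# A minimal usage sketch for the Cox test above. The enclosing class is not
# shown in this snippet, so `CompareCox` below is a hypothetical name; numpy
# (np) and the statsmodels import (sm) are assumed as elsewhere in this file.
np.random.seed(12345)
nobs = 200
x = sm.add_constant(np.random.randn(nobs, 2), prepend=False)
z = sm.add_constant(np.random.randn(nobs, 2), prepend=False)
y = x[:, :2].sum(1) + 0.5 * np.random.randn(nobs)
res_x = sm.OLS(y, x).fit()   # candidate model with exog x
res_z = sm.OLS(y, z).fit()   # competing non-nested model with exog z
q, pval = CompareCox().run(res_x, res_z)   # CompareCox is a hypothetical class name
print q, pval   # q is compared to a standard normal, as in the code above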
def het_white(y, x, retres=False):
    '''Lagrange Multiplier Heteroscedasticity Test by White

    Notes
    -----
    Assumes x contains a constant (for counting degrees of freedom).

    question: does the f-statistic make sense? constant?

    References
    ----------
    Greene section 11.4.1 5th edition p. 222
    '''
    x = np.asarray(x)
    y = np.asarray(y)**2
    if x.ndim == 1:
        raise ValueError(
            'x should have constant and at least one more variable')
    nobs, nvars0 = x.shape
    # auxiliary regressors: all unique products and squares of the regressors
    i0, i1 = np.triu_indices(nvars0)
    exog = x[:, i0] * x[:, i1]
    nobs, nvars = exog.shape
    assert nvars == nvars0 * (nvars0 - 1) / 2. + nvars0
    resols = sm.OLS(y, exog).fit()   # y was already squared above
    fval = resols.fvalue
    fpval = resols.f_pvalue
    lm = nobs * resols.rsquared
    # Note: degrees of freedom for LM test is nvars minus constant
    lmpval = stats.chi2.sf(lm, nvars - 1)
    return lm, lmpval, fval, fpval
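# Hedged usage sketch for het_white above: fit an OLS model on simulated data
# whose error variance grows with the first regressor, then test the residuals.
# Assumes np and sm are imported as in the rest of this file.
np.random.seed(54321)
nobs = 500
exog = sm.add_constant(np.random.uniform(0, 10, size=(nobs, 2)), prepend=False)
y = exog[:, :2].sum(1) + (1 + 0.5 * exog[:, 0]) * np.random.randn(nobs)
resid = sm.OLS(y, exog).fit().resid
lm, lmpval, fval, fpval = het_white(resid, exog)
print lm, lmpval   # a small lmpval indicates heteroscedasticity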
def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
             nw_overlap=False):
    import scikits.statsmodels.api as sm

    self._x_orig = x
    self._y_orig = y
    self._weights_orig = weights
    self._intercept = intercept
    self._nw_lags = nw_lags
    self._nw_overlap = nw_overlap

    (self._y, self._x, self._weights, self._x_filtered,
     self._index, self._time_has_obs) = self._prepare_data()

    if self._weights is not None:
        self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
        self._y_trans = self._y * np.sqrt(self._weights)
        self.sm_ols = sm.WLS(self._y.values, self._x.values,
                             weights=self._weights.values).fit()
    else:
        self._x_trans = self._x
        self._y_trans = self._y
        self.sm_ols = sm.OLS(self._y.values, self._x.values).fit()
def run(self, results_x, results_z, attach=True):
    '''see class docstring (for now)'''
    if not np.allclose(results_x.model.endog, results_z.model.endog):
        raise ValueError('endogenous variables in models are not the same')
    nobs = results_x.model.endog.shape[0]
    y = results_x.model.endog
    x = results_x.model.exog
    z = results_z.model.exog
    #sigma2_x = results_x.ssr/nobs
    #sigma2_z = results_z.ssr/nobs
    yhat_x = results_x.fittedvalues
    #yhat_z = results_z.fittedvalues
    res_zx = sm.OLS(y, np.column_stack((yhat_x, z))).fit()
    self.res_zx = res_zx  #for testing
    tstat = res_zx.tvalues[0]
    pval = res_zx.pvalues[0]
    if attach:
        self.res_zx = res_zx
        self.dist = stats.t(res_zx.model.df_resid)
        self.teststat = tstat
        self.pvalue = pval

    return tstat, pval
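# Hedged usage sketch for the J test above; the enclosing class name is not
# shown in the snippet, so `CompareJ` below is a hypothetical stand-in.
np.random.seed(13579)
nobs = 200
xj = sm.add_constant(np.random.randn(nobs, 2), prepend=False)
zj = sm.add_constant(np.random.randn(nobs, 2), prepend=False)
yj = xj[:, :2].sum(1) + 0.5 * np.random.randn(nobs)
res_xj = sm.OLS(yj, xj).fit()
res_zj = sm.OLS(yj, zj).fit()
tstat, pval = CompareJ().run(res_xj, res_zj)   # hypothetical class name
# a small pval means the fitted values of the x-model are significant when
# added to the z-model's regressors, as in the t-test coded above
print tstat, pval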
def checkOLS(self, exog, endog, x, y):
    try:
        import scikits.statsmodels.api as sm
    except ImportError:
        import scikits.statsmodels as sm

    reference = sm.OLS(endog, sm.add_constant(exog)).fit()
    result = ols(y=y, x=x)

    assert_almost_equal(reference.params, result._beta_raw)
    assert_almost_equal(reference.df_model, result._df_model_raw)
    assert_almost_equal(reference.df_resid, result._df_resid_raw)
    assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
    assert_almost_equal(reference.pvalues, result._p_value_raw)
    assert_almost_equal(reference.rsquared, result._r2_raw)
    assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
    assert_almost_equal(reference.resid, result._resid_raw)
    assert_almost_equal(reference.bse, result._std_err_raw)
    assert_almost_equal(reference.t(), result._t_stat_raw)
    assert_almost_equal(reference.cov_params(), result._var_beta_raw)
    assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

    _check_non_raw_results(result)
def test_qqplot(self):
    # just test that it runs
    data = sm.datasets.longley.load()
    data.exog = sm.add_constant(data.exog)
    mod_fit = sm.OLS(data.endog, data.exog).fit()
    res = mod_fit.resid
    fig = sm.qqplot(res)
    plt.close(fig)
def setup(self):
    nsample = 100
    sig = 0.5
    x1 = np.linspace(0, 20, nsample)
    x2 = 5 + 3 * np.random.randn(nsample)
    X = np.c_[x1, x2, np.sin(0.5 * x1), (x2 - 5)**2, np.ones(nsample)]
    beta = [0.5, 0.5, 1, -0.04, 5.]
    y_true = np.dot(X, beta)
    y = y_true + sig * np.random.normal(size=nsample)

    exog0 = sm.add_constant(np.c_[x1, x2], prepend=False)
    res = sm.OLS(y, exog0).fit()
    self.res = res
def regression_analysis(play_arr, dataFunction1, dataFunction2):
    totalBefore = []
    totalAfter = []
    for weekNum in range(10, 15):
        Before, After = regression_weekly(play_arr, weekNum, dataFunction1,
                                          dataFunction2)
        totalBefore = np.concatenate([totalBefore, Before])
        totalAfter = np.concatenate([totalAfter, After])

    slope, intercept, r_value, p_value, err = stats.linregress(totalBefore,
                                                               totalAfter)
    results = sm.OLS(totalAfter, sm.add_constant(totalBefore)).fit()
    print results.summary()

    plt.plot(totalBefore, totalAfter, '.')
    X_plot = np.linspace(0, 1, 100)
    plt.plot(X_plot, X_plot * results.params[0] + results.params[1])
    plt.show()
def checkOLS(self, exog, endog, x, y):
    reference = sm.OLS(endog, sm.add_constant(exog, prepend=False)).fit()
    result = ols(y=y, x=x)

    # check that sparse version is the same
    sparse_result = ols(y=y.to_sparse(), x=x.to_sparse())
    _compare_ols_results(result, sparse_result)

    assert_almost_equal(reference.params, result._beta_raw)
    assert_almost_equal(reference.df_model, result._df_model_raw)
    assert_almost_equal(reference.df_resid, result._df_resid_raw)
    assert_almost_equal(reference.fvalue, result._f_stat_raw[0])
    assert_almost_equal(reference.pvalues, result._p_value_raw)
    assert_almost_equal(reference.rsquared, result._r2_raw)
    assert_almost_equal(reference.rsquared_adj, result._r2_adj_raw)
    assert_almost_equal(reference.resid, result._resid_raw)
    assert_almost_equal(reference.bse, result._std_err_raw)
    assert_almost_equal(reference.tvalues, result._t_stat_raw)
    assert_almost_equal(reference.cov_params(), result._var_beta_raw)
    assert_almost_equal(reference.fittedvalues, result._y_fitted_raw)

    _check_non_raw_results(result)
def fit_fixed_nfact(self, nfact):
    if not hasattr(self, 'factors_wconst'):
        self.calc_factors()
    return sm.OLS(self.endog, self.factors[:, :nfact + 1]).fit()
# OLS non-linear curve but linear in parameters
# ---------------------------------------------

nsample = 50
sig = 0.5
x1 = np.linspace(0, 20, nsample)
X = np.c_[x1, np.sin(x1), (x1 - 5)**2, np.ones(nsample)]
beta = [0.5, 0.5, -0.02, 5.]
y_true = np.dot(X, beta)
y = y_true + sig * np.random.normal(size=nsample)

plt.figure()
plt.plot(x1, y, 'o', x1, y_true, 'b-')

res = sm.OLS(y, X).fit()
print res.params
print res.bse
#current bug predict requires call to model.results
#print res.model.predict
prstd, iv_l, iv_u = wls_prediction_std(res)
plt.plot(x1, res.fittedvalues, 'r--.')
plt.plot(x1, iv_u, 'r--')
plt.plot(x1, iv_l, 'r--')
plt.title('blue: true, red: OLS')

print res.summary()


#OLS with dummy variables
#------------------------
sexdummy = data2dummy(dta_used[:, 1])
factors = ['sex']
for k in factors:
    varsused[k][0] = data2dummy(varsused[k][0])

products = [('sex', 'age')]
for k in products:
    varsused[''.join(k)] = data2proddummy(np.c_[varsused[k[0]][0],
                                                varsused[k[1]][0]])

# make dictionary of variables with dummies as one variable
#vars_to_use = {name: data or dummy variables}

X_b0 = np.c_[sexdummy, dta_used[:, 2], np.ones((dta_used.shape[0], 1))]
y_b0 = dta_used[:, 0]
res_b0 = sm.OLS(y_b0, X_b0).fit()
print res_b0.params
print res_b0.ssr

anova_str0 = '''
ANOVA statistics (model sum of squares excludes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i   %(ess)f   %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i   %(ssr)f   %(mse_resid)f
CTotal    %(nobs)i   %(uncentered_tss)f   %(mse_total)f

R squared  %(rsquared)f
'''

anova_str = '''
ANOVA statistics (model sum of squares includes constant)
if __name__ == '__main__':
    #A: josef-pktd

    import scikits.statsmodels.api as sm
    from scikits.statsmodels.api import OLS
    #from scikits.statsmodels.datasets.longley import load
    from scikits.statsmodels.datasets.stackloss import load
    from scikits.statsmodels.iolib.table import (SimpleTable, default_txt_fmt,
                                                 default_latex_fmt,
                                                 default_html_fmt)
    import numpy as np

    data = load()
    data.exog = sm.tools.add_constant(data.exog)
    resols = sm.OLS(data.endog, data.exog).fit()

    print '\n OLS leave 1 out'
    for inidx, outidx in cross_val.LeaveOneOut(len(data.endog)):
        res = sm.OLS(data.endog[inidx], data.exog[inidx, :]).fit()
        print data.endog[outidx], res.model.predict(res.params,
                                                    data.exog[outidx, :]),
        print data.endog[outidx] - res.model.predict(res.params,
                                                     data.exog[outidx, :])

    print '\n OLS leave 2 out'
    resparams = []
    for inidx, outidx in cross_val.LeavePOut(len(data.endog), 2):
        res = sm.OLS(data.endog[inidx], data.exog[inidx, :]).fit()
        #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
        #print ((data.endog[outidx] - res.model.predict(data.exog[outidx,:]))**2).sum()
#0: no rescaling, 1: demean, 2: standardize, 3: standardize and transform back
rescale_ratio = data.endog.std() / data.exog.std(0)
if rescale > 0:
    # rescaling
    data.endog -= data.endog.mean()
    data.exog -= data.exog.mean(0)
if rescale > 1:
    data.endog *= 1. / data.endog.std()
    #data.exog *= 1000./data.exog.var(0)
    data.exog /= data.exog.std(0)
    #rescale_ratio = data.exog.var(0)/data.endog.var()

#skip because mean has been removed, but dimension is hardcoded in table
data.exog = sm.tools.add_constant(data.exog)
ols_model = sm.OLS(data.endog, data.exog)
ols_results = ols_model.fit()

# the Longley dataset is well known to have high multicollinearity
# one way to find the condition number is as follows

#Find OLS parameters for model with one explanatory variable dropped
resparams = np.nan * np.ones((7, 7))
res = sm.OLS(data.endog, data.exog).fit()
resparams[:, 0] = res.params

indall = range(7)
for i in range(6):
    ind = indall[:]
    del ind[i]
def inverse_model(sensor, num_sensors, inputcloudinessbins, num_bins, angle): #Get zone orientation and time for which the zone faces direct sun directsun = facadeorientation.orientation() zone_orientation = directsun[1] starttime = math.floor(float(directsun[2]) + float(directsun[3]) / 60) endtime = math.ceil(float(directsun[5]) + float(directsun[6]) / 60) #Get data from database connection = sqlite3.connect('data.db') cursor = connection.cursor() #define dicts to save data win_daylight = dict() win_sunangle = dict() win_hours = dict() newwin_daylight = dict() sensors = num_sensors cloudinessbins = inputcloudinessbins bins = num_bins cloudy = [ 'Clear', 'Partly Cloudy', 'Scattered Clouds', 'Mostly Cloudy', 'Light Rain', 'Rain', 'Overcast', 'Heavy Rain', 'Fog', 'Haze' ] #number of cloudiness could be user defined''' n = 0 getcloudiness = [] getclouds = [] while n + len(cloudy) / int(cloudinessbins) <= len(cloudy): cloudset = cloudy[n:n + len(cloudy) / int(cloudinessbins)] getclouds.append(cloudset) n = n + len(cloudy) / int(cloudinessbins) #for sensor in range(2,int(sensors)+2,1): #get values for each sensor in the room table = 'light' + str(sensor) daylight = dict() worklight = dict() sunangle = dict() hours = dict() newdaylight = dict() newsunangle = dict() coeffam = dict() constantam = dict() rvalueam = dict() sunangles = dict() sunanglerange = dict() for clouds in range(len(getclouds)): #for each level of cloudiness win_daylight[clouds] = [] win_sunangle[clouds] = [] win_hours[clouds] = [] hours[clouds] = [] newwin_daylight[clouds] = [] newdaylight[clouds] = [] newsunangle[clouds] = [] daylight[clouds] = [] sunangle[clouds] = [] sunanglerange[clouds] = [] sunangles[clouds] = [] clouded = getclouds[clouds] getcloudiness.append(clouded) y1 = [] y2 = [] #print clouded for count in range(len(clouded)): cursor.execute( 'SELECT altitude, hour FROM light1 WHERE unixtime>=1.362175776e+12 AND unixtime<=1.362729428e+12 AND hour>=%s AND hour<=%s AND cloudiness="%s"' % (starttime, endtime, clouded[count])) y1.append(cursor.fetchall()) n = 0 while n <= len(cloudy) / int(cloudinessbins) - 1: for count in y1[n]: altitude = float(count[0]) hour = int(count[1]) if altitude >= 0: win_sunangle[clouds].append(altitude) win_hours[clouds].append(hour) n += 1 for count in range(len(clouded)): cursor.execute( 'SELECT altitude, hour FROM %s WHERE unixtime>=1.362175776e+12 AND unixtime<=1.362729428e+12 AND hour>=%s AND hour<=%s AND cloudiness="%s"' % (table, starttime, endtime, clouded[count])) y2.append(cursor.fetchall()) n = 0 while n <= len(cloudy) / int(cloudinessbins) - 1: for count in y2[n]: altitude = float(count[0]) hour = int(count[1]) if altitude >= 0: sunangle[clouds].append(altitude) hours[clouds].append(hour) n += 1 if len(sunangle[clouds]) > len(win_sunangle[clouds]): datalength = len(win_sunangle[clouds]) else: datalength = len(sunangle[clouds]) win_daylight[clouds] = dict() daylight[clouds] = dict() coeffam[clouds] = dict() constantam[clouds] = dict() rvalueam[clouds] = dict() if len(sunangle[clouds]) > 1 and len(win_sunangle[clouds]) > 1: if max(sunangle[clouds]) >= max(win_sunangle[clouds]): if min(sunangle[clouds]) >= min(win_sunangle[clouds]): sunanglerange[clouds] = np.arange( math.floor(min(sunangle[clouds])), math.ceil(max(win_sunangle[clouds])), int(bins)) else: sunanglerange[clouds] = np.arange( math.floor(min(win_sunangle[clouds])), math.ceil(max(win_sunangle[clouds])), int(bins)) else: if min(sunangle[clouds]) >= min(win_sunangle[clouds]): sunanglerange[clouds] = np.arange( 
math.floor(min(sunangle[clouds])), math.ceil(max(sunangle[clouds])), int(bins)) else: sunanglerange[clouds] = np.arange( math.floor(min(win_sunangle[clouds])), math.ceil(max(sunangle[clouds])), int(bins)) #return sunanglerange[clouds] y3 = dict() y4 = dict() #for angle in range(len(sunanglerange[clouds])-1): angle = angle_range y3[angle] = [] for count in range(len(clouded)): cursor.execute( 'SELECT light FROM light1 WHERE unixtime>=1.362175776e+12 AND unixtime<=1.362729428e+12 AND hour>=%s AND hour<=%s AND cloudiness="%s" AND altitude>=%s AND altitude<=%s' % (starttime, endtime, clouded[count], sunanglerange[clouds][angle], sunanglerange[clouds][angle + 1])) y3[angle].append(cursor.fetchall()) #print sunanglerange[clouds][angle],y3[angle],clouded win_daylight[clouds][angle] = [] n = 0 while n <= len(cloudy) / int(cloudinessbins) - 1: for count in y3[angle][n]: if float(count[0]) > 1: actualdaylight = round(float(count[0]), 2) win_daylight[clouds][angle].append(actualdaylight) n += 1 #print len(win_daylight[clouds][angle]),' ',clouded,' ',sunanglerange[clouds][angle] #for angle in range(len(sunanglerange[clouds])-1): y4[angle] = [] for count in range(len(clouded)): cursor.execute( 'SELECT light FROM %s WHERE unixtime>=1.362175776e+12 AND unixtime<=1.362729428e+12 AND hour>=%s AND hour<=%s AND cloudiness="%s" AND altitude>=%s AND altitude<=%s' % (table, starttime, endtime, clouded[count], sunanglerange[clouds][angle], sunanglerange[clouds][angle + 1])) y4[angle].append(cursor.fetchall()) daylight[clouds][angle] = [] coeffam[clouds][angle] = [] constantam[clouds][angle] = [] rvalueam[clouds][angle] = [] n = 0 while n <= len(cloudy) / int(cloudinessbins) - 1: for count in y4[angle][n]: if float(count[0]) > 1: actualworklight = round(float(count[0]), 2) daylight[clouds][angle].append(actualworklight) n += 1 #print len(daylight[clouds][angle]),' ',clouded,' ',sunanglerange[clouds][angle] if len(win_daylight[clouds][angle]) > len(daylight[clouds][angle]): finaldatalength = len(daylight[clouds][angle]) else: finaldatalength = len(win_daylight[clouds][angle]) if finaldatalength > 2: adata = vstack((win_daylight[clouds][angle][0:finaldatalength], daylight[clouds][angle][0:finaldatalength])) realadata = adata.transpose() x = realadata[:, 0] y = realadata[:, 1] X = sm.add_constant(x) model = sm.OLS(y, X).fit() if len(model.params) > 1: coeffam[clouds][angle] = round(model.params[0], 3) constantam[clouds][angle] = round(model.params[1], 3) rvalueam[clouds][angle] = round(model.rsquared, 3) return [ coeffam[clouds][angle], constantam[clouds][angle], rvalueam[clouds][angle], clouded, sunanglerange[clouds][angle] ]
while n <= len(cloudy) / int(cloudinessbins) - 1:
    for count in y4[angle][n]:
        if float(count[0]) > 1:
            actualworklight = round(float(count[0]), 2)
            daylight[clouds][angle].append(actualworklight)
    n += 1
#print len(daylight[clouds][angle]),' ',clouded,' ',sunanglerange[clouds][angle]

if len(win_daylight[clouds][angle]) > len(daylight[clouds][angle]):
    finaldatalength = len(daylight[clouds][angle])
else:
    finaldatalength = len(win_daylight[clouds][angle])

#filename = "coefficients_" + str(sunanglerange[clouds][angle]) + '_' + str(clouds) + ".txt"
filename = "coefficients.txt"
#savedata = open('C:\Users\chandrayee\Documents\GitHub\sensor-placement\\coefficients1_10_2sun\\' + filename, 'a')

## PERFORM OLS
if finaldatalength > 2:
    adata = vstack((win_daylight[clouds][angle][0:finaldatalength],
                    daylight[clouds][angle][0:finaldatalength]))
    realadata = adata.transpose()
    x = realadata[:, 0]
    y = realadata[:, 1]
    X = sm.add_constant(x)
    model = sm.OLS(y, X).fit()
    if len(model.params) > 1:
        coeffam[clouds][angle] = round(model.params[0], 3)
        constantam[clouds][angle] = round(model.params[1], 3)
        rvalueam[clouds][angle] = round(model.rsquared, 3)
        print "The coeff, constant and rvalue are", coeffam[clouds][angle], \
            constantam[clouds][angle], rvalueam[clouds][angle], "for", \
            clouded, "and angle", sunanglerange[clouds][angle]
        data = (str(coeffam[clouds][angle]) + '\t' +
                str(constantam[clouds][angle]) + '\t' +
                str(rvalueam[clouds][angle]) + '\t' +
                str(clouds) + '\t' +
                str(sunanglerange[clouds][angle]) + '\n')
        #savedata.write(data)
        #savedata.close()
pl.savefig('vzv_forest.pdf')

### @export 'OLS'
pl.figure()

import scikits.statsmodels.api as sm

Y = df['Parameter Value'].__array__()
X = .5 * (df['Age Start'] + df['Age End']).__array__()
pl.plot(X, Y, 'ks', label='Observed', mec='w', mew=1)

XX = sm.add_constant(X)
X_pred = pl.arange(65)
XX_pred = sm.add_constant(X_pred)

model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)
pl.plot(X_pred, Y_pred, 'k-', linewidth=2, label='Predicted by OLS')

Y = mc.logit(df['Parameter Value'].__array__())
model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)
pl.plot(X_pred, mc.invlogit(Y_pred), 'k--', linewidth=2,
        label='Predicted by logit-transformed OLS')
def acorr_lm(x, maxlag=None, autolag='AIC', store=False):
    '''Lagrange Multiplier tests for autocorrelation

    not checked yet, copied from unitroot_adf with adjustments
    check array shapes because of the addition of the constant.
    written/copied without reference
    This is not Breusch-Godfrey. BG adds lags of the residual to exog in the
    design matrix for the auxiliary regression with residuals as endog,
    see Greene 12.7.1.

    Notes
    -----
    If x is calculated as y^2 for a time series y, then this test corresponds
    to the Engle test for autoregressive conditional heteroscedasticity (ARCH).
    TODO: get details and verify
    '''
    x = np.asarray(x)
    nobs = x.shape[0]
    if maxlag is None:
        # default for adf from Greene referencing Schwert 1989
        maxlag = int(np.ceil(12. * np.power(nobs / 100., 1 / 4.)))
        #nobs//4  #TODO: check default, or do AIC/BIC

    xdiff = np.diff(x)
    #
    xdall = lagmat(x[:-1, None], maxlag, trim='both')
    nobs = xdall.shape[0]
    xdall = np.c_[np.ones((nobs, 1)), xdall]
    xshort = x[-nobs:]

    if store:
        resstore = ResultsStore()

    if autolag:
        # search for lag length with the best (lowest) information criterion
        # Note: use the same number of observations to have comparable IC
        results = {}
        for mlag in range(1, maxlag):
            results[mlag] = sm.OLS(xshort, xdall[:, :mlag + 1]).fit()

        if autolag.lower() == 'aic':
            icbest, icbestlag = min((v.aic, k) for k, v in results.iteritems())
        elif autolag.lower() == 'bic':
            icbest, icbestlag = min((v.bic, k) for k, v in results.iteritems())
        else:
            raise ValueError("autolag can only be None, 'AIC' or 'BIC'")

        # rerun ols with best ic
        xdall = lagmat(x[:, None], icbestlag, trim='forward')
        nobs = xdall.shape[0]
        xdall = np.c_[np.ones((nobs, 1)), xdall]
        xshort = x[-nobs:]
        usedlag = icbestlag
    else:
        usedlag = maxlag

    resols = sm.OLS(xshort, xdall[:, :usedlag + 1]).fit()
    fval = resols.fvalue
    fpval = resols.f_pvalue
    lm = nobs * resols.rsquared
    # Note: degrees of freedom for LM test is nvars minus constant = usedlag
    lmpval = stats.chi2.sf(lm, usedlag)

    if store:
        resstore.resols = resols
        resstore.usedlag = usedlag
        return fval, fpval, lm, lmpval, resstore
    else:
        return fval, fpval, lm, lmpval
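# Hedged usage sketch for acorr_lm above, on a simulated AR(1) series.
# Assumes np is imported and that lagmat is available in this module, as the
# function body already requires.
np.random.seed(9876)
nobs_ar = 500
e = np.random.randn(nobs_ar + 1)
xar = np.zeros(nobs_ar + 1)
for t in range(1, nobs_ar + 1):
    xar[t] = 0.6 * xar[t - 1] + e[t]   # serially correlated series
fval, fpval, lm, lmpval = acorr_lm(xar[1:], maxlag=4, store=False)
print lm, lmpval   # a small lmpval indicates autocorrelation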
def loglikeobs(self, params):
    beta = params[:-1]
    sigma = params[-1]
    xb = np.dot(self.exog, beta)
    return stats.norm.logpdf(self.endog, loc=xb, scale=sigma)

mod_norm2 = MygMLE(datal.endog, datal.exog)
#res_norm = mod_norm.fit(start_params=np.ones(datal.exog.shape[1]+1),
#                        method="nm", maxiter=500)
res_norm2 = mod_norm2.fit(start_params=[1.] * datal.exog.shape[1] + [1],
                          method="nm", maxiter=500)
print res_norm2.params

res2 = sm.OLS(datal.endog, datal.exog).fit()
start_params = np.hstack((res2.params, np.sqrt(res2.mse_resid)))
res_norm3 = mod_norm2.fit(start_params=start_params, method="nm",
                          maxiter=500, retall=0)
print start_params
print res_norm3.params
print res2.bse
#print res_norm3.bse  # not available
print 'llf', res2.llf, res_norm3.llf

bse = np.sqrt(np.diag(np.linalg.inv(res_norm3.model.hessian(res_norm3.params))))
res_norm3.model.score(res_norm3.params)
import scikits.statsmodels.api as sm
from scikits.statsmodels.miscmodels import TLinearModel

#Example:
#np.random.seed(98765678)
nobs = 50
nvars = 6
df = 3
rvs = np.random.randn(nobs, nvars - 1)
data_exog = sm.add_constant(rvs)
xbeta = 0.9 + 0.1 * rvs.sum(1)
data_endog = xbeta + 0.1 * np.random.standard_t(df, size=nobs)
print 'variance of endog:', data_endog.var()
print 'true parameters:', [0.1] * nvars + [0.9]

res_ols = sm.OLS(data_endog, data_exog).fit()
print '\nResults with ols'
print '----------------'
print res_ols.scale
print np.sqrt(res_ols.scale)
print res_ols.params
print res_ols.bse

kurt = stats.kurtosis(res_ols.resid)
df_fromkurt = 6. / kurt + 4
print 'df_fromkurt from ols residuals', df_fromkurt
print stats.t.stats(df_fromkurt, moments='mvsk')
print stats.t.stats(df, moments='mvsk')

modp = TLinearModel(data_endog, data_exog)
start_value = 0.1 * np.ones(data_exog.shape[1] + 2)
#start_value = np.zeros(data_exog.shape[1]+2)
x = np.ones((nobs, 2))
x[:, 1] = np.arange(nobs) / 20.
y = x.sum(1) + 1.01 * (1 + 1.5 * (x[:, 1] > 10)) * np.random.rand(nobs)
print het_goldfeldquandt(y, x, 1)

y = x.sum(1) + 1.01 * (1 + 0.5 * (x[:, 1] > 10)) * np.random.rand(nobs)
print het_goldfeldquandt(y, x, 1)

y = x.sum(1) + 1.01 * (1 - 0.5 * (x[:, 1] > 10)) * np.random.rand(nobs)
print het_goldfeldquandt(y, x, 1)

print het_breushpagan(y, x)
print het_white(y, x)

f, p = het_goldfeldquandt(y, x, 1)
print f, p
resgq = het_goldfeldquandt(y, x, 1, retres=True)
print resgq

#this is just a syntax check:
print neweywestcov(y, x)

resols1 = sm.OLS(y, x).fit()
print neweywestcov(resols1.resid, x)
print resols1.cov_params()
print resols1.HC0_se
print resols1.cov_HC0

y = x.sum(1) + 10. * (1 - 0.5 * (x[:, 1] > 10)) * np.random.rand(nobs)
print HetGoldfeldQuandt().run(y, x, 1, alternative='dec')
import numpy as np
import scikits.statsmodels.api as sm

# create some data set
nsample = 50
sig = 0.25
x1 = np.linspace(0, 20, nsample)
X = np.c_[x1, np.sin(x1), (x1 - 5)**2, np.ones(nsample)]
beta = [0.5, 0.5, -0.02, 5.]
y_true = np.dot(X, beta)
y = y_true + sig * np.random.normal(size=nsample)

# setup and estimate the model
olsmod = sm.OLS(y, X)
olsres = olsmod.fit()
print olsres.params
print olsres.bse

# use predict method of model class, not in the results class
# (we had a discussion but it is still in the model)
ypred = olsmod.predict(X)  # predict insample

# create a new sample of explanatory variables Xnew, predict and plot
x1n = np.linspace(20.5, 25, 10)
Xnew = np.c_[x1n, np.sin(x1n), (x1n - 5)**2, np.ones(10)]
ynewpred = olsmod.predict(Xnew)  # predict out of sample
print ypred
else:
    y2 = 19 + 17 * x2 + 2 * np.random.randn(nobs)

x2 = sm.add_constant(x2)

# stack
x = np.concatenate((x1, x2), 0)
y = np.concatenate((y1, y2))
if example_groups == '2':
    groupind = (np.arange(2 * nobs) > nobs - 1).astype(int)
else:
    groupind = np.mod(np.arange(2 * nobs), 4)
    groupind.sort()
#x = np.column_stack((x,x*groupind[:,None]))

res1 = sm.OLS(y, x).fit()
skip = 8

rresid, rparams, rypred, rresid_standardized, rresid_scaled, rcusum, rcusumci = \
    recursive_olsresiduals(res1, skip)
print rcusum
print rresid_scaled[skip - 1:]

assert_almost_equal(rparams[-1], res1.params)

import matplotlib.pyplot as plt
plt.plot(rcusum)
plt.plot(rcusumci[0])
plt.plot(rcusumci[1])
plt.figure()
plt.plot(rresid)
def het_goldfeldquandt2(y, x, idx, split=None, retres=False):
    '''test whether the variance is the same in two subsamples

    Parameters
    ----------
    y : array_like
        endogenous variable
    x : array_like
        exogenous variable, regressors
    idx : integer
        column index of the variable according to which observations are
        sorted for the split
    split : None or integer or float in interval (0,1)
        index at which the sample is split.
        If 0 < split < 1, then split is interpreted as the fraction of
        observations in the first sample
    retres : boolean
        if true, then an instance of a result class is returned, otherwise
        two numbers, fvalue and p-value, are returned

    Returns
    -------
    (fval, pval) or res
    fval : float
        value of the F-statistic
    pval : float
        p-value of the hypothesis that the variance in one subsample is
        larger than in the other subsample
    res : instance of result class
        The class instance is just storage for the intermediate and final
        results that are calculated

    Notes
    -----
    TODO: add resultinstance - DONE
        maybe add drop-middle as option
        maybe allow for several breaks

    recommendation for users: use this function as a pattern for more
    flexible splits in tests, e.g. drop middle; a Chow test for a structural
    break can be done in the same way.

    ran sanity check
    '''
    x = np.asarray(x)
    y = np.asarray(y)
    nobs, nvars = x.shape
    if split is None:
        split = nobs // 2
    elif (0 < split) and (split < 1):
        split = int(nobs * split)

    xsortind = np.argsort(x[:, idx])
    y = y[xsortind]
    x = x[xsortind, :]
    resols1 = sm.OLS(y[:split], x[:split]).fit()
    resols2 = sm.OLS(y[split:], x[split:]).fit()
    fval = resols1.mse_resid / resols2.mse_resid
    if fval > 1:
        fpval = stats.f.sf(fval, resols1.df_resid, resols2.df_resid)
        ordering = 'larger'
    else:
        fval = 1. / fval
        fpval = stats.f.sf(fval, resols2.df_resid, resols1.df_resid)
        ordering = 'smaller'

    if retres:
        res = ResultsStore()
        res.__doc__ = 'Test Results for Goldfeld-Quandt test of heterogeneity'
        res.fval = fval
        res.fpval = fpval
        res.df_fval = (resols2.df_resid, resols1.df_resid)
        res.resols1 = resols1
        res.resols2 = resols2
        res.ordering = ordering
        res.split = split
        #res.__str__
        res._str = '''The Goldfeld-Quandt test for the null hypothesis that the
variance in the second subsample is %s than in the first subsample:
F-statistic =%8.4f and p-value =%8.4f''' % (ordering, fval, fpval)

        return res
    else:
        return fval, fpval
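# Hedged usage sketch for het_goldfeldquandt2 above: the error variance grows
# with the regressor in column 0, so splitting along that column should give
# an F-statistic above 1. Assumes np and sm are imported as in this file.
np.random.seed(2468)
nobs_gq = 300
xgq = sm.add_constant(np.random.uniform(0, 10, size=(nobs_gq, 1)), prepend=False)
ygq = 1. + 2. * xgq[:, 0] + (0.5 + 0.5 * xgq[:, 0]) * np.random.randn(nobs_gq)
fval, fpval = het_goldfeldquandt2(ygq, xgq, 0)
print fval, fpval   # a small fpval indicates unequal variances in the two halves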
#Note: full parameterization of dummies is orthogonal
#np.eye(6)*10 in "large" example
print (np.dot(dd_full.T, dd_full) == np.diag(dd_full.sum(0))).all()

#check that transforms work
#generate 3 data sets with the 3 different parameterizations

effect_size = [1., 0.01][1]
noise_scale = [0.001, 0.1][0]
noise = noise_scale * np.random.randn(nobs)
beta = effect_size * np.arange(1, 7)
ydata_full = (dd_full * beta).sum(1) + noise
ydata_dropl = (dd_dropl * beta).sum(1) + noise
ydata_dropf = (dd_dropf * beta).sum(1) + noise

resols_full_full = sm.OLS(ydata_full, dd_full).fit()
resols_full_dropf = sm.OLS(ydata_full, dd_dropf).fit()
params_f_f = resols_full_full.params
params_f_df = resols_full_dropf.params

resols_dropf_full = sm.OLS(ydata_dropf, dd_full).fit()
resols_dropf_dropf = sm.OLS(ydata_dropf, dd_dropf).fit()
params_df_f = resols_dropf_full.params
params_df_df = resols_dropf_dropf.params

tr_of = np.linalg.lstsq(dd_dropf, dd_full)[0]
tr_fo = np.linalg.lstsq(dd_full, dd_dropf)[0]
print np.dot(tr_fo, params_df_df) - params_df_f
print np.dot(tr_of, params_f_f) - params_f_df

transf_f_df = DummyTransform(dd_full, dd_dropf)
def run(self, y, x, idx=None, split=None, drop=None,
        alternative='increasing', attach=True):
    '''see class docstring'''
    x = np.asarray(x)
    y = np.asarray(y)  #**2
    nobs, nvars = x.shape
    if split is None:
        split = nobs // 2
    elif (0 < split) and (split < 1):
        split = int(nobs * split)

    if drop is None:
        start2 = split
    elif (0 < drop) and (drop < 1):
        start2 = split + int(nobs * drop)
    else:
        start2 = split + drop

    if not idx is None:
        xsortind = np.argsort(x[:, idx])
        y = y[xsortind]
        x = x[xsortind, :]

    resols1 = sm.OLS(y[:split], x[:split]).fit()
    resols2 = sm.OLS(y[start2:], x[start2:]).fit()
    fval = resols2.mse_resid / resols1.mse_resid
    #if fval>1:
    if alternative.lower() in ['i', 'inc', 'increasing']:
        fpval = stats.f.sf(fval, resols1.df_resid, resols2.df_resid)
        ordering = 'increasing'
    elif alternative.lower() in ['d', 'dec', 'decreasing']:
        fval = 1. / fval
        fpval = stats.f.sf(fval, resols2.df_resid, resols1.df_resid)
        ordering = 'decreasing'
    elif alternative.lower() in ['2', '2-sided', 'two-sided']:
        fpval_sm = stats.f.cdf(fval, resols2.df_resid, resols1.df_resid)
        fpval_la = stats.f.sf(fval, resols2.df_resid, resols1.df_resid)
        fpval = 2 * min(fpval_sm, fpval_la)
        ordering = 'two-sided'
    else:
        raise ValueError('invalid alternative')

    if attach:
        res = self
        res.__doc__ = 'Test Results for Goldfeld-Quandt test of heterogeneity'
        res.fval = fval
        res.fpval = fpval
        res.df_fval = (resols2.df_resid, resols1.df_resid)
        res.resols1 = resols1
        res.resols2 = resols2
        res.ordering = ordering
        res.split = split
        #res.__str__ #TODO: check if string works
        res._str = '''The Goldfeld-Quandt test for the null hypothesis that the
variance in the second subsample is %s than in the first subsample:
F-statistic =%8.4f and p-value =%8.4f''' % (ordering, fval, fpval)

    return fval, fpval, ordering
dta = data.load()
gdp = np.log(dta.data['realgdp'])

from numpy import polynomial
from scipy import special

maxorder = 20
polybase = special.chebyt
polybase = special.legendre

t = np.linspace(-1, 1, len(gdp))

exog = np.column_stack([polybase(i)(t) for i in range(maxorder)])
fitted = [sm.OLS(gdp, exog[:, :maxr]).fit().fittedvalues
          for maxr in range(2, maxorder)]

print (np.corrcoef(exog[:, 1:6], rowvar=0) * 10000).astype(int)

import matplotlib.pyplot as plt
plt.figure()
plt.plot(gdp, 'o')
for i in range(maxorder - 2):
    plt.plot(fitted[i])

plt.figure()
#plt.plot(gdp, 'o')
for i in range(maxorder - 4, maxorder - 2):
def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
    '''estimate the model and selection criteria for up to maxfact factors

    The selection criteria that are calculated are AIC, BIC, and R2_adj.
    Additionally, the cross-validation prediction error sum of squares is
    calculated if `skip_crossval` is false. Cross-validation is not used by
    default because it can be time consuming to calculate.

    By default the cross-validation method is Leave-one-out on the full
    dataset. A different cross-validation sample can be specified as an
    argument to cv_iter.

    Results are attached in `results_find_nfact`
    '''
    #print 'OLS on Factors'
    if not hasattr(self, 'factors'):
        self.calc_factors()

    hasconst = self.hasconst
    if maxfact is None:
        maxfact = self.factors.shape[1] - hasconst

    if (maxfact + hasconst) < 1:
        raise ValueError('nothing to do, number of factors (incl. constant) '
                         'should be at least 1')

    #temporary safety
    maxfact = min(maxfact, 10)

    y0 = self.endog
    results = []
    #xred, fact, eva, eve = pca(x0, keepdim=0, normalize=1)
    for k in range(1, maxfact + hasconst):  # k now includes the constant
        #xred, fact, eva, eve = pca(x0, keepdim=k, normalize=1)
        # this is faster and same result
        fact = self.factors[:, :k]
        res = sm.OLS(y0, fact).fit()
##        print 'k =', k
##        print res.params
##        print 'aic:  ', res.aic
##        print 'bic:  ', res.bic
##        print 'llf:  ', res.llf
##        print 'R2    ', res.rsquared
##        print 'R2 adj', res.rsquared_adj

        if not skip_crossval:
            if cv_iter is None:
                cv_iter = LeaveOneOut(len(y0))
            prederr2 = 0.
            for inidx, outidx in cv_iter:
                res_l1o = sm.OLS(y0[inidx], fact[inidx, :]).fit()
                #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
                prederr2 += (y0[outidx] -
                             res_l1o.model.predict(fact[outidx, :]))**2.
        else:
            prederr2 = np.nan

        results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

    self.results_find_nfact = results = np.array(results)
    self.best_nfact = np.r_[(np.argmin(results[:, 1:3], 0),
                             np.argmax(results[:, 3], 0),
                             np.argmin(results[:, -1], 0))]
def het_breushpagan(resid, x, exog=None):
    '''Lagrange Multiplier Heteroscedasticity Test by Breusch-Pagan

    This tests the hypothesis that the residual variance does not depend on
    the variables in x in the form

    :math:`\\sigma_i = \\sigma * f(\\alpha_0 + \\alpha z_i)`

    Homoscedasticity implies that :math:`\\alpha = 0`.

    Parameters
    ----------
    resid : array_like, (nobs,)
        For the Breusch-Pagan test, this should be the residual of a
        regression. If an array is given in exog, then the residuals are
        calculated by an OLS regression of resid on exog. In this case
        resid should contain the dependent variable. exog can be the same
        as x.
    x : array_like, (nobs, nvars)
        This contains variables that might create data dependent
        heteroscedasticity.

    Returns
    -------
    lm : float
        lagrange multiplier statistic
    lm_pvalue : float
        p-value of lagrange multiplier test
    fvalue : float
        f-statistic of the hypothesis that the error variance does not
        depend on x
    f_pvalue : float
        p-value for the f-statistic

    Notes
    -----
    Assumes x contains constant (for counting dof and calculation of R^2).
    In the general description of the LM test, Greene mentions that this test
    exaggerates the significance of results in small or moderately large
    samples. In this case the F-statistic is preferable.

    *Verification*

    Chisquare test statistic is exactly (<1e-13) the same result as bptest
    in R-stats with defaults (studentize=True).

    Implementation
    This is calculated using the generic formula for the LM test using R^2
    (Greene, section 17.6) and not with the explicit formula
    (Greene, section 11.4.3).

    References
    ----------
    http://en.wikipedia.org/wiki/Breusch%E2%80%93Pagan_test
    Greene 5th edition
    Breusch, Pagan article
    '''
    if not exog is None:
        resid = sm.OLS(resid, exog).fit().resid

    x = np.asarray(x)
    y = np.asarray(resid)**2
    nobs, nvars = x.shape
    resols = sm.OLS(y, x).fit()
    fval = resols.fvalue
    fpval = resols.f_pvalue
    lm = nobs * resols.rsquared
    # Note: degrees of freedom for LM test is nvars minus constant
    return lm, stats.chi2.sf(lm, nvars - 1), fval, fpval
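# Hedged usage sketch for het_breushpagan above, on simulated data whose error
# variance depends on the first regressor. Assumes np and sm are imported as
# elsewhere in this file.
np.random.seed(1357)
nobs_bp = 400
exog_bp = sm.add_constant(np.random.uniform(0, 10, size=(nobs_bp, 2)), prepend=False)
y_bp = exog_bp[:, :2].sum(1) + (1 + 0.5 * exog_bp[:, 0]) * np.random.randn(nobs_bp)
resid_bp = sm.OLS(y_bp, exog_bp).fit().resid
lm, lmpval, fval, fpval = het_breushpagan(resid_bp, exog_bp)
print lm, lmpval   # a small lmpval rejects homoscedasticity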
print 'mutualinfo_kde normed', mutualinfo_kde(y, x)
print 'mutualinfo_kde       ', mutualinfo_kde(y, x, normed=False)
mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
    mutualinfo_binned(y, x, 5, normed=True)
print 'mutualinfo_binned normed', mi_normed
print 'mutualinfo_binned       ', mi_obs.sum()

mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
    mutualinfo_binned(y, x, 'auto', normed=True)
print 'auto'
print 'mutualinfo_binned normed', mi_normed
print 'mutualinfo_binned       ', mi_obs.sum()

ys = np.sort(y)
xs = np.sort(x)
by = ys[((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)]
bx = xs[((nobs - 1) * np.array([0, 0.25, 0.4, 0.6, 0.75, 1])).astype(int)]
mi_normed, (pyx2, py2, px2, binsy2, binsx2), mi_obs = \
    mutualinfo_binned(y, x, (by, bx), normed=True)
print 'quantiles'
print 'mutualinfo_binned normed', mi_normed
print 'mutualinfo_binned       ', mi_obs.sum()

doplot = 1  #False
if doplot:
    import matplotlib.pyplot as plt
    plt.plot(x, y, 'o')
    olsres = sm.OLS(y, exog).fit()
    plt.plot(x, olsres.fittedvalues)