def __init__(self, y, x, intercept=True, weights=None, nw_lags=None,
             nw_overlap=False):
    import scikits.statsmodels.api as sm

    self._x_orig = x
    self._y_orig = y
    self._weights_orig = weights
    self._intercept = intercept
    self._nw_lags = nw_lags
    self._nw_overlap = nw_overlap

    (self._y, self._x, self._weights, self._x_filtered,
     self._index, self._time_has_obs) = self._prepare_data()

    if self._weights is not None:
        # scale both sides by sqrt(weights); WLS on the original data is
        # equivalent to OLS on the transformed data
        self._x_trans = self._x.mul(np.sqrt(self._weights), axis=0)
        self._y_trans = self._y * np.sqrt(self._weights)
        self.sm_ols = sm.WLS(self._y.values,
                             self._x.values,
                             weights=self._weights.values).fit()
    else:
        self._x_trans = self._x
        self._y_trans = self._y
        self.sm_ols = sm.OLS(self._y.values,
                             self._x.values).fit()
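# Illustrative sketch (not part of the class above): the constructor relies on
# the identity that WLS(y, X, weights=w) yields the same coefficients as OLS
# run on y and X scaled row-wise by sqrt(w).  The check below uses the modern
# `statsmodels.api` namespace, which is an assumption relative to the
# `scikits.statsmodels.api` import used in the class.
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
X_demo = sm.add_constant(rng.uniform(0, 10, size=(100, 2)))
w_demo = rng.uniform(0.5, 2.0, size=100)
y_demo = X_demo.dot([1.0, 2.0, -0.5]) + rng.normal(size=100) / np.sqrt(w_demo)

wls_beta = sm.WLS(y_demo, X_demo, weights=w_demo).fit().params
ols_beta = sm.OLS(y_demo * np.sqrt(w_demo),
                  X_demo * np.sqrt(w_demo)[:, None]).fit().params
assert np.allclose(wls_beta, ols_beta)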
def _check_wls(self, x, y, weights):
    result = ols(y=y, x=x, weights=1 / weights)

    combined = x.copy()
    combined['__y__'] = y
    combined['__weights__'] = weights
    combined = combined.dropna()

    endog = combined.pop('__y__').values
    aweights = combined.pop('__weights__').values
    exog = sm.add_constant(combined.values, prepend=False)

    sm_result = sm.WLS(endog, exog, weights=1 / aweights).fit()

    assert_almost_equal(sm_result.params, result._beta_raw)
    assert_almost_equal(sm_result.resid, result._resid_raw)

    self.checkMovingOLS('rolling', x, y, weights=weights)
    self.checkMovingOLS('expanding', x, y, weights=weights)
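# Aside (an illustrative snippet, names local to it): `prepend=False` above
# appends the constant as the last column of the design rather than the first,
# presumably so the coefficient order lines up with the pandas ols result it
# is compared against.
import numpy as np
import statsmodels.api as sm

demo = np.arange(6.0).reshape(3, 2)
demo_with_const = sm.add_constant(demo, prepend=False)
assert np.allclose(demo_with_const[:, -1], 1.0)   # constant ends up last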
ols_fit = sm.OLS(data.endog, data.exog).fit()

# perhaps the residuals from this fit depend on the square of income
incomesq = data.exog[:, 2]
plt.scatter(incomesq, ols_fit.resid)
plt.grid()

# If we think that the variance is proportional to income**2,
# we would want to weight the regression by income;
# the weights argument in WLS weights the regression by its square root,
# and since income enters the equation, if we have income/income
# it becomes the constant, so we would want to perform
# this type of regression without an explicit constant in the design.
#data.exog = data.exog[:, :-1]
wls_fit = sm.WLS(data.endog, data.exog[:, :-1], weights=1 / incomesq).fit()

# This, however, leads to difficulties in interpreting the post-estimation
# statistics.  Statsmodels does not yet handle this elegantly, but
# the following may be more appropriate.

# explained sum of squares
ess = wls_fit.uncentered_tss - wls_fit.ssr
# rsquared
rsquared = ess / wls_fit.uncentered_tss
# mean squared error of the model
mse_model = ess / (wls_fit.df_model + 1)  # add back the dof of the constant
# f statistic
fvalue = mse_model / wls_fit.mse_resid
# adjusted r-squared
rsquared_adj = 1 - (wls_fit.nobs) / (wls_fit.df_resid) * (1 - rsquared)
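# Illustrative sketch (separate toy data, not the example's dataset): when the
# design carries no explicit constant, statsmodels computes R-squared against
# the *uncentered* total sum of squares, which is exactly the manual recipe
# above (ess = uncentered_tss - ssr).
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(42)
income = rng.uniform(1.0, 10.0, size=200)
X_nc = np.column_stack([income, income ** 2])       # no explicit constant
y_nc = 3.0 * income + 0.1 * income ** 2 + rng.normal(scale=income)

fit_nc = sm.WLS(y_nc, X_nc, weights=1.0 / income ** 2).fit()
ess_nc = fit_nc.uncentered_tss - fit_nc.ssr
assert np.allclose(ess_nc / fit_nc.uncentered_tss, fit_nc.rsquared)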
#use correction
#sandwich estimators of parameter covariance matrix
print 'heteroscedasticity corrected standard error of beta estimates'
print res2.HC0_se
print res2.HC1_se
print res2.HC2_se
print res2.HC3_se

#print res.predict
#plt.plot(x1, res2.fittedvalues, '--')

#WLS knowing the true variance ratio of heteroscedasticity
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

print '\nWLS'
res3 = sm.WLS(y2, X[:, [0, 2]], 1. / w).fit()
print 'WLS beta estimates'
print res3.params
print 'WLS stddev of beta'
print res3.bse
#print res.predict
#plt.plot(x1, res3.fittedvalues, '--.')

#Detour: write a function for prediction standard errors

#Prediction Interval for OLS
#---------------------------
covb = res2.cov_params()
# full covariance:
#predvar = res2.mse_resid + np.diag(np.dot(X2, np.dot(covb, X2.T)))
# prediction variance only
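# Illustrative sketch (self-contained toy data; names are local to this
# snippet): the HC0 standard errors printed above are the square roots of the
# diagonal of the White sandwich estimator
#     (X'X)^-1  X' diag(e_i**2) X  (X'X)^-1
# which the few lines below reproduce directly.
import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(1)
xh = rng.uniform(0, 20, size=80)
Xh = sm.add_constant(xh)
yh = 5.0 + 0.5 * xh + rng.normal(size=80) * (1.0 + xh / 10.0)  # heteroscedastic

res_h = sm.OLS(yh, Xh).fit()
xtx_inv = np.linalg.inv(np.dot(Xh.T, Xh))
meat = np.dot(Xh.T * res_h.resid ** 2, Xh)          # X' diag(e**2) X
hc0_cov = np.dot(np.dot(xtx_inv, meat), xtx_inv)
assert np.allclose(np.sqrt(np.diag(hc0_cov)), res_h.HC0_se)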
# generate dataset
nsample = 50
x1 = np.linspace(0, 20, nsample)
X = np.c_[x1, (x1 - 5)**2, np.ones(nsample)]
np.random.seed(0)  #9876789) #9876543)
beta = [0.5, -0.01, 5.]
y_true2 = np.dot(X, beta)
w = np.ones(nsample)
w[nsample * 6 / 10:] = 3
sig = 0.5
y2 = y_true2 + sig * w * np.random.normal(size=nsample)
X2 = X[:, [0, 2]]

# estimate OLS, WLS, (OLS not used in these tests)
res2 = sm.OLS(y2, X2).fit()
res3 = sm.WLS(y2, X2, 1. / w).fit()

# direct calculation
covb = res3.cov_params()
predvar = res3.mse_resid * w + (X2 * np.dot(covb, X2.T).T).sum(1)
predstd = np.sqrt(predvar)

prstd, iv_l, iv_u = wls_prediction_std(res3)
np.testing.assert_almost_equal(predstd, prstd, 15)

# testing shapes of exog
prstd, iv_l, iv_u = wls_prediction_std(res3, X2[-1:, :], weights=3.)
np.testing.assert_equal(prstd[-1], prstd)
prstd, iv_l, iv_u = wls_prediction_std(res3, X2[-1, :], weights=3.)
np.testing.assert_equal(prstd[-1], prstd)
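# For reference (a sketch reusing the names defined just above): the direct
# calculation decomposes the prediction variance per observation into
#     mse_resid / weight_i  +  x_i' Cov(beta_hat) x_i,
# i.e. noise variance (the fit used weights=1/w, so this term is
# mse_resid * w) plus the uncertainty of the estimated mean.  The einsum
# below is an equivalent way of writing the quadratic-form term.
predvar_alt = res3.mse_resid * w + np.einsum('ij,jk,ik->i', X2, covb, X2)
np.testing.assert_almost_equal(predvar_alt, predvar)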
def age_model(indices):
    return sm.WLS(logwage[indices], age_design(indices),
                  weights=w[indices])
print len(indf)
print len(indm)

#With each of these models, one would typically run further commands
#to look more closely at the model, like summary(),
#anova for the model on its own or between two models to see
#how much additional explanatory power you get with the added
#variables, and plots to look at residuals, qqplot, and hist of residuals.
#Currently can't do anova or lowess in python, and the qqplots are annoying
#to make.

#Initial model, only look at log(hrwage)~sex
X1 = hrdat['sex'] == 2
X1 = sm.add_constant(X1, prepend=True)
model1 = sm.WLS(np.log(hrdat['hrwage']), X1, weights=hrdat['A_ERNLWT'])
results1 = model1.fit()
print results1.summary()

#More complicated model, log(hrwage)~sex+educ+age+PTFT
n = len(hrdat)
logwage = np.log(hrdat['hrwage'])
w = hrdat['A_ERNLWT']
X2 = np.hstack((sm.categorical(hrdat['sex'])[:, 2:],
                sm.categorical(hrdat['educ'])[:, 2:],
                hrdat['age'].reshape(n, 1),