def test_fixed_scale(self):
    cov_type = 'fixed_scale'
    kwds = {}
    res1 = self.res_ols.get_robustcov_results(cov_type, **kwds)
    res2 = self.res_wls.get_robustcov_results(cov_type, **kwds)
    assert_allclose(res1.params, res2.params, rtol=1e-13)
    assert_allclose(res1.cov_params(), res2.cov_params(), rtol=1e-13)
    assert_allclose(res1.bse, res2.bse, rtol=1e-13)
    assert_allclose(res1.pvalues, res2.pvalues, rtol=1e-12)

    tt = res2.t_test(np.eye(len(res2.params)),
                     cov_p=res2.normalized_cov_params)
    assert_allclose(res2.cov_params(), res2.normalized_cov_params,
                    rtol=1e-13)
    assert_allclose(res2.bse, tt.sd, rtol=1e-13)
    assert_allclose(res2.pvalues, tt.pvalue, rtol=1e-13)
    assert_allclose(res2.tvalues, tt.tvalue, rtol=1e-13)

    # using cov_type in fit
    mod = self.res_wls.model
    mod3 = WLS(mod.endog, mod.exog, weights=mod.weights)
    res3 = mod3.fit(cov_type=cov_type, cov_kwds=kwds)

    tt = res3.t_test(np.eye(len(res3.params)),
                     cov_p=res3.normalized_cov_params)
    assert_allclose(res3.cov_params(), res3.normalized_cov_params,
                    rtol=1e-13)
    assert_allclose(res3.bse, tt.sd, rtol=1e-13)
    assert_allclose(res3.pvalues, tt.pvalue, rtol=1e-13)
    assert_allclose(res3.tvalues, tt.tvalue, rtol=1e-13)
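# Editor's note (hedged): with cov_type='fixed_scale' the parameter
# covariance is not rescaled by the estimated error variance; it is
#     Cov(beta_hat) = scale * (X' W X)^{-1}
# with a user-supplied fixed scale (default 1), which is why the assertions
# above expect cov_params() to equal normalized_cov_params exactly.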
def test_regularized_weights(self):
    np.random.seed(1432)
    exog1 = np.random.normal(size=(100, 3))
    endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
    exog2 = np.random.normal(size=(100, 3))
    endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

    exog_a = np.vstack((exog1, exog1, exog2))
    endog_a = np.concatenate((endog1, endog1, endog2))

    # Should be equivalent to exog_a, endog_a.
    exog_b = np.vstack((exog1, exog2))
    endog_b = np.concatenate((endog1, endog2))
    wgts = np.ones(200)
    wgts[0:100] = 2
    sigma = np.diag(1 / wgts)

    for L1_wt in 0, 0.5, 1:
        for alpha in 0, 1:
            mod1 = OLS(endog_a, exog_a)
            rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            mod2 = WLS(endog_b, exog_b, weights=wgts)
            rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            mod3 = GLS(endog_b, exog_b, sigma=sigma)
            rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

            assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
            assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
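# Why the three fits above agree (a sketch of the algebra, not part of the
# test): the WLS objective is
#     sum_i w_i * (y_i - x_i' beta)^2,
# so giving an observation weight 2 contributes the same term as stacking
# that row twice in an unweighted OLS, and GLS with a diagonal
# sigma = diag(1 / w) reduces to WLS with weights w.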
def setup_class(cls):
    nobs, k_exog = 100, 5
    np.random.seed(987125)
    x = np.random.randn(nobs, k_exog - 1)
    x = add_constant(x)
    cls.aweights = np.random.randint(1, 10, nobs)

    y_true = x.sum(1) / 2
    y = y_true + 2 * np.random.randn(nobs)
    cls.endog = y
    cls.exog = x
    cls.idx_uc = [0, 2, 3, 4]
    cls.idx_p_uc = np.array(cls.idx_uc)
    cls.idx_c = [1]
    cls.exogc = xc = x[:, cls.idx_uc]
    mod_ols_c = WLS(y - 0.5 * x[:, 1], xc, weights=cls.aweights)
    mod_ols_c.exog_names[:] = ['const', 'x2', 'x3', 'x4']
    cls.mod2 = mod_ols_c
    cls.init()
def setup_class(cls):
    # from example wls.py
    nsample = 50
    x = np.linspace(0, 20, nsample)
    X = np.column_stack((x, (x - 5)**2))
    from statsmodels.tools.tools import add_constant
    X = add_constant(X)
    beta = [5., 0.5, -0.01]
    sig = 0.5
    w = np.ones(nsample)
    w[int(nsample * 6. / 10):] = 3
    y_true = np.dot(X, beta)
    e = np.random.normal(size=nsample)
    y = y_true + sig * w * e
    X = X[:, [0, 1]]

    # ### WLS knowing the true variance ratio of heteroscedasticity
    mod_wls = WLS(y, X, weights=1. / w)
    cls.res_wls = mod_wls.fit()
error = y - y_pred
mse = (error * error).mean()
print(mse)
res_ols = OLS(y, exog[:, :3]).fit()
print(np.squeeze(pmod.coef) - res_ols.params)

weights = np.ones(nobs)
weights[:nobs // 3] = 0.1
weights[-nobs // 5:] = 2
pmodw = smoothers.PolySmoother(2, x)
pmodw.fit(y, weights=weights)  # no return
y_predw = pmodw.predict(x)
error = y - y_predw
mse = (error * error).mean()
print(mse)
res_wls = WLS(y, exog[:, :3], weights=weights).fit()
print(np.squeeze(pmodw.coef) - res_wls.params)

doplot = 1
if doplot:
    import matplotlib.pyplot as plt
    plt.plot(y, '.')
    plt.plot(y_true, 'b-', label='true')
    plt.plot(y_pred, '-', label='poly')
    plt.plot(y_predw, '-', label='poly -w')
    plt.legend(loc='upper left')
    plt.close()
    # plt.show()
def test_predict_se():
    # this test does not use reference values
    # checks consistency across options, and compares to direct calculation

    # generate dataset
    nsample = 50
    x1 = np.linspace(0, 20, nsample)
    x = np.c_[x1, (x1 - 5)**2, np.ones(nsample)]
    np.random.seed(0)  # 9876789) #9876543)
    beta = [0.5, -0.01, 5.]
    y_true2 = np.dot(x, beta)
    w = np.ones(nsample)
    w[int(nsample * 6. / 10):] = 3
    sig = 0.5
    y2 = y_true2 + sig * w * np.random.normal(size=nsample)
    x2 = x[:, [0, 2]]

    # estimate OLS
    res2 = OLS(y2, x2).fit()

    # direct calculation
    covb = res2.cov_params()
    predvar = res2.mse_resid + (x2 * np.dot(covb, x2.T).T).sum(1)
    predstd = np.sqrt(predvar)

    prstd, iv_l, iv_u = wls_prediction_std(res2)
    np.testing.assert_almost_equal(prstd, predstd, 15)

    # q = stats.t.isf(0.05 / 2., 50 - 2)
    q = 2.0106347546964458
    ci_half = q * predstd
    np.testing.assert_allclose(iv_u, res2.fittedvalues + ci_half, rtol=1e-12)
    np.testing.assert_allclose(iv_l, res2.fittedvalues - ci_half, rtol=1e-12)

    prstd, iv_l, iv_u = wls_prediction_std(res2, x2[:3, :])
    np.testing.assert_equal(prstd, prstd[:3])
    np.testing.assert_allclose(iv_u, res2.fittedvalues[:3] + ci_half[:3],
                               rtol=1e-12)
    np.testing.assert_allclose(iv_l, res2.fittedvalues[:3] - ci_half[:3],
                               rtol=1e-12)

    # check WLS
    res3 = WLS(y2, x2, 1. / w).fit()

    # direct calculation
    covb = res3.cov_params()
    predvar = res3.mse_resid * w + (x2 * np.dot(covb, x2.T).T).sum(1)
    predstd = np.sqrt(predvar)

    prstd, iv_l, iv_u = wls_prediction_std(res3)
    np.testing.assert_almost_equal(prstd, predstd, 15)

    # q = stats.t.isf(0.05 / 2., 50 - 2)
    q = 2.0106347546964458
    ci_half = q * predstd
    np.testing.assert_allclose(iv_u, res3.fittedvalues + ci_half, rtol=1e-12)
    np.testing.assert_allclose(iv_l, res3.fittedvalues - ci_half, rtol=1e-12)

    # testing shapes of exog
    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-1:, :], weights=3.)
    np.testing.assert_equal(prstd, prstd[-1])
    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-1, :], weights=3.)
    np.testing.assert_equal(prstd, prstd[-1])
    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-2:, :], weights=3.)
    np.testing.assert_equal(prstd, prstd[-2:])
    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-2:, :], weights=[3, 3])
    np.testing.assert_equal(prstd, prstd[-2:])

    prstd, iv_l, iv_u = wls_prediction_std(res3, x2[:3, :])
    np.testing.assert_equal(prstd, prstd[:3])
    np.testing.assert_allclose(iv_u, res3.fittedvalues[:3] + ci_half[:3],
                               rtol=1e-12)
    np.testing.assert_allclose(iv_l, res3.fittedvalues[:3] - ci_half[:3],
                               rtol=1e-12)

    # use wrong size for exog
    # prstd, iv_l, iv_u = wls_prediction_std(res3, x2[-1,0], weights=3.)
    np.testing.assert_raises(ValueError, wls_prediction_std, res3,
                             x2[-1, 0], weights=3.)

    # check some weight values
    sew1 = wls_prediction_std(res3, x2[-3:, :])[0]**2
    for wv in np.linspace(0.5, 3, 5):
        sew = wls_prediction_std(res3, x2[-3:, :], weights=1. / wv)[0]**2
        np.testing.assert_allclose(sew, sew1 + res3.scale * (wv - 1))
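# The "direct calculation" above implements the standard prediction-variance
# formula for a new observation with variance weight w_new:
#     Var(y_new - yhat_new) = sigma2_hat * w_new + x_new' Cov(beta_hat) x_new
# where sigma2_hat is mse_resid; for OLS, w_new = 1, which is the first case
# checked in this test.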
def fitjoint(self):
    '''fit a joint fixed effects model to all observations

    The regression results are attached as `lsjoint`.

    The contrasts for overall and pairwise tests for equality of
    coefficients are attached as a dictionary `contrasts`. This also
    includes the contrasts for the test that the coefficients of a level
    are zero. ::

    >>> res.contrasts.keys()
    [(0, 1), 1, 'all', 3, (1, 2), 2, (1, 3), (2, 3), (0, 3), (0, 2)]

    The keys are based on the original names or labels of the groups.

    TODO: keys can be numpy scalars and then the keys cannot be sorted

    '''
    if not hasattr(self, 'weights'):
        self.fitbygroups()
    groupdummy = (self.groupsint[:, None] == self.uniqueint).astype(int)
    # order of dummy variables by variable - not used
    # dummyexog = self.exog[:,:,None]*groupdummy[:,None,1:]
    # order of dummy variables by groups - used
    dummyexog = self.exog[:, None, :] * groupdummy[:, 1:, None]
    exog = np.c_[self.exog,
                 dummyexog.reshape(self.exog.shape[0], -1)]  # self.nobs ??
    # Notes: I changed to drop first group from dummy
    # instead I want one full set of dummies
    if self.het:
        weights = self.weights
        res = WLS(self.endog, exog, weights=weights).fit()
    else:
        res = OLS(self.endog, exog).fit()
    self.lsjoint = res
    contrasts = {}
    nvars = self.exog.shape[1]
    nparams = exog.shape[1]
    ndummies = nparams - nvars
    contrasts['all'] = np.c_[np.zeros((ndummies, nvars)), np.eye(ndummies)]
    for groupind, group in enumerate(self.unique[1:]):
        # need enumerate if groups != groupsint
        groupind = groupind + 1
        contr = np.zeros((nvars, nparams))
        contr[:, nvars * groupind:nvars * (groupind + 1)] = np.eye(nvars)
        contrasts[group] = contr
        # save also for pairs, see next
        contrasts[(self.unique[0], group)] = contr

    # Note: I'm keeping some duplication for testing
    pairs = np.triu_indices(len(self.unique), 1)
    for ind1, ind2 in zip(*pairs):
        # replace with group1, group2 in sorted(keys)
        if ind1 == 0:
            # need comparison with benchmark/normalization group separately
            continue
        g1 = self.unique[ind1]
        g2 = self.unique[ind2]
        group = (g1, g2)
        contr = np.zeros((nvars, nparams))
        contr[:, nvars * ind1:nvars * (ind1 + 1)] = np.eye(nvars)
        contr[:, nvars * ind2:nvars * (ind2 + 1)] = -np.eye(nvars)
        contrasts[group] = contr

    self.contrasts = contrasts
def _engine_factory(self, fy, X, check_integrity=True):
    if self.use_weighted_fit:
        return WLS(fy, X, weights=self._get_weights())
    else:
        return OLS(fy, X)
def gates(y, d, prop, s_hat, q=10, print_table=True):
    """Calculate Group Average Treatment Effect

    Parameters
    ----------
    y : ndarray
        vector of outcomes
    d : ndarray
        treatment indicator
    prop : ndarray
        treatment propensity
    s_hat : ndarray
        estimated treatment effect
    q : int, optional
        number of groups, by default 10
    print_table : bool, optional
        toggle results table, by default True

    Returns
    -------
    dict
        results with baseline and treatment effect for each group
    """
    # Define groups; add tiny noise to s_hat to break ties
    bin_indices, bin_edges, bin_pct = quantile_grid(
        x=s_hat + 1e-16 * np.random.uniform(size=len(s_hat)), q=q
    )

    # Dummy coding
    s_onehot = np.zeros((len(s_hat), len(bin_edges)))
    s_onehot[np.arange(0, len(s_hat)), bin_indices] = 1

    # Calculate model matrix
    x_reg = np.column_stack(
        (s_onehot, s_onehot * np.reshape(d - prop, newshape=(-1, 1)))
    )
    w_reg = (prop * (1 - prop)) ** (-1)  # inverse-variance weights
    y_reg = y

    # Run weighted least squares
    labels_baseline = [
        f"Baseline: p={p / 100:.2f} ({x:.2f})"
        for p, x in zip(bin_pct.tolist(), bin_edges.tolist())
    ]
    labels_treatment = [
        f"Treatment: p={p / 100:.2f} ({x:.2f})"
        for p, x in zip(bin_pct.tolist(), bin_edges.tolist())
    ]
    labels = labels_baseline + labels_treatment
    # note: the weights keyword is `weights`; passing `w=` would leave the
    # regression unweighted
    wls = WLS(endog=y_reg, exog=x_reg, weights=w_reg)
    wls = wls.fit()
    if print_table:
        print(wls.summary(xname=labels))
    return {
        "coef_baseline": wls.params[:len(labels_baseline)],
        "coef_treatment": wls.params[len(labels_baseline):],
        "bin_values": bin_edges,
        "bin_count": np.sum(s_onehot, axis=0),
    }
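# For reference (hedged, read off how x_reg and w_reg are built above): the
# GATES regression estimates, per quantile group G_k of the predicted effect,
#     y_i = sum_k alpha_k 1[i in G_k]
#           + sum_k beta_k 1[i in G_k] * (d_i - p(x_i)) + u_i,
# weighted by 1 / (p(x_i) * (1 - p(x_i))), so beta_k is the average treatment
# effect within group k and alpha_k its baseline level.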
def setup_class(cls):
    from statsmodels.datasets.ccard import load
    data = load(as_pandas=False)
    cls.res1 = WLS(data.endog, data.exog,
                   weights=1 / data.exog[:, 2]).fit()
    cls.res2 = GLS(data.endog, data.exog, sigma=data.exog[:, 2]).fit()
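# A minimal standalone sketch (not part of the fixture) of the equivalence
# this test class relies on: GLS with an n-length sigma treats it as the
# diagonal of the error covariance, which matches WLS with weights = 1/sigma.
import numpy as np
from statsmodels.regression.linear_model import GLS, WLS

rng = np.random.RandomState(12345)
exog = np.column_stack((np.ones(50), rng.standard_normal(50)))
sigma = rng.uniform(0.5, 2.0, size=50)  # heteroscedastic error variances
endog = exog @ np.array([1.0, 2.0]) + np.sqrt(sigma) * rng.standard_normal(50)

res_wls = WLS(endog, exog, weights=1.0 / sigma).fit()
res_gls = GLS(endog, exog, sigma=sigma).fit()
np.testing.assert_allclose(res_wls.params, res_gls.params, rtol=1e-10)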
def setup_class(cls):
    cls.exog = np.ones((1,))
    cls.endog = np.ones((1,))
    weights = 1
    cls.wls_res = WLS(cls.endog, cls.exog, weights=weights).fit()
def mr_presso(self, n_sims=1000, significance_thresh=0.05):
    """
    Python reimplementation of MR-PRESSO.

    :param n_sims: number of permutation simulations.
    :param significance_thresh: significance threshold.
    :return: beta, se and p value of the estimate after the bad SNPs were
             removed. If no estimate can be made, returns a tuple of three
             np.nan values.
    """

    def make_random_data():
        beta_ivw, _, _ = self.do_ivw_estimation()
        random_exposure = np.random.normal(
            [x[0] for x in self.exposure_tuples],
            [x[1] for x in self.exposure_tuples])
        random_outcome = np.random.normal(
            [beta_ivw * x[0] for x in self.exposure_tuples],
            [x[1] for x in self.outcome_tuples])
        mr_estimates = np.zeros((len(random_outcome), 3), dtype=float)
        for i in range(len(random_outcome)):
            mr_estimates[i, :] = self.do_single_term_mr_estimate(
                (random_exposure[i], self.exposure_tuples[i][1]),
                (random_outcome[i], self.outcome_tuples[i][1]))
        return random_exposure, random_outcome, mr_estimates

    def leave_one_out_residual_sum_of_squares(estimation_data,
                                              weighted_outcome,
                                              weighted_exposure):
        estimation_data = np.asarray(estimation_data)
        leave_one_out_ivw = np.zeros(shape=(len(estimation_data), 3),
                                     dtype=float)
        for i in range(len(estimation_data)):
            leave_one_out_ivw[i, :] = \
                self.do_ivw_estimation_on_estimate_vector(
                    np.delete(estimation_data, i, 0))
        rss = (weighted_outcome -
               leave_one_out_ivw[:, 0] * weighted_exposure) ** 2
        return rss, leave_one_out_ivw

    def make_random_data_and_return_rss(weights):
        exposure, outcome, mr_estimates = make_random_data()
        weighted_exposure = exposure * weights
        weighted_outcome = outcome * weights
        rss, _ = leave_one_out_residual_sum_of_squares(
            mr_estimates, weighted_outcome, weighted_exposure)
        return np.sum(rss), np.concatenate(
            (exposure.reshape(len(exposure), 1),
             outcome.reshape(len(outcome), 1)),
            axis=1)

    def randomly_sample_distortion(outlier_indices):
        estimates = np.asarray(self.estimation_data)
        estimates_no_outliers = np.delete(estimates, outlier_indices, axis=0)
        estimates_only_outliers = estimates[outlier_indices, :][0]
        indices_sampled_from_no_outliers = np.random.choice(
            estimates_no_outliers.shape[0],
            size=estimates_no_outliers.shape[0],
            replace=True)
        return self.do_ivw_estimation_on_estimate_vector(
            np.concatenate((
                estimates_no_outliers[indices_sampled_from_no_outliers, :],
                estimates_only_outliers)))

    # runtime checks.
    num_estimates = len(self.estimation_data)
    if num_estimates < 3:
        raise ValueError(
            "Only {} estimates supplied, need at least three to find "
            "MR-PRESSO outliers".format(num_estimates))
    if len(self.exposure_tuples) != num_estimates:
        raise ValueError(
            "No exposure sumstats present, cannot do mr_presso outlier.")
    if len(self.outcome_tuples) != num_estimates:
        raise ValueError(
            "No outcome sumstats present, cannot do mr_presso outlier.")

    # this is just following MR-PRESSO.
    outcome = np.asarray(self.outcome_tuples, dtype=float)
    exposure = np.asarray(self.exposure_tuples, dtype=float)

    weighted_outcome = np.asarray(
        [x[0] / np.sqrt(x[1]**2) for x in self.outcome_tuples], dtype=float)
    weighted_exposure = np.asarray([
        self.exposure_tuples[i][0] / np.sqrt(self.outcome_tuples[i][1]**2)
        for i in range(len(self.exposure_tuples))
    ], dtype=float)
    weights = np.asarray(
        [1 / np.sqrt(x[1]**2) for x in self.outcome_tuples], dtype=float)

    rss, list_of_assocs = leave_one_out_residual_sum_of_squares(
        self.estimation_data, weighted_outcome, weighted_exposure)

    expected_results = [
        make_random_data_and_return_rss(weights) for _ in range(n_sims)
    ]
    sim_rss = [x[0] for x in expected_results]

    # empirical p value: fraction of simulated RSS exceeding the observed RSS
    global_p_val = np.sum(np.asarray(sim_rss) > np.sum(rss)) / n_sims

    local_p_val = None
    if global_p_val < significance_thresh:
        expected_betas = np.zeros((num_estimates, n_sims, 2), dtype=float)
        for i in range(n_sims):
            expected_betas[:, i] = expected_results[i][1]

        difference = outcome[:, 0] - exposure[:, 0] * list_of_assocs[:, 0]
        expected_difference = (
            expected_betas[:, :, 1] -
            expected_betas[:, :, 0] *
            np.tile(list_of_assocs[:, 0], (n_sims, 1)).transpose())

        local_p_val = np.sum(
            expected_difference**2 >
            (difference**2).reshape((len(difference), 1)),
            axis=1) / n_sims
        # Bonferroni correction, capped at 1.0
        local_p_val = np.asarray([
            min(x * len(difference), 1.0) for x in local_p_val
        ])

    # distortion test.
    outlier_corrected_ivw_result = (np.nan, np.nan, np.nan)
    if local_p_val is not None and np.sum(local_p_val < significance_thresh):
        outliers = local_p_val < significance_thresh
        exposure_betas = [
            self.exposure_tuples[i][0]
            for i in range(len(self.estimation_data)) if not outliers[i]
        ]
        outcome_betas = [
            self.outcome_tuples[i][0]
            for i in range(len(self.estimation_data)) if not outliers[i]
        ]
        weights = [
            1 / self.outcome_tuples[i][1]**2
            for i in range(len(self.estimation_data)) if not outliers[i]
        ]
        outlier_corrected_ivw_result = WLS(
            endog=outcome_betas, exog=exposure_betas,
            weights=weights).fit()
        return (outlier_corrected_ivw_result.params[0],
                outlier_corrected_ivw_result.bse[0],
                outlier_corrected_ivw_result.pvalues[0])
    else:
        return outlier_corrected_ivw_result
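# Note (hedged): the final WLS fit above, outcome betas regressed on exposure
# betas without a constant and with weights 1/se(b_Y)^2, reproduces the
# classical inverse-variance-weighted (IVW) estimator
#     beta_IVW = sum_j w_j b_Xj b_Yj / sum_j w_j b_Xj^2,  w_j = se(b_Yj)^-2,
# here computed on the outlier-free subset.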
class LinearRegression(object):
    '''
    Patsy wrapper for linear estimation and prediction. Uses statsmodels
    WLS to allow weights. If no weights are provided, results are
    equivalent to OLS.
    '''

    def __init__(self, formula=None, data=None, **kwargs):
        # convert all variables raised to a power to float64
        # this prevents mis-specification of probabilities in cases of
        # variable overflow (if the original var was compressed to a
        # smaller bit integer/float)
        if type(data) == pd.DataFrame:
            power_vars = list(set(re.findall(r'(?<=power\().+?(?=,)',
                                             formula)))
            for var in power_vars:
                data[var] = data[var].astype('float64')
        if formula:
            y, X = patsy.dmatrices(formula, data, 1)
            self._y_design_info = y.design_info
            self._X_design_info = X.design_info
            self._model = WLS(y, X, **kwargs)
            self._fit = self._model.fit()
            self._betas = self._fit.params
            self._std = np.std(data[self._model.data.ynames].values -
                               self.predict(data))
            self._r2 = self._fit.rsquared
            self._r2_adj = self._fit.rsquared_adj
        else:
            self._y_design_info = None
            self._X_design_info = None
            self._model = None
            self._fit = None
            self._betas = None
            self._std = None
            self._r2 = None
            self._r2_adj = None

    def __repr__(self):
        return str(self._fit.summary())

    def predict(self, data):
        '''
        Returns fitted values for the data provided.
        '''
        if len(data) == 0:
            return []
        # identifies exponential variables from the design matrix (via the
        # 'power' flag) and converts to float64; this prevents
        # mis-specification of probabilities in cases of variable overflow
        # (if the original var was compressed to a smaller bit
        # integer/float)
        power_vars = list(set([
            re.search(r'(?<=power\().+?(?=,)', column).group()
            for column in self._X_design_info.column_names
            if 'power' in column
        ]))
        for var in power_vars:
            data[var] = data[var].astype('float64')
        (X, ) = patsy.build_design_matrices([self._X_design_info], data)
        return linear_transform(np.asarray(X), self._betas)

    def residuals(self, data):
        '''
        Returns residuals from fitting the model to the data provided.
        '''
        if len(data) == 0:
            return []
        return data[self._model.data.ynames].values - self.predict(data)

    def draw(self, data, rand_engine):
        '''
        Returns fitted values for the data provided plus a random draw from
        a normal distribution with the regression standard error.
        '''
        return self.predict(data) + rand_engine.normal(0, self._std,
                                                       len(data))

    def Rsquared(self, adjusted=True):
        '''
        Returns the model's adjusted R squared. To return unadjusted
        R squared, pass adjusted=False.
        '''
        if adjusted:
            return self._r2_adj
        else:
            return self._r2

    def to_pickle(self, filename):
        '''
        Writes basic model information to a pickle file.
        '''
        pickle.dump((self._y_design_info, self._X_design_info, self._betas,
                     self._std, self._r2, self._r2_adj),
                    open(filename, "wb"))

    @staticmethod
    def read_pickle(filename):
        '''
        Reads basic model information from a pickle file. Returns a
        LinearRegression object that does not include the model summary or
        fit object but can execute all class functions.
        '''
        y_design_info, X_design_info, betas, std, r2, r2_adj = pickle.load(
            open(filename, "rb"))
        linear_regression = LinearRegression()
        linear_regression._y_design_info = y_design_info
        linear_regression._X_design_info = X_design_info
        linear_regression._betas = betas
        linear_regression._std = std
        linear_regression._r2 = r2
        linear_regression._r2_adj = r2_adj
        return linear_regression

    def __add__(self, other):
        ret = copy(self)
        ret._betas = self._betas + other._betas
        return ret

    def __sub__(self, other):
        ret = copy(self)
        ret._betas = self._betas - other._betas
        return ret

    def __mul__(self, other):
        ret = copy(self)
        ret._betas = ret._betas * other
        return ret
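# Hypothetical usage sketch for the wrapper above; the frame, the column
# names ('y', 'x1'), and the weights are invented for illustration.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'x1': rng.standard_normal(100)})
df['y'] = 1.0 + 2.0 * df['x1'] + rng.standard_normal(100)

# extra keyword arguments are forwarded to WLS, so weights pass through here
model = LinearRegression('y ~ x1', df, weights=np.ones(100))
fitted = model.predict(df)
resid = model.residuals(df)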
def _engine_factory(self, fy, X):
    if self.use_weighted_fit:
        return WLS(fy, X, weights=self._get_weights())
    else:
        return OLS(fy, X)
def fit_pval_model(
    quantiles: pd.DataFrame,
    small_order: int = 3,
    use_log: bool = False,
    drop_insignif: bool = True,
) -> PvalueResult:
    if small_order not in (3, 4):
        raise ValueError("Small order must be 3 or 4")
    quantiles = quantiles.sort_index(ascending=False)
    percentiles = quantiles.index.to_numpy()
    lhs = stats.norm.ppf(percentiles)
    data = np.asarray(quantiles)
    avg_test_stats = data.mean(1)
    avg_test_std = data.std(1)
    avg_test_stats = avg_test_stats[:, None]

    rhs = avg_test_stats ** np.arange(4)
    rhs_large = rhs
    rhs_log = np.log(np.abs(avg_test_stats)) ** np.arange(4)
    lhs_large = lhs
    res_large = WLS(lhs_large, rhs, weights=1.0 / avg_test_std).fit()
    temp = res_large.params.copy()
    if drop_insignif:
        temp[res_large.pvalues > 0.05] = 0.0
    large_p = temp

    # Compute tau_max by finding the maximum of the fitted polynomial
    p = res_large.params
    poly_roots = np.roots(np.array([3, 2, 1.0]) * p[:0:-1])
    if np.isreal(poly_roots[0]):
        tau_max = float(np.squeeze(np.real(np.max(poly_roots))))
    else:
        tau_max = np.inf

    # Small-p regression using only percentiles <= 15%
    cutoff = np.where(percentiles <= 0.150)[0]
    lhs_small = lhs[cutoff]
    if use_log:
        avg_test_stats = np.log(np.abs(avg_test_stats[cutoff]))
        avg_test_std = np.log(np.abs(data[cutoff])).std(1)
        assert np.all(np.isfinite(avg_test_std))
        rhs = avg_test_stats ** np.arange(small_order)
    else:
        avg_test_stats = avg_test_stats[cutoff]
        avg_test_std = avg_test_std[cutoff]
        rhs = avg_test_stats ** np.arange(small_order)

    res_small = WLS(lhs_small, rhs, weights=1.0 / avg_test_std).fit()
    # copy before zeroing, so the results instance is left untouched
    temp = res_small.params.copy()
    if drop_insignif:
        temp[res_small.pvalues > 0.05] = 0.0
    small_p = temp

    # Compute tau_star
    err_large = lhs_large - rhs_large.dot(large_p)
    params = small_p.copy()
    if small_order == 3:
        # Missing 1 parameter here, replace with 0
        params = np.append(params, 0.0)
    if use_log:
        pred_small = rhs_log.dot(params)
    else:
        pred_small = rhs_large.dot(params)
    err_small = lhs_large - pred_small
    # Find the location that minimizes the total absolute error
    m = lhs_large.shape[0]
    abs_err = np.zeros((m, 1))
    for j in range(m):
        abs_err[j] = np.abs(err_large[:j]).sum() + np.abs(err_small[j:]).sum()
    loc = np.argmin(abs_err)
    tau_star = rhs_large[loc, 1]
    if use_log:
        assert tau_star < 0

    # Compute tau_min
    tau_min = -params[1] / (2 * params[2])
    if use_log:
        assert small_order == 4
        assert params[2] * params[3] < 0
        tau_min = -np.inf

    large_p = [round(val, 5) for val in large_p]
    small_p = [round(val, 5) for val in small_p]
    tau_max = round(tau_max, 5)
    tau_star = round(tau_star, 5)
    tau_min = round(tau_min, 5)
    return PvalueResult(large_p, small_p, tau_max, tau_star, tau_min)
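# The two WLS fits above implement a response-surface regression: the normal
# quantile of each tail percentile is approximated by a polynomial in the
# simulated mean test statistic,
#     Phi^{-1}(p) ~= g0 + g1*tau + g2*tau^2 + g3*tau^3,
# weighted by the inverse of the cross-simulation standard deviation. The
# "small" fit refits the lower tail (p <= 15%), where the approximation
# needs to be most accurate.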
class LinearRegression(object): """Patsy wrapper for linear estimation and prediction. """ def __init__(self, formula=None, data=None, **kwargs): if formula: y, X = patsy.dmatrices(formula, data, 1) self._y_design_info = y.design_info self._X_design_info = X.design_info self._model = WLS(y, X, **kwargs) self._fit = self._model.fit() self._betas = self._fit.params self._std = numpy.std(data[self._model.data.ynames].values - self.predict(data)) else: self._y_design_info = None self._X_design_info = None self._model = None self._fit = None self._betas = None self._std = None def __repr__(self): return str(self._fit.summary()) def predict(self, data): if len(data) == 0: return [] (X, ) = patsy.build_design_matrices([self._X_design_info], data) return linear_transform(numpy.asarray(X), self._betas) def draw(self, data, rand_engine): return self.predict(data) + rand_engine.normal(0, self._std, len(data)) def to_pickle(self, filename): pickle.dump((self._y_design_info, self._X_design_info, self._betas, self._std), open(filename, "wb")) @staticmethod def read_pickle(filename): y_design_info, X_design_info, betas, std = pickle.load(open(filename, "rb")) linear_regression = LinearRegression() linear_regression._y_design_info = y_design_info linear_regression._X_design_info = X_design_info linear_regression._betas = betas linear_regression._std = std return linear_regression def __add__(self, other): ret = copy(self) ret._betas = self._betas + other._betas return ret def __sub__(self, other): ret = copy(self) ret._betas = self._betas - other._betas return ret def __mul__(self, other): ret = copy(self) ret._betas = ret._betas * other return ret
X = np.column_stack((x, (x - 5)**2))
from statsmodels.tools.tools import add_constant
X = add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
w[int(nsample * 6 / 10):] = 3  # int() needed: float indices raise in Python 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:, [0, 1]]

# ### WLS knowing the true variance ratio of heteroscedasticity
mod_wls = WLS(y, X, weights=1. / w)
res_wls = mod_wls.fit()

prstd, iv_l, iv_u = wls_prediction_std(res_wls)
pred_res = get_prediction(res_wls)
ci = pred_res.conf_int(obs=True)

from numpy.testing import assert_allclose
assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)

print(pred_res.summary_frame().head())

pred_res2 = res_wls.get_prediction()
def do_egger_regression_two_variance_term(self):
    """
    Does Egger regression based on two variance term estimates.

    :return: list of length two, each a tuple of floats: beta, se,
             wald_p_val of the estimate for intercept and slope
             respectively.
    """
    num_estimates = len(self.estimation_data)

    # runtime checks.
    if num_estimates < 3:
        raise ValueError(
            "Only {} estimates supplied, need at least three to estimate "
            "egger".format(num_estimates))
    if len(self.exposure_tuples) != num_estimates:
        raise ValueError(
            "No exposure data present, cannot do Egger regression.")
    if len(self.outcome_tuples) != num_estimates:
        raise ValueError(
            "No outcome data present, cannot do Egger regression.")

    # Now turn exposure into positive values.
    outcome_tuples = copy.deepcopy(self.outcome_tuples)
    exposure_tuples = copy.deepcopy(self.exposure_tuples)

    for i in range(num_estimates):
        if exposure_tuples[i][0] < 0:
            # flip the signs of both betas; the standard errors (second
            # tuple element) stay unchanged.
            exposure_tuples[i] = (-1 * exposure_tuples[i][0],
                                  exposure_tuples[i][1])
            outcome_tuples[i] = (-1 * outcome_tuples[i][0],
                                 outcome_tuples[i][1])

    x_dat = np.asarray([x[0] for x in exposure_tuples])
    x_dat = add_constant(x_dat)
    y_dat = np.asarray([x[0] for x in outcome_tuples])

    # if this value is zero, we add the smallest possible constant, so it
    # can still be used as weights. checked with the 2015 paper introducing
    # MR-Egger, and it works as expected.
    w_dat = np.zeros(len(self.estimation_data))
    for i in range(len(self.estimation_data)):
        w_dat[i] = outcome_tuples[i][0] ** -2 / (
            (outcome_tuples[i][0] ** -2 * outcome_tuples[i][1] ** 2) +
            (exposure_tuples[i][0] ** -2 * exposure_tuples[i][1] ** 2)
        )

    wls_model = WLS(y_dat, x_dat, weights=w_dat)
    results = wls_model.fit()

    self.egger_intercept = (results.params[0], results.bse[0],
                            results.pvalues[0])
    self.egger_slope = (results.params[1], results.bse[1],
                        results.pvalues[1])
    self.egger_done = True

    return self.egger_intercept, self.egger_slope
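# For reference (hedged): the WLS fit above is the MR-Egger model
#     b_Yj = alpha + beta * b_Xj + e_j,
# where a non-zero intercept alpha indicates directional pleiotropy, with
# the two-variance-term weights built in the loop:
#     w_j = b_Yj^-2 / (b_Yj^-2 * se(b_Yj)^2 + b_Xj^-2 * se(b_Xj)^2).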
X = np.column_stack((x, (x - 5)**2))
from statsmodels.tools.tools import add_constant
X = add_constant(X)
beta = [5., 0.5, -0.01]
sig = 0.5
w = np.ones(nsample)
w[int(nsample * 6 / 10):] = 3  # int() needed: float indices raise in Python 3
y_true = np.dot(X, beta)
e = np.random.normal(size=nsample)
y = y_true + sig * w * e
X = X[:, [0, 1]]

# ### WLS knowing the true variance ratio of heteroscedasticity
mod_wls = WLS(y, X, weights=1. / w)
res_wls = mod_wls.fit()

prstd, iv_l, iv_u = wls_prediction_std(res_wls)
pred_res = get_prediction(res_wls)
ci = pred_res.conf_int(obs=True)

from numpy.testing import assert_allclose
assert_allclose(pred_res.se_obs, prstd, rtol=1e-13)
assert_allclose(ci, np.column_stack((iv_l, iv_u)), rtol=1e-13)

print(pred_res.summary_frame().head())

pred_res2 = res_wls.get_prediction()
ci2 = pred_res2.conf_int(obs=True)
def _start_params(self, niter=2, return_intermediate=False):
    """find starting values

    Parameters
    ----------
    niter : int
        Number of iterations of WLS approximation
    return_intermediate : bool
        If False (default), then only the preliminary parameter estimate
        will be returned.
        If True, then also the two results instances of the WLS estimate
        for mean parameters and for the precision parameters will be
        returned.

    Returns
    -------
    sp : ndarray
        start parameters for the optimization
    res_m2 : results instance (optional)
        Results instance for the WLS regression of the mean function.
    res_p2 : results instance (optional)
        Results instance for the WLS regression of the precision function.

    Notes
    -----
    This calculates a few iterations of weighted least squares. This is
    not a full scoring algorithm.
    """
    # WLS of the mean equation uses the implied weights (inverse variance),
    # WLS for the precision equations uses weights that only take
    # account of the link transformation of the precision endog.
    from statsmodels.regression.linear_model import OLS, WLS
    res_m = OLS(self.link(self.endog), self.exog).fit()
    fitted = self.link.inverse(res_m.fittedvalues)
    resid = self.endog - fitted

    prec_i = fitted * (1 - fitted) / np.maximum(np.abs(resid), 1e-2)**2 - 1
    res_p = OLS(self.link_precision(prec_i), self.exog_precision).fit()
    prec_fitted = self.link_precision.inverse(res_p.fittedvalues)
    # sp = np.concatenate((res_m.params, res_p.params))

    for _ in range(niter):
        y_var_inv = (1 + prec_fitted) / (fitted * (1 - fitted))
        # y_var = fitted * (1 - fitted) / (1 + prec_fitted)

        ylink_var_inv = y_var_inv / self.link.deriv(fitted)**2
        res_m2 = WLS(self.link(self.endog), self.exog,
                     weights=ylink_var_inv).fit()
        fitted = self.link.inverse(res_m2.fittedvalues)
        resid2 = self.endog - fitted

        prec_i2 = (fitted * (1 - fitted) /
                   np.maximum(np.abs(resid2), 1e-2)**2 - 1)
        w_p = 1. / self.link_precision.deriv(prec_fitted)**2
        res_p2 = WLS(self.link_precision(prec_i2), self.exog_precision,
                     weights=w_p).fit()
        prec_fitted = self.link_precision.inverse(res_p2.fittedvalues)
        sp2 = np.concatenate((res_m2.params, res_p2.params))

    if return_intermediate:
        return sp2, res_m2, res_p2

    return sp2
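# The mean-equation weights above follow the usual linearization: if g is
# the mean link and Var(y_i) = mu_i * (1 - mu_i) / (1 + phi_i) for a
# beta-distributed response, then Var(g(y_i)) ~= Var(y_i) * g'(mu_i)^2, so
#     weights = 1 / (Var(y_i) * g'(mu_i)^2) = ylink_var_inv.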
def setup_class(cls):
    data = longley.load(as_pandas=False)
    data.exog = add_constant(data.exog, prepend=False)
    cls.res1 = OLS(data.endog, data.exog).fit()
    cls.res2 = WLS(data.endog, data.exog).fit()
def setupClass(cls):
    data = longley.load()
    data.exog = add_constant(data.exog)
    cls.res1 = OLS(data.endog, data.exog).fit()
    cls.res2 = WLS(data.endog, data.exog).fit()
percentiles = data['percentiles']
results = data['results']  # Remove later

# LHS is norm cdf inv of percentiles
lhs = norm().ppf(percentiles / 100.0)
lhs_large = lhs
# RHS is made up of avg test stats for largest T, which is in pos 1
avg_test_stats = results[:, 1, :].mean(axis=1)
avg_test_std = results[:, 1, :].std(axis=1)
avg_test_stats = avg_test_stats[:, None]

m = lhs.shape[0]
rhs = np.ones((m, 1))
rhs = np.hstack((rhs, avg_test_stats))
rhs = np.hstack((rhs, avg_test_stats**2.0))
rhs = np.hstack((rhs, avg_test_stats**3.0))
rhs_large = rhs
res_large = WLS(lhs, rhs, weights=1.0 / avg_test_std).fit()
dfgls_large_p[t] = res_large.params

# Compute tau_max, by finding the func maximum
p = res_large.params
poly_roots = np.roots(np.array([3, 2, 1.0]) * p[:0:-1])
dfgls_tau_max[t] = float(np.squeeze(np.real(np.max(poly_roots))))

# Small p regression using only p<=15%
cutoff = np.where(percentiles <= 15.0)[0]
avg_test_stats = results[cutoff, 1, :].mean(axis=1)
avg_test_std = results[cutoff, 1, :].std(axis=1)
avg_test_stats = avg_test_stats[:, None]
lhs = lhs[cutoff]
m = lhs.shape[0]
rhs = np.ones((m, 1))
rhs = np.hstack((rhs, avg_test_stats))
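# Once the response surface is estimated, a p-value for an observed
# statistic tau can be recovered by evaluating the polynomial inside the
# normal cdf (hedged; this mirrors how the stored dfgls_large_p
# coefficients would be used):
#     p(tau) = Phi(g0 + g1*tau + g2*tau^2 + g3*tau^3).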
def fit(self, x, y1, y2, cens, w, verbose=False):
    """
    Fit a maximum-likelihood Tobit regression

    :param x: Pandas DataFrame (n_samples, n_features): Data
    :param y1: Pandas Series (n_samples,): lower bound of the target
    :param y2: Pandas Series (n_samples,): upper bound of the target
    :param cens: Pandas Series (n_samples,): -1 indicates left-censored
                 samples, 0 for uncensored, 1 for right-censored
    :param w: observation weights
    :param verbose: boolean, show info from minimization
    :return:
    """
    x_copy = x.copy()
    if self.fit_intercept:
        x_copy = np.insert(x_copy, 0, 1, axis=1)
    else:
        x_copy = skl.scale(x_copy, with_mean=True, with_std=False,
                           copy=False)
    # Stata reference:
    # qui gen double `z' = cond(`y1'<.&`y2'<.,(`y1'+`y2')/2, /*
    # */ cond(`y1'<.,`y1',`y2')) `moff' if `doit'
    #
    # Build the working response row-aligned with x_copy, following the
    # Stata reference above: left-censored -> upper bound y2,
    # right-censored -> lower bound y1, uncensored -> interval midpoint.
    # (Concatenating the three groups, as the original loop did, would
    # misalign y with the rows of x unless the data were pre-sorted by
    # censoring status.)
    y1_arr = np.squeeze(np.asarray(y1))
    y2_arr = np.squeeze(np.asarray(y2))
    y = np.empty(len(cens), dtype=float)
    left = np.asarray(cens == -1)
    right = np.asarray(cens == 1)
    uncens = np.asarray(cens == 0)
    y[left] = y2_arr[left]
    y[right] = y1_arr[right]
    y[uncens] = (y1_arr[uncens] + y2_arr[uncens]) / 2

    # WLS fit only supplies starting values for the likelihood optimization
    init_reg = WLS(y, x_copy, weights=w).fit()
    b0 = init_reg.params
    if verbose:
        print(b0)
    y_pred = init_reg.predict(x_copy)
    resid = y - y_pred
    resid_var = np.var(resid)
    s0 = np.sqrt(resid_var)
    params0 = np.append(b0, s0)
    xs, ys, ys1, ys2, ws = split_left_right_censored(x_copy, y1, y2, cens, w)

    result = minimize(
        lambda params: tobit_neg_log_likelihood(xs, ys, ys1, ys2, ws,
                                                params),
        params0,
        jac=None,
        method='Powell',
        tol=0.000001,
        options={'disp': verbose, 'maxiter': 10000000,
                 'fatol': 0.00000001})
    if verbose:
        print(result)
    # self.ols_coef_ = b0[1:]
    # self.ols_intercept = b0[0]
    if self.fit_intercept:
        self.intercept_ = result.x[0]
        self.coef_ = result.x[1:-1]
    else:
        self.coef_ = result.x[:-1]
        self.intercept_ = 0
    self.sigma_ = result.x[-1]
    return self
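# For context (hedged; tobit_neg_log_likelihood is defined elsewhere): the
# censored log-likelihood being minimized is the standard Tobit one, with
# per-observation contributions
#     left-censored:  log Phi((y2_i - x_i'b) / s)
#     right-censored: log(1 - Phi((y1_i - x_i'b) / s))
#     uncensored:     log phi((y_i - x_i'b) / s) - log s,
# summed with the observation weights w; the WLS fit above only provides
# the starting values (b0, s0) for this optimization.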