def test_manova_no_formula_no_hypothesis():
    # Same as previous test only skipping formula interface
    exog = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
    endog = X[['Basal', 'Occ', 'Max']]
    mod = MANOVA(endog, exog)
    r = mod.mv_test()
    assert isinstance(r, MultivariateTestResults)
def __init__(self):
    data = heart.load()
    endog = np.log10(data.endog)
    exog = add_constant(data.exog)
    self.mod1 = emplikeAFT(endog, exog, data.censors)
    self.res1 = self.mod1.fit()
    self.res2 = AFTRes()
def test_forecast(self):
    end = len(self.true['data']['consump']) + 15 - 1
    exog = add_constant(self.true['forecast_data']['m2'])
    assert_almost_equal(
        self.result.predict(end=end, exog=exog)[0],
        self.true['forecast'], 3
    )
def test_multiple_constraints():
    endog = dta['infl']
    exog = add_constant(dta[['m1', 'unemp', 'cpi']])

    constraints = [
        'm1 + unemp = 1',
        'cpi = 0',
    ]

    mod = RecursiveLS(endog, exog, constraints=constraints)
    res = mod.fit()

    # See tests/results/test_rls.do
    desired = [-0.7001083844336, -0.0018477514060, 1.0018477514060, 0]
    assert_allclose(res.params, desired, atol=1e-10)

    # See tests/results/test_rls.do
    desired = [.4699552366, .0005369357, .0005369357, 0]
    assert_allclose(res.bse[0], desired[0], atol=1e-1)
    assert_allclose(res.bse[1:-1], desired[1:-1], atol=1e-4)

    # See tests/results/test_rls.do
    desired = -534.4292052931121
    # Note that to compute what Stata reports as the llf, we need to use a
    # different denominator for estimating the scale, and then compute the
    # llf from the alternative recursive residuals
    scale_alternative = np.sum((
        res.standardized_forecasts_error[0, 1:] *
        res.filter_results.obs_cov[0, 0]**0.5)**2) / mod.nobs
    llf_alternative = np.log(norm.pdf(res.resid_recursive, loc=0,
                                      scale=scale_alternative**0.5)).sum()
    assert_allclose(llf_alternative, desired)
def setup_class(cls):
    data = heart.load()
    endog = np.log10(data.endog)
    exog = add_constant(data.exog)
    cls.mod1 = emplikeAFT(endog, exog, data.censors)
    cls.res1 = cls.mod1.fit()
    cls.res2 = AFTRes()
def test_manova_no_formula():
    # Same as previous test only skipping formula interface
    exog = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
    endog = X[['Basal', 'Occ', 'Max']]
    mod = MANOVA(endog, exog)
    intercept = np.zeros((1, 3))
    intercept[0, 0] = 1
    loc = np.zeros((2, 3))
    loc[0, 1] = loc[1, 2] = 1
    hypotheses = [('Intercept', intercept), ('Loc', loc)]
    r = mod.mv_test(hypotheses)

    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Value'],
                        0.60143661, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Value'],
                        0.44702843, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Value'],
                        0.58210348, decimal=8)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Value'],
                        0.35530890, decimal=8)

    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'F Value'],
                        0.77, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'F Value'],
                        0.86, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'F Value'],
                        0.75, decimal=2)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'F Value'],
                        1.07, decimal=2)

    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Num DF'],
                        6, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Num DF'],
                        3, decimal=3)

    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Den DF'],
                        16, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Den DF'],
                        18, decimal=3)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Den DF'],
                        9.0909, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Den DF'],
                        9, decimal=3)

    assert_almost_equal(r['Loc']['stat'].loc["Wilks' lambda", 'Pr > F'],
                        0.6032, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Pillai's trace", 'Pr > F'],
                        0.5397, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Hotelling-Lawley trace", 'Pr > F'],
                        0.6272, decimal=4)
    assert_almost_equal(r['Loc']['stat'].loc["Roy's greatest root", 'Pr > F'],
                        0.4109, decimal=4)
def test_plots():
    if not have_matplotlib:
        raise SkipTest

    exog = add_constant(dta[['m1', 'pop']])
    mod = RecursiveLS(endog, exog)
    res = mod.fit()

    # Basic plot
    fig = res.plot_recursive_coefficient()
    plt.close(fig)

    # Specific variable
    fig = res.plot_recursive_coefficient(variables=['m1'])
    plt.close(fig)

    # All variables
    fig = res.plot_recursive_coefficient(variables=[0, 'm1', 'pop'])
    plt.close(fig)

    # Basic plot
    fig = res.plot_cusum()
    plt.close(fig)

    # Other alphas
    for alpha in [0.01, 0.10]:
        fig = res.plot_cusum(alpha=alpha)
        plt.close(fig)

    # Invalid alpha
    assert_raises(ValueError, res.plot_cusum, alpha=0.123)

    # Basic plot
    fig = res.plot_cusum_squares()
    plt.close(fig)

    # Numpy input (no dates)
    mod = RecursiveLS(endog.values, exog.values)
    res = mod.fit()

    # Basic plot
    fig = res.plot_recursive_coefficient()
    plt.close(fig)

    # Basic plot
    fig = res.plot_cusum()
    plt.close(fig)

    # Basic plot
    fig = res.plot_cusum_squares()
    plt.close(fig)
def __init__(self):
    # Remove the regression coefficients from the parameters, since they
    # will be estimated as part of the state vector
    true = dict(results_sarimax.friedman2_mle)
    exog = add_constant(true['data']['m2']) / 10.

    true['mle_params_exog'] = true['params_exog'][:]
    true['mle_se_exog'] = true['se_exog_oim'][:]

    true['params_exog'] = []
    true['se_exog'] = []

    super(TestFriedmanStateRegression, self).__init__(
        true, exog=exog, mle_regression=False
    )

    self.result = self.model.filter()
def __init__(self, true, exog=None, *args, **kwargs):
    self.true = true
    endog = np.r_[true['data']['consump']]
    if exog is None:
        exog = add_constant(true['data']['m2'])

    kwargs.setdefault('simple_differencing', True)
    kwargs.setdefault('hamilton_representation', True)

    self.model = sarimax.SARIMAX(
        endog, exog=exog, order=(1, 0, 1), *args, **kwargs
    )

    params = np.r_[true['params_exog'], true['params_ar'],
                   true['params_ma'], true['params_variance']]

    self.model.update(params)
def setup_class(cls):
    path = os.path.join(current_path, 'results', 'mar_filardo.csv')
    cls.mar_filardo = pd.read_csv(path)
    true = {
        'params': np.r_[4.35941747, -1.6493936, 1.7702123, 0.9945672,
                        0.517298, -0.865888, np.exp(-0.362469)**2,
                        0.189474, 0.079344, 0.110944, 0.122251],
        'llf': -586.5718,
        'llf_fit': -586.5718,
        'llf_fit_em': -586.5718
    }
    endog = cls.mar_filardo['dlip'].iloc[1:].values
    exog_tvtp = add_constant(
        cls.mar_filardo['dmdlleading'].iloc[:-1].values)
    super(TestFilardo, cls).setup_class(
        true, endog, k_regimes=2, order=4, switching_ar=False,
        exog_tvtp=exog_tvtp)
def wls(data, use_bayes=False):
    """
    Weighted least squares for peptides in protein.

    Operates on sub data frames.
    """
    # Degenerate case, only one peptide
    if data.shape[0] == 1:
        return wls_degenerate(data)

    y = np.concatenate([data['meanC'].values, data['meanE'].values])
    if use_bayes:
        w = np.concatenate([data['bayesSDC'].values, data['bayesSDE'].values])**2
    else:
        w = np.concatenate([data['stdC'].values, data['stdE'].values])**2
    x = np.ones(data.shape[0] * 2)
    x[:data.shape[0]] = 0

    # WLS with weights equal to the inverse variances; the constant is appended
    # last (prepend=False), so params[0] is the condition effect.
    mod_wls = sm.WLS(y, add_constant(x, prepend=False), weights=1. / w)
    res_wls = mod_wls.fit()
    return (res_wls.params[0], res_wls.bse[0], res_wls.pvalues[0])
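# A hypothetical call of wls() on a toy sub-frame with three peptides (not from the source).
# It assumes numpy as np, pandas as pd, statsmodels.api as sm, and add_constant are imported
# at module level, as the function above already requires.
demo = pd.DataFrame({'meanC': [1.0, 1.2, 0.9], 'meanE': [2.0, 2.1, 1.8],
                     'stdC': [0.1, 0.2, 0.1], 'stdE': [0.1, 0.1, 0.2]})
estimate, stderr, pvalue = wls(demo)
print(estimate)   # weighted estimate of the E-minus-C difference, close to 1.0 for this data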
def _check_constant_params(a, has_const=False, use_const=True, rtol=1e-05,
                           atol=1e-08):
    """Helper for the interaction between the has_const and use_const params.

    has_const   use_const   outcome
    ---------   ---------   -------
    True        True        Confirm that a has constant; return a
    False       False       Confirm that a doesn't have constant; return a
    False       True        Confirm that a doesn't have constant; add constant
    True        False       ValueError
    """
    if all((has_const, use_const)):
        if not _confirm_constant(a):
            raise ValueError('Data does not contain a constant; specify'
                             ' has_const=False')
        k = a.shape[-1] - 1
    elif not any((has_const, use_const)):
        if _confirm_constant(a):
            raise ValueError('Data already contains a constant; specify'
                             ' has_const=True')
        k = a.shape[-1]
    elif not has_const and use_const:
        # Also run a quick check to confirm that `a` is *not* ~N(0,1).
        # In this case, constant should be zero. (exclude it entirely)
        c1 = np.allclose(a.mean(axis=0), b=0., rtol=rtol, atol=atol)
        c2 = np.allclose(a.std(axis=0), b=1., rtol=rtol, atol=atol)
        if c1 and c2:
            # TODO: maybe we want to just warn here?
            raise ValueError('Data appears to be ~N(0,1). Specify'
                             ' use_constant=False.')
        # `has_constant` does checking on its own and raises VE if True
        try:
            a = add_constant(a, has_constant='raise')
        except ValueError as e:
            raise ValueError(
                'X data already contains a constant; please specify'
                ' has_const=True'
            ) from e
        k = a.shape[-1] - 1
    else:
        raise ValueError('`use_const` == False implies has_const is False.')

    return k, a
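# A minimal usage sketch (not from the source). With has_const=False and use_const=True the
# branch taken never touches the unshown `_confirm_constant` helper, so this runs as long as
# numpy (as np) and add_constant are available at module level, which the function assumes.
x_demo = np.random.default_rng(1).normal(loc=3.0, size=(50, 2))   # no constant column, not ~N(0, 1)
k, x_with_const = _check_constant_params(x_demo, has_const=False, use_const=True)
print(k, x_with_const.shape)   # 2 regressors; a constant column has been added -> (50, 3)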
def test_plots(close_figures):
    exog = add_constant(dta[['m1', 'pop']])
    mod = RecursiveLS(endog, exog)
    res = mod.fit()

    # Basic plot
    try:
        from pandas.plotting import register_matplotlib_converters
        register_matplotlib_converters()
    except ImportError:
        pass
    fig = res.plot_recursive_coefficient()

    # Specific variable
    fig = res.plot_recursive_coefficient(variables=['m1'])

    # All variables
    fig = res.plot_recursive_coefficient(variables=[0, 'm1', 'pop'])

    # Basic plot
    fig = res.plot_cusum()

    # Other alphas
    for alpha in [0.01, 0.10]:
        fig = res.plot_cusum(alpha=alpha)

    # Invalid alpha
    assert_raises(ValueError, res.plot_cusum, alpha=0.123)

    # Basic plot
    fig = res.plot_cusum_squares()

    # Numpy input (no dates)
    mod = RecursiveLS(endog.values, exog.values)
    res = mod.fit()

    # Basic plot
    fig = res.plot_recursive_coefficient()

    # Basic plot
    fig = res.plot_cusum()

    # Basic plot
    fig = res.plot_cusum_squares()
def trend(x, time='time', detrend=False):
    """returns trend per year"""
    from statsmodels.tools import add_constant
    year = 3600 * 24 * 365.24  # slope is w.r.t. seconds
    t = add_constant(x[time].values.astype('datetime64[s]').astype(float))
    lsq = np.linalg.lstsq(t, x.squeeze())[0]
    coords = [
        c for c in set(x.coords) - {time}
        if (x.coords[c].shape != ()) and (len(x.coords[c]) > 1)
    ]
    if len(coords) == 1:
        if detrend:
            return xr.DataArray(x - t.dot(lsq),
                                coords=[x.time, x.coords[coords[0]]])
        return xr.DataArray(lsq[1, :], coords=[x.coords[coords[0]]]) * year
    elif len(coords) == 0:
        return x - t.dot(lsq) if detrend else lsq[1] * year
    else:
        raise Exception('more than one additional coordinate')
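# A hypothetical usage of trend() on a synthetic 1-D series; it assumes the module imports
# numpy as np and xarray as xr (both used by the function above) and that pandas is available.
import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2000-01-01', periods=120, freq='MS')
da = xr.DataArray(np.linspace(0.0, 10.0, 120), coords=[('time', times)])
print(trend(da))   # least-squares slope per year; roughly 1.0 for this series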
def backwardElimination(input_matrix, output_array, significance_level=0.05):
    data = add_constant(input_matrix)
    candidate_variables = list(data.columns)
    # >1 because we've added a 'const' column
    while len(candidate_variables) > 1:
        data = data.loc[:, candidate_variables]
        regressor = sm.OLS(endog=output_array, exog=data).fit()
        worst_index, p_value = max(enumerate(regressor.pvalues),
                                   key=itemgetter(1))
        if p_value > significance_level:
            print(f"Eliminating '{candidate_variables[worst_index]}' with p-value {p_value:.2}")
            del candidate_variables[worst_index]
        else:
            print(f"Final variable selection: {candidate_variables[1:]}")
            print(regressor.summary())
            return data.loc[:, candidate_variables[1:]]
    print("No significant correlation found for any variables")
    return None
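# A minimal, hypothetical call of backwardElimination on synthetic data. It assumes the
# module-level imports the function above needs (statsmodels.api as sm, add_constant,
# operator.itemgetter) plus numpy and pandas.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
demo_X = pd.DataFrame(rng.normal(size=(200, 3)), columns=['a', 'b', 'c'])
demo_y = 3.0 + 2.0 * demo_X['a'] + rng.normal(size=200)   # only 'a' carries signal
selected = backwardElimination(demo_X, demo_y)            # 'b' and 'c' are usually eliminated, leaving 'a'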
def computeForDay(self, strategy, timeSeriesTick, timeSeriesTrade):
    timeSeriesReg = timeSeriesTick.resample(
        str(int(self.resamplePeriod)) + "S"
    ).first()
    timeSeriesReg = timeSeriesReg.fillna(method="pad")
    timeTable = timeSeriesReg.to_frame()
    timeTable["second"] = timeSeriesReg.index.astype(np.int64)
    timeTable["second"] = (timeTable["second"] - timeTable["second"][0]) / math.pow(10, 9)
    # self.betaSeries = pd.stats.ols.MovingOLS(y=timeTable['price'], x=timeTable['second'],
    #     window_type='rolling', window=self.period, intercept=True).beta
    mod = RollingOLS(
        timeTable["price"],
        add_constant(timeTable["second"], prepend=False),
        window=self.period,
    )
    self.betaSeries = mod.fit().params
    return {"betaSeries": self.betaSeries}
def ols_fit_train(y_array, df, col_list, y_col_name):
    '''
    Takes df and the string name of the y column in df, uses patsy_input_str
    to build a formula string for the feature matrix. Outputs the OLS fit summary.
    '''
    input_string = patsy_input_str(df, y_col_name)

    # Create your feature matrix (X) and target vector (y)
    # (note: the patsy design matrices are built here but the fit below uses
    # y_array and the col_list columns directly)
    y, X = patsy.dmatrices(input_string, data=df, return_type="dataframe")
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=10)

    # Create your model
    model = sm.OLS(y_array, add_constant(df.loc[:, col_list]))

    # Fit your model to your training set
    fit = model.fit()

    # Print summary statistics of the model's performance
    return fit.summary()
def basic_logistic_regression(df, cutoff, col='pop_bin', rand=0, sig_only=False):
    df = df.copy()

    X, y = return_X_y_logistic(split_sample_combine(df, cutoff, col, rand=rand))
    X = standardize_X(X)
    X_const = add_constant(X, prepend=True)

    print("X_const\n", X_const)
    print("Y\n", y)

    logit_model = Logit(y, X_const).fit(method='lbfgs', skip_hessian=True,
                                        maxiter=20000)
    print(logit_model.summary())

    return logit_model
def linear_regression(Y, X, multiple_X=1, fix_nan=True, alfa=False, integrate=False):
    """
    Thin wrapper for ease of use; it is 100% statsmodels OLS under the hood.

    Y (list[float]) - dependent variable;
    X (list[float] | float) - independent variable;
    multiple_X (int, default=1) - if there are multiple factors (Xes), set the
        number of factor columns.
    """
    Y = np.array(Y).reshape(-1, 1)
    X = np.array(X).reshape(-1, multiple_X)
    if integrate:
        integrations, Y, X = data_tests.stationarity.forceSTATxy(Y, X)
    if alfa:
        X = add_constant(X)
    model = OLS(Y, X, missing='drop' if fix_nan else 'none').fit()
    return model if not integrate else (model, integrations)
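# A short usage sketch, assuming numpy as np, OLS, and add_constant are imported at module
# level as the function above requires (data_tests is only needed when integrate=True).
res = linear_regression([2.1, 3.9, 6.2, 8.1, 9.8], [1.0, 2.0, 3.0, 4.0, 5.0], alfa=True)
print(res.params)   # [intercept, slope]; the slope is close to 2 for this toy data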
def compute_variance_inflation_factor(X: np.array) -> np.array:
    """
    Compute the variance inflation factor for each feature of a matrix

    Parameters
    ----------
    {X}

    Returns
    -------
    vif: np.array of shape = (n_features)
        variance inflation factor of each feature of the input matrix
    """
    # Add a constant column as suggested here:
    # https://stackoverflow.com/questions/42658379/variance-inflation-factor-in-python
    X = add_constant(X, prepend=True)
    vif = np.array([oi.variance_inflation_factor(X, j) for j in range(X.shape[1])])
    # remove the first element corresponding to the constant column
    vif = np.delete(vif, 0)
    vif[np.isnan(vif)] = np.inf
    return vif
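# A hypothetical call with synthetic collinear columns; it assumes the module imports
# statsmodels.stats.outliers_influence as `oi` and numpy as np, as the function above uses.
rng = np.random.default_rng(0)
x1 = rng.normal(size=200)
X_demo = np.column_stack([x1, x1 + rng.normal(scale=0.05, size=200), rng.normal(size=200)])
print(compute_variance_inflation_factor(X_demo))   # large VIFs for the first two columns, ~1 for the third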
def fit(self, X, y, force_include_idx=None):
    '''
    Estimate a model using Post-Lasso.

    X: X matrix (without intercept)
    y: y vector
    force_include_idx: column indexes that are ALWAYS included in the OLS
        model, regardless of their status in the lasso stage.
    '''
    self.lasso_model = self.lasso_model.fit(X, y)
    # insert the intercept as the first coefficient
    self.coefs = np.insert(self.lasso_model.coef_, 0, self.lasso_model.intercept_)
    # select variables whose coefficient after lasso is not zero
    self.subset_cols = np.where(self.coefs != 0)[0]
    if force_include_idx is not None:
        # add cols defined in force_include_idx to subset_cols
        self.subset_cols = np.union1d(self.subset_cols, force_include_idx)
    # add a constant to X and keep only the selected columns
    self.relevant_x = add_constant(X)[:, self.subset_cols]
    self.ols_model = OLS(y, self.relevant_x).fit()
    return self
def variance_inflation_factors(exog_df):
    '''
    Parameters
    ----------
    exog_df : dataframe, (nobs, k_vars)
        design matrix with all explanatory variables, as for example used in
        regression.

    Returns
    -------
    vif : Series
        variance inflation factors
    '''
    exog_df = add_constant(exog_df)
    vifs = pd.Series(
        [1 / (1. - OLS(exog_df[col].values,
                       exog_df.loc[:, exog_df.columns != col].values).fit().rsquared)
         for col in exog_df],
        index=exog_df.columns,
        name='VIF')
    return vifs
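# A minimal usage sketch with synthetic data (not from the source); it assumes numpy as np,
# pandas as pd, OLS, and add_constant are imported at module level as the function above needs.
rng = np.random.default_rng(0)
demo = pd.DataFrame({'x1': rng.normal(size=100), 'x3': rng.normal(size=100)})
demo['x2'] = demo['x1'] * 0.9 + rng.normal(scale=0.1, size=100)   # nearly collinear with x1
print(variance_inflation_factors(demo[['x1', 'x2', 'x3']]))
# the 'const' entry can be ignored; x1 and x2 show large VIFs while x3 stays near 1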
def gen_data(nobs, nvar, const, pandas=False, missing=0.0, weights=False):
    rs = np.random.RandomState(987499302)
    x = rs.standard_normal((nobs, nvar))
    cols = ["x{0}".format(i) for i in range(nvar)]
    if const:
        x = tools.add_constant(x)
        cols = ["const"] + cols
    if missing > 0.0:
        mask = rs.random_sample(x.shape) < missing
        x[mask] = np.nan
    if x.shape[1] > 1:
        y = x[:, :-1].sum(1) + rs.standard_normal(nobs)
    else:
        y = x.sum(1) + rs.standard_normal(nobs)
    w = rs.chisquare(5, y.shape[0]) / 5
    if pandas:
        idx = pd.date_range("12-31-1999", periods=nobs)
        x = pd.DataFrame(x, index=idx, columns=cols)
        y = pd.Series(y, index=idx, name="y")
        w = pd.Series(w, index=idx, name="weights")
    if not weights:
        w = None
    return y, x, w
def calculate_QQplot(data1, data2, a=0):
    def _sample_quantiles(data):
        probplot = gofplots.ProbPlot(np.array(data, dtype=float), a=a)
        return probplot.sample_quantiles

    def _match_quantile_probabilities(quantiles1, quantiles2):
        if len(quantiles1) > len(quantiles2):
            quantiles2, quantiles1 = _match_quantile_probabilities(
                quantiles2, quantiles1)
        else:
            N_obs = len(quantiles1)
            probs = gofplots.plotting_pos(N_obs, a)
            quantiles2 = scstats.mstats.mquantiles(quantiles2, probs)
        return quantiles1, quantiles2

    s1, s2 = _sample_quantiles(data1), _sample_quantiles(data2)
    s1, s2 = _match_quantile_probabilities(s1, s2)

    linreg_result = OLS(s2, add_constant(s1)).fit()
    s2_fitted = linreg_result.fittedvalues
    r = np.sqrt(linreg_result.rsquared)
    return s1, s2, s2_fitted, r
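# Hypothetical usage, assuming the module-level imports the function above relies on
# (numpy as np, scipy.stats as scstats, statsmodels.graphics.gofplots as gofplots,
# OLS, add_constant).
rng = np.random.default_rng(0)
s1, s2, s2_fit, r = calculate_QQplot(rng.normal(size=300), rng.normal(size=200))
print(r)   # correlation of the Q-Q points; close to 1 when both samples share a distribution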
def glm_regularized_AIC(X, Y, reg_mod, unreg_mod, tol=1e-6,
                        method="kawano", family="binomial"):
    """
    Calculate AIC for a Generalized Linear Model with regularization.

    See 'AIC for the Lasso in GLMs', Y. Ninomiya and S. Kawano (2016)

    Parameters
    ----------
    X : numpy array or pandas dataframe
        Feature or design matrix.
    Y : numpy array or pandas series
        Target or response variable.
    reg_mod : sklearn, or similar
        The regularized model.
    unreg_mod : sklearn, or similar
        The unregularized model.
    tol : float, optional
        Tolerance cutoff for counting non-zero coefficients. The default is 1e-6.
    method : str, optional
        The method for calculating the AIC. Either `kawano` or `Hastie`.
        The default is "kawano".
    family : str, optional
        The type of generalised linear model. The default is "binomial".

    Raises
    ------
    ValueError
        Raised if an invalid family is picked.

    Returns
    -------
    aic : float
        The calculated AIC.
    """
    # Requires a predict_proba method for logistic regression and a predict
    # method for the others; for poisson, the predict output should be lambda,
    # i.e. it should already be exponentiated.
    if isinstance(X, pd.DataFrame):
        X = X.values
    if isinstance(Y, (pd.DataFrame, pd.Series)):
        Y = Y.values
    aic = None
    if family == "binomial":
        nllf = glm_likelihood_bernoulli
        unreg_prob = unreg_mod.predict_proba(X)[:, 1]
        reg_prob = reg_mod.predict_proba(X)[:, 1]
        y_mat_unreg = np.diag(unreg_prob * (1 - unreg_prob))
        y_mat_reg = np.diag(reg_prob * (1 - reg_prob))
    elif family == "poisson":
        nllf = glm_likelihood_poisson
        unreg_pred = unreg_mod.predict(X)
        reg_pred = reg_mod.predict(X)
        y_mat_unreg = np.diag(unreg_pred)
        y_mat_reg = np.diag(reg_pred)
    elif family == "gaussian":
        nllf = glm_likelihood_gaussian
        unreg_pred = unreg_mod.predict(X)
        reg_pred = reg_mod.predict(X)
        y_mat_unreg = np.diag(unreg_pred)
        y_mat_reg = np.diag(reg_pred)
    else:
        raise ValueError("Not a valid family")

    reg_mod_coef = np.concatenate(
        (reg_mod.intercept_, np.squeeze(reg_mod.coef_)))
    nonzero_idx = np.where(
        [np.abs(coef) > tol for coef in reg_mod_coef])[0]

    if method == "kawano":
        X_nz = add_constant(X[:, nonzero_idx], prepend=True)
        j22 = np.linalg.multi_dot([X_nz.T, y_mat_reg, X_nz])
        j22_2 = np.linalg.multi_dot([X_nz.T, y_mat_unreg, X_nz])
        negloglike = nllf(reg_mod_coef, X, Y, lamb=0, l_norm=0)
        aic = 2 * negloglike + np.sum(np.diag(np.linalg.inv(j22).dot(j22_2)))
    else:
        # Tibshirani, Hastie, Zou 2007, on the degrees of freedom of the lasso
        negloglike = nllf(reg_mod_coef, X, Y, lamb=0, l_norm=0)
        # Not 100% sure on this calculation: should it be 2*len(nonzero_idx),
        # the count of nonzero columns, or 2*mean(nonzero_idx)?
        aic = 2 * negloglike + 2 * len(nonzero_idx)
    return aic
if len(df) == 0:
    model = irimodel
else:
    pred = irimodel.predict(df['station.longitude'].values,
                            df['station.latitude'].values)
    error = pred - df[metric].values
    print(df[metric].values)
    print(pred)
    print(error)
    print(np.sqrt(np.sum(error**2) / np.sum(df.cs.values)),
          np.sum(error) / np.sum(df.cs.values))

    if metric in ['mufd', 'fof2']:
        wls_model = sm.WLS(df[metric].values - pred,
                           add_constant(pred, prepend=False),
                           df.cs.values)
        wls_fit = wls_model.fit_regularized(alpha=np.array([1, 3]), L1_wt=0)
        coeff = wls_fit.params
        coeff[0] = coeff[0] + 1
        print(coeff)

        irimodel = LinearModel(irimodel, coeff[0], coeff[1])

        pred = irimodel.predict(df['station.longitude'].values,
                                df['station.latitude'].values)
        error = pred - df[metric].values
        print(df[metric].values)
        print(pred)
        print(error)
        print(np.sqrt(np.sum(error**2) / np.sum(df.cs.values)),
              np.sum(error) / np.sum(df.cs.values))
def test_glm(constraints=None):
    # More comprehensive tests against GLM estimates (this is sort of redundant
    # given `test_ols`, but this is mostly to complement the tests in
    # `test_glm_constrained`)
    endog = dta.infl
    exog = add_constant(dta[['unemp', 'm1']])

    mod = RecursiveLS(endog, exog, constraints=constraints)
    res = mod.fit()

    mod_glm = GLM(endog, exog)
    if constraints is None:
        res_glm = mod_glm.fit()
    else:
        res_glm = mod_glm.fit_constrained(constraints=constraints)

    # Regression coefficients, standard errors, and estimated scale
    assert_allclose(res.params, res_glm.params)
    assert_allclose(res.bse, res_glm.bse, atol=1e-6)
    # Note: scale here is computed according to Harvey, 1989, 4.2.5, and is
    # called the ML estimator and sometimes (e.g. later in section 5)
    # denoted \tilde \sigma_*^2
    assert_allclose(res.filter_results.obs_cov[0, 0], res_glm.scale)

    # DoF
    # Note: GLM does not include intercept in DoF, so modify by -1
    assert_equal(res.df_model - 1, res_glm.df_model)

    # OLS residuals are equivalent to smoothed forecast errors
    # (the latter are defined as e_t|T by Harvey, 1989, 5.4.5)
    # (this follows since the smoothed state simply contains the
    # full-information estimates of the regression coefficients)
    actual = (mod.endog[:, 0] -
              np.sum(mod['design', 0, :, :] * res.smoothed_state, axis=0))
    assert_allclose(actual, res_glm.resid_response, atol=1e-7)

    # Given the estimate of scale as `sum(v_t^2 / f_t) / (T - d)` (see
    # Harvey, 1989, 4.2.5 on p. 183), then llf_recursive is equivalent to the
    # full OLS loglikelihood (i.e. without the scale concentrated out).
    desired = mod_glm.loglike(res_glm.params, scale=res_glm.scale)
    assert_allclose(res.llf_recursive, desired)

    # Alternatively, we can construct the concentrated OLS loglikelihood
    # by computing the scale term with `nobs` in the denominator rather than
    # `nobs - d`.
    scale_alternative = np.sum(
        (res.standardized_forecasts_error[0, 1:] *
         res.filter_results.obs_cov[0, 0]**0.5)**2) / mod.nobs
    llf_alternative = np.log(norm.pdf(res.resid_recursive, loc=0,
                                      scale=scale_alternative**0.5)).sum()
    assert_allclose(llf_alternative, res_glm.llf)

    # Prediction
    # TODO: prediction in this case is not working.
    if constraints is None:
        design = np.ones((1, 3, 10))
        actual = res.forecast(10, design=design)
        assert_allclose(actual, res_glm.predict(np.ones((10, 3))))
    else:
        design = np.ones((2, 3, 10))
        assert_raises(NotImplementedError, res.forecast, 10, design=design)

    # Hypothesis tests
    actual = res.t_test('m1 = 0')
    desired = res_glm.t_test('m1 = 0')
    assert_allclose(actual.statistic, desired.statistic)
    assert_allclose(actual.pvalue, desired.pvalue, atol=1e-15)

    actual = res.f_test('m1 = 0')
    desired = res_glm.f_test('m1 = 0')
    assert_allclose(actual.statistic, desired.statistic)
    assert_allclose(actual.pvalue, desired.pvalue)

    # Information criteria
    # Note: the llf and llf_obs given in the results are based on the Kalman
    # filter and so the ic given in results will not be identical to the
    # OLS versions. Additionally, llf_recursive is comparable to the
    # non-concentrated llf, and not the concentrated llf that is by default
    # used in OLS. Compute new ic based on llf_alternative to compare.
    actual_aic = aic(llf_alternative, res.nobs_effective, res.df_model)
    assert_allclose(actual_aic, res_glm.aic)
except ImportError:
    have_matplotlib = False

current_path = os.path.dirname(os.path.abspath(__file__))
results_R_path = 'results' + os.sep + 'results_rls_R.csv'
results_R = pd.read_csv(current_path + os.sep + results_R_path)
results_stata_path = 'results' + os.sep + 'results_rls_stata.csv'
results_stata = pd.read_csv(current_path + os.sep + results_stata_path)

dta = macrodata.load_pandas().data
dta.index = pd.date_range(start='1959-01-01', end='2009-07-01', freq='QS')

endog = dta['cpi']
exog = add_constant(dta['m1'])


def test_endog():
    # Tests for numpy input
    mod = RecursiveLS(endog.values, exog.values)
    res = mod.fit()

    # Test the RLS estimates against OLS estimates
    mod_ols = OLS(endog, exog)
    res_ols = mod_ols.fit()
    assert_allclose(res.params, res_ols.params)

    # Tests for 1-dim exog
    mod = RecursiveLS(endog, dta['m1'].values)
    res = mod.fit()
ax.set_xlabel('YEAR')
ax.set_ylabel('DEC')
plt.show()

from sklearn import linear_model, feature_selection, preprocessing
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.tools.eval_measures import mse
from statsmodels.tools import add_constant
from sklearn.metrics import mean_squared_error

X = df.values.copy()
X_train, X_valid, y_train, y_valid = train_test_split(
    X[:, :-1], X[:, -1], train_size=0.80)

result = sm.OLS(y_train, add_constant(X_train)).fit()
result.summary()

ypred = result.predict(add_constant(X_valid))
print(mse(ypred, y_valid))

fig, ax = plt.subplots(1, 1)
ax.scatter(y_valid, ypred)
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
plt.show()
    square = lambda row: row**2
    sum_of_squares = df['difference'].apply(square).sum()
    return sum_of_squares


x0 = [-20, .0008, 1.1]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead',
                  options={'xtol': 1e-8, 'disp': True})

clf = linear_model.LinearRegression()
x = df[['AADT', 'L']].values
y = df['Crashes']
clf.fit(x, y)
clf.coef_
clf.intercept_

model = OLS(y, add_constant(x))
model_fit = model.fit()
model_fit.summary()


def estimator(x, row_in='Crashes'):
    estimated = lambda row: exp(x[0] + x[1] * row['AADT'] + x[2] * row['L'])
    df['estimated'] = df.apply(estimated, axis=1)
    # probability = lambda row: (row['estimated']**row[row_in] * exp(-row['estimated'])) / factorial(row[row_in])
    probability = lambda row: poisson.pmf(row[row_in], row['estimated'])
    df['probability'] = df.apply(probability, axis=1)
    product = df['probability'].product()
    return -product


x0 = [1.6, .0000026, .032]
estimator(x0)
optimize.minimize(estimator, x0, method='nelder-mead',
                  options={'xtol': 1e-8, 'disp': True})
plt.show()

# 1.4
df.hist()
plt.show()

# Part 2
# 2.1
from statsmodels.discrete.discrete_model import Logit
from statsmodels.tools import add_constant

X = df[['gre', 'gpa', 'rank']].values
X_const = add_constant(X, prepend=True)
y = df['admit'].values

logit_model = Logit(y, X_const).fit()

# 2.2
logit_model.summary()

# 2.3
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from statsmodels.tools import add_constant
from numpy.testing import assert_equal, assert_raises, assert_allclose

current_path = os.path.dirname(os.path.abspath(__file__))
results_R_path = 'results' + os.sep + 'results_rls_R.csv'
results_R = pd.read_csv(current_path + os.sep + results_R_path)
results_stata_path = 'results' + os.sep + 'results_rls_stata.csv'
results_stata = pd.read_csv(current_path + os.sep + results_stata_path)

dta = macrodata.load_pandas().data
dta.index = pd.date_range(start='1959-01-01', end='2009-07-01', freq='QS')

endog = dta['cpi']
exog = add_constant(dta['m1'])


def test_endog():
    # Tests for numpy input
    mod = RecursiveLS(endog.values, exog.values)
    res = mod.fit()

    # Test the RLS estimates against OLS estimates
    mod_ols = OLS(endog, exog)
    res_ols = mod_ols.fit()
    assert_allclose(res.params, res_ols.params)

    # Tests for 1-dim exog
    mod = RecursiveLS(endog, dta['m1'].values)
    res = mod.fit()
def __init__(self):
    data = stackloss.load()
    data.exog = add_constant(data.exog)
    self.res1 = OLS(data.endog, data.exog).fit()
    self.res2 = RegressionResults()
sn.countplot(x='TenYearCHD', data=heart_df)

# There are 3179 patients with no heart disease and 572 patients at risk of heart disease.

# In[21]:

sn.pairplot(data=heart_df)

# In[22]:

heart_df.describe()

# In[23]:

from statsmodels.tools import add_constant

heart_df_constant = add_constant(heart_df)
heart_df_constant.head()

# In[24]:

st.chisqprob = lambda chisq, df: st.chi2.sf(chisq, df)
cols = heart_df_constant.columns[:-1]
model = sm.Logit(heart_df.TenYearCHD, heart_df_constant[cols])
result = model.fit()
result.summary()

# In[43]:


def back_feature_elem(data_frame, dep_var, col_list):
    """
    Takes in the dataframe, the dependent variable and a list of column names,
    runs the regression repeatedly, eliminating the feature with the highest
# label encoding and one hot encoding categorical variables: Pclass, Sex, Embarked
dataframe = pd.get_dummies(dataframe, columns=['Pclass', 'Sex'], drop_first=True)

# for now feature scaling seems unnecessary, but we'll add it later if it turns out to be required

# extract independent and dependent variable matrices
X = dataframe.drop(labels=['PassengerId', 'Survived'], axis=1)
y = dataframe['Survived']

## Backward Elimination
# add a column of 1s to represent the x0 variable (intercept)
X = smtools.add_constant(X)

# use Backward Elimination to get rid of insignificant variables
significance_level = 0.05
X = toolkit.backward_elimination_using_pvalues(X, y, significance_level)
# X = toolkit.backward_elimination_using_adjR2(X, y)

# Fitting Decision Tree Classification to the Training set
accuracies = {}
std = {}
classifier = DecisionTreeClassifier()
accuracies['Decision Tree'], std['Decision Tree'] = classifier.classify(X, y)

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier()
accuracies['Random Forest'], std['Random Forest'] = classifier.classify(X, y)
y = np.array([data['sales']]).reshape(-1, 1)

# %%
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X, y)
score = model.score(X, y)
print(f"R2 Score: {score}")
print('Weight coefficients: ', model.coef_)

# %%
# << Statsmodels >>
# X needs an added constant to match the results from scikit-learn
X1 = stat.add_constant(X)
model = sm.OLS(y, X1)
results = model.fit()
print(results.summary())

# %%
# Regression Model with Qualitative Predictors ---------------

# load data
path = '/Users/michaelshih/Documents/code/education/statistical_learining/'
subfolder = 'resource'
filename = 'Credit.csv'
filedir = os.path.join(path, subfolder, filename)
print(filedir)

data = pd.read_csv(filedir, index_col=0)
def test_design(self):
    npt.assert_equal(self.model.exog,
                     add_constant(self.data.exog, prepend=True))
def test_glm(constraints=None):
    # More comprehensive tests against GLM estimates (this is sort of redundant
    # given `test_ols`, but this is mostly to complement the tests in
    # `test_glm_constrained`)
    endog = dta.infl
    exog = add_constant(dta[['unemp', 'm1']])

    mod = RecursiveLS(endog, exog, constraints=constraints)
    res = mod.fit()

    mod_glm = GLM(endog, exog)
    if constraints is None:
        res_glm = mod_glm.fit()
    else:
        res_glm = mod_glm.fit_constrained(constraints=constraints)

    # Regression coefficients, standard errors, and estimated scale
    assert_allclose(res.params, res_glm.params)
    assert_allclose(res.bse, res_glm.bse, atol=1e-6)
    # Note: scale here is computed according to Harvey, 1989, 4.2.5, and is
    # called the ML estimator and sometimes (e.g. later in section 5)
    # denoted \tilde \sigma_*^2
    assert_allclose(res.filter_results.obs_cov[0, 0], res_glm.scale)

    # DoF
    # Note: GLM does not include intercept in DoF, so modify by -1
    assert_equal(res.df_model - 1, res_glm.df_model)

    # OLS residuals are equivalent to smoothed forecast errors
    # (the latter are defined as e_t|T by Harvey, 1989, 5.4.5)
    # (this follows since the smoothed state simply contains the
    # full-information estimates of the regression coefficients)
    actual = (mod.endog[:, 0] -
              np.sum(mod['design', 0, :, :] * res.smoothed_state, axis=0))
    assert_allclose(actual, res_glm.resid_response, atol=1e-7)

    # Given the estimate of scale as `sum(v_t^2 / f_t) / (T - d)` (see
    # Harvey, 1989, 4.2.5 on p. 183), then llf_recursive is equivalent to the
    # full OLS loglikelihood (i.e. without the scale concentrated out).
    desired = mod_glm.loglike(res_glm.params, scale=res_glm.scale)
    assert_allclose(res.llf_recursive, desired)

    # Alternatively, we can construct the concentrated OLS loglikelihood
    # by computing the scale term with `nobs` in the denominator rather than
    # `nobs - d`.
    scale_alternative = np.sum((
        res.standardized_forecasts_error[0, 1:] *
        res.filter_results.obs_cov[0, 0]**0.5)**2) / mod.nobs
    llf_alternative = np.log(norm.pdf(res.resid_recursive, loc=0,
                                      scale=scale_alternative**0.5)).sum()
    assert_allclose(llf_alternative, res_glm.llf)

    # Prediction
    # TODO: prediction in this case is not working.
    if constraints is None:
        design = np.ones((1, 3, 10))
        actual = res.forecast(10, design=design)
        assert_allclose(actual, res_glm.predict(np.ones((10, 3))))
    else:
        design = np.ones((2, 3, 10))
        assert_raises(NotImplementedError, res.forecast, 10, design=design)

    # Hypothesis tests
    actual = res.t_test('m1 = 0')
    desired = res_glm.t_test('m1 = 0')
    assert_allclose(actual.statistic, desired.statistic)
    assert_allclose(actual.pvalue, desired.pvalue, atol=1e-15)

    actual = res.f_test('m1 = 0')
    desired = res_glm.f_test('m1 = 0')
    assert_allclose(actual.statistic, desired.statistic)
    assert_allclose(actual.pvalue, desired.pvalue)

    # Information criteria
    # Note: the llf and llf_obs given in the results are based on the Kalman
    # filter and so the ic given in results will not be identical to the
    # OLS versions. Additionally, llf_recursive is comparable to the
    # non-concentrated llf, and not the concentrated llf that is by default
    # used in OLS. Compute new ic based on llf_alternative to compare.
    actual_aic = aic(llf_alternative, res.nobs_effective, res.df_model)
    assert_allclose(actual_aic, res_glm.aic)
def _EM_test(self, nuisance_params, params=None, param_nums=None,
             b0_vals=None, F=None, survidx=None, uncens_nobs=None,
             numcensbelow=None, km=None, uncensored=None, censored=None,
             maxiter=None, ftol=None):
    """
    Uses the EM algorithm to compute the maximum likelihood of a test.

    Parameters
    ----------
    nuisance_params : array
        Vector of values to be used as nuisance params.
    maxiter : int
        Number of iterations in the EM algorithm for a parameter vector.

    Returns
    -------
    -2 * log-likelihood ratio at the hypothesized values and nuisance params.

    Notes
    -----
    Optional parameters are provided by the test_beta function.
    """
    iters = 0
    params[param_nums] = b0_vals

    nuis_param_index = np.int_(np.delete(np.arange(self.model.nvar),
                                         param_nums))
    params[nuis_param_index] = nuisance_params
    to_test = params.reshape(self.model.nvar, 1)
    opt_res = np.inf
    diff = np.inf
    while iters < maxiter and diff > ftol:
        F = F.flatten()
        death = np.cumsum(F[::-1])
        survivalprob = death[::-1]
        surv_point_mat = np.dot(F.reshape(-1, 1),
                                1. / survivalprob[survidx].reshape(1, -1))
        surv_point_mat = add_constant(surv_point_mat)
        summed_wts = np.cumsum(surv_point_mat, axis=1)
        wts = summed_wts[np.int_(np.arange(uncens_nobs)),
                         numcensbelow[uncensored]]
        # ^ E step
        # See Zhou 2005, section 3.
        self.model._fit_weights = wts
        new_opt_res = self._opt_wtd_nuis_regress(to_test)
        # ^ Uncensored weights' contribution to likelihood value.
        F = self.new_weights
        # ^ M step
        diff = np.abs(new_opt_res - opt_res)
        opt_res = new_opt_res
        iters = iters + 1
    death = np.cumsum(F.flatten()[::-1])
    survivalprob = death[::-1]
    llike = -opt_res + np.sum(np.log(survivalprob[survidx]))
    wtd_km = km.flatten() / np.sum(km)
    survivalmax = np.cumsum(wtd_km[::-1])[::-1]
    llikemax = np.sum(np.log(wtd_km[uncensored])) + \
        np.sum(np.log(survivalmax[censored]))
    if iters == maxiter:
        warnings.warn('The EM reached the maximum number of iterations',
                      IterationLimitWarning)
    return -2 * (llike - llikemax)
def setup_class(cls):
    data = stackloss.load(as_pandas=False)
    data.exog = add_constant(data.exog)
    cls.res1 = OLS(data.endog, data.exog).fit()
    cls.res2 = RegressionResults()
def fitPoisson(X, Y):
    X = add_constant(X)
    return sm.GLM(Y, X, family=sm.families.Poisson()).fit(disp=0)
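# A tiny usage sketch on simulated count data, assuming `sm` is statsmodels.api and
# `add_constant` comes from statsmodels.tools, as the function above already requires.
import numpy as np
rng = np.random.default_rng(0)
x_demo = rng.normal(size=500)
y_demo = rng.poisson(lam=np.exp(0.5 + 0.8 * x_demo))
print(fitPoisson(x_demo, y_demo).params)   # roughly [0.5, 0.8] (constant first)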
### accuracy of probabilistic predictions in a set of mutually
### exclusive outcomes, i.e. default vs. non-default

model_vars = [
    'term',
    'home_ownership',
    'grade',
    'purpose',
    'emp_length',
]
continous_vars = ['funded_amnt', 'dti']

le = preprocessing.LabelEncoder()
y = df_sample['default'].reset_index(drop=True)
X = pd.DataFrame([])
for var in model_vars:
    X[var] = le.fit_transform(df_sample[var])
for i in continous_vars:
    X[i] = df_sample[i].reset_index(drop=True)

# Add constant
X = smt.add_constant(X)

# Regression analysis
logit_model = sm.Logit(y, X)
result = logit_model.fit(disp=0)

y_true = df_sample['default']
y_pred = result.predict()
print(brier_score_loss(y_true, y_pred))
# VIF
# The VIF of each column is fine: all of them are smaller than 5, most even below 2.
def variance_inflation_factor(exog, exog_idx):
    k_vars = exog.shape[1]
    x_i = exog.iloc[:, exog_idx]
    mask = np.arange(k_vars) != exog_idx
    x_noti = exog.iloc[:, mask]
    r_squared_i = OLS(x_i, x_noti).fit().rsquared
    vif = 1. / (1. - r_squared_i)
    return vif


# VIF of each column; we skip the constant column
VIF = [variance_inflation_factor(add_constant(X), i)
       for i in range(1, X.shape[1] + 1)]

regr_1 = OLS(y, add_constant(X)).fit()

# residual distribution
sns.distplot(regr_1.resid)
# looks approximately normal, which is good
# since the residual itself is normal, box-cox is not necessary.
# namda = 0.1
# regr_test = OLS((y**namda-1)/namda, add_constant(X)).fit()
# sns.jointplot((y**namda-1)/namda, regr_test.resid)
# sns.distplot(regr_test.resid)

sns.jointplot(y, regr_1.resid)
# which looks very strange; maybe the model is not linear in the first place.
# since there is explicit non-linearity in this model, we have to add some
# non-linear covariates.