def backward_elimination(X_opt, y, significant_level):
    """Iteratively drop the predictor with the highest p-value, rolling the
    deletion back if it lowers the adjusted R-squared."""
    import numpy as np
    import statsmodels.regression.linear_model as sm

    num_var = len(X_opt[0])
    while num_var > 0:
        regressor = sm.OLS(y, X_opt).fit()
        max_Pvalue = max(regressor.pvalues).astype(float)
        if max_Pvalue > significant_level:
            real_num_var = num_var
            for i in range(num_var):
                prev_X_opt = X_opt
                cur_adjusted_rvalue = regressor.rsquared_adj
                if regressor.pvalues[i] == max_Pvalue:
                    X_opt = np.delete(X_opt, i, 1)
                    temp_regressor = sm.OLS(y, X_opt).fit()
                    new_adjusted_rvalue = temp_regressor.rsquared_adj
                    # if the fit got worse, undo the deletion and stop
                    if new_adjusted_rvalue < cur_adjusted_rvalue:
                        return prev_X_opt
                    real_num_var -= 1
            num_var = real_num_var
        else:
            break
    return X_opt
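# A minimal usage sketch for backward_elimination above; the demo data is
# synthetic (column 0 plays the role of the intercept):
import numpy as np

rng = np.random.default_rng(0)
X_demo = np.column_stack([np.ones(100), rng.normal(size=(100, 3))])
y_demo = X_demo @ np.array([1.0, 2.0, 0.0, -1.5]) + rng.normal(size=100)
X_selected = backward_elimination(X_demo, y_demo, significant_level=0.05)
print(X_selected.shape)  # only the significant columns survive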
def backwards_elimination(X, y, sl):
    """Backward elimination: start with all independent variables and drop
    the one with the highest p-value until all are below sl."""
    # add x0 = 1 for the coefficient b0 in the multiple linear regression equation
    X = np.append(arr=np.ones((len(X), 1)).astype(int), values=X, axis=1)
    # start with all independent variables, eliminating them one by one
    X_opt = X
    cols = X_opt.shape[1]
    for i in range(0, cols):
        regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
        maxP = max(regressor_OLS.pvalues).astype(float)
        if maxP > sl:
            for j in range(0, cols - i):
                if regressor_OLS.pvalues[j].astype(float) == maxP:
                    X_opt = np.delete(X_opt, j, 1)
    regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
    return regressor_OLS
def vif_sub(x, regressors):
    """Variance inflation factor for column x of the regressors DataFrame."""
    k = regressors.shape[1]  # number of regressors (was an undefined global)
    x_i = regressors.iloc[:, x]
    mask = np.arange(k) != x
    x_not_i = regressors.iloc[:, mask]
    # note: this uses the adjusted R-squared; the textbook VIF uses the plain R-squared
    rsq = linear_model.OLS(x_i, x_not_i, missing="drop").fit().rsquared_adj
    vif = 1.0 / (1.0 - rsq)
    return vif
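# A minimal usage sketch for vif_sub; the imports are an assumption about
# the np / linear_model names the function references, and the data is
# synthetic:
import numpy as np
import pandas as pd
from statsmodels.regression import linear_model

rng = np.random.default_rng(1)
regs = pd.DataFrame(rng.normal(size=(200, 3)), columns=["a", "b", "c"])
regs["d"] = regs["a"] + 0.1 * rng.normal(size=200)  # nearly collinear with "a"
print(vif_sub(3, regs))  # a large VIF flags the collinearity of "d"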
def backwardElimination(x, y, sl, con=True):
    # con indicates whether the constant column has already been added or not
    import matplotlib.pyplot as plt
    import numpy as np
    import statsmodels.regression.linear_model as lmd

    if con == False:
        x = np.append(arr=np.ones((len(x), 1)).astype(int), values=x, axis=1)
    numVars = len(x[0])
    r_sq = 0
    for i in range(0, numVars):
        regressor_OLS = lmd.OLS(endog=y, exog=x).fit()
        if regressor_OLS.rsquared_adj > r_sq:  # to see the complete graph, replace r_sq by 0
            r_sq = regressor_OLS.rsquared_adj
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    x = np.delete(x, j, 1)
        print(regressor_OLS.rsquared_adj)
        plt.scatter(i, regressor_OLS.rsquared_adj)
        plt.draw()
        plt.pause(0.2)
    plt.show()
    print(regressor_OLS.summary())
    return x
def cluster_vs_meta_granger_TM(c, X, M, Ml, lags=7, thresh=0.05):
    # use the Toda-Yamamoto method (environmental data is stationary,
    # but clusters are not)
    x1 = X[c].sum(0)
    # integration order of the cluster series, from the ADF test
    adf = stattools.adfuller(x1, maxlag=lags)
    if adf[0] > adf[4]['5%']:
        m1 = adf[2]
    else:
        m1 = 0
    R = []
    for j, x2 in enumerate(M):
        have_values = np.isfinite(x2)
        xi = x1[have_values]
        x2i = x2[have_values]
        adf = stattools.adfuller(x2i, maxlag=lags)
        if adf[0] > adf[4]['5%']:
            m2 = adf[2]
        else:
            m2 = 0
        m = max(m1, m2)
        # build the lag matrix, augmented by the maximal integration order
        y = ([xi[i + max(0, m2 - m1):len(xi) + i - (m1 + lags)]
              for i in range(m1 + lags)] +
             [x2i[i + max(0, m1 - m2):len(xi) + i - (m2 + lags)]
              for i in range(m2 + lags)])
        y = np.array(y).T
        lm = linear_model.OLS(xi[max(m1, m2) + lags:], y)
        result = lm.fit()
        # Wald test that the augmented lags are jointly zero
        Restr = np.eye(y.shape[1])[m + lags:]
        wald = result.wald_test(Restr)
        if wald.pvalue < thresh:
            R.append((wald.pvalue, Ml[j]))
    return m, sorted(R)
def _za_thread(x, regression, start, end, nobs, basecols, baselags, res,
               residx):
    # first-diff y
    dy = np.diff(x, axis=0)[:, 0]
    zastat = bpidx = np.inf
    for bp in range(start, end):
        # reserve exog space
        exog = np.zeros((dy[baselags:].shape[0], basecols + baselags))
        # constant
        exog[:, 0] = 1
        # intercept dummy / trend / trend dummy
        if regression != 't':
            exog[(bp - (baselags + 1)):, 1] = 1
            exog[:, 2] = np.arange(baselags + 2, nobs + 1)
            if regression == 'ct':
                exog[(bp - (baselags + 1)):, 3] = np.arange(1, nobs - bp + 1)
        else:
            exog[:, 1] = np.arange(baselags + 2, nobs + 1)
            exog[(bp - (baselags + 1)):, 2] = np.arange(1, nobs - bp + 1)
        # lagged y
        exog[:, basecols - 1] = x[baselags:(nobs - 1), 0]
        # lagged dy
        exog[:, basecols:] = tsa.lagmat(
            dy, baselags, trim='none')[baselags:exog.shape[0] + baselags]
        stat = lm.OLS(dy[baselags:], exog).fit().tvalues[basecols - 1]
        if stat < zastat:
            zastat = stat
            bpidx = bp - 1
    crit = zacrit.za_crit(zastat, regression)
    pval = crit[0]
    cvdict = crit[1]
    res[residx] = [zastat, pval, cvdict, bpidx]
from typing import List


def polynomial_regression() -> List[float]:
    """ Define the model directly through the design matrix.
    Similar to MATLAB's "regress" command.

    Returns
    -------
    params : coefficients for the quadratic model
    """
    # Generate the data: a noisy second-order polynomial
    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)
    t = np.arange(0, 10, 0.1)
    y = 4 + 3 * t + 2 * t**2 + 5 * np.random.randn(len(t))

    # --- >>> START stats <<< ---
    # Make the fit. Note that this is another "OLS" than the one in
    # "model_formulas", as it works directly with the design matrix!
    M = np.column_stack((np.ones(len(t)), t, t**2))
    res = sm.OLS(y, M).fit()
    # --- >>> STOP stats <<< ---

    # Display the results
    print('Summary:')
    print(res.summary())
    print('The fit parameters are: {0}'.format(str(res.params)))
    print('The confidence intervals are:')
    print(res.conf_int())

    return res.params  # should be [ 4.74244177, 2.60675788, 2.03793634]
def test_arch_lm(simulated_data):
    zm = ZeroMean(simulated_data, volatility=GARCH())
    res = zm.fit(disp=DISPLAY)
    wald = res.arch_lm_test()
    nobs = simulated_data.shape[0]
    df = int(np.ceil(12.0 * np.power(nobs / 100.0, 1 / 4.0)))
    assert wald.df == df
    assert "Standardized" not in wald.null
    assert "Standardized" not in wald.alternative
    assert "H0: Standardized" not in wald.__repr__()
    assert "heteroskedastic" in wald.__repr__()

    # replicate the LM statistic by regressing squared residuals on their lags
    resids2 = pd.Series(res.resid ** 2)
    data = [resids2.shift(i) for i in range(df + 1)]
    data = pd.concat(data, axis=1).dropna()
    lhs = data.iloc[:, 0]
    rhs = smtools.add_constant(data.iloc[:, 1:])
    ols_res = smlm.OLS(lhs, rhs).fit()
    assert_almost_equal(wald.stat, nobs * ols_res.rsquared)
    assert len(wald.critical_values) == 3
    assert "10%" in wald.critical_values

    wald = res.arch_lm_test(lags=5)
    assert wald.df == 5
    assert_almost_equal(wald.pval, 1 - stats.chi2(5).cdf(wald.stat))

    wald = res.arch_lm_test(standardized=True)
    assert wald.df == df
    assert "Standardized" in wald.null
    assert "Standardized" in wald.alternative
    assert_almost_equal(wald.pval, 1 - stats.chi2(df).cdf(wald.stat))
    assert "H0: Standardized" in wald.__repr__()
def cohort_1d_regressions(configs):
    for fpath in sorted(
            glob.glob(os.path.join(configs["metagroup_path"], "*.csv"))):
        groupname = fpath.split("/")[-1][:-4]
        print("Group: {}".format(groupname))
        with open(fpath, 'r') as fp:
            csv_fp = csv.reader(fp, delimiter=',')
            next(csv_fp)
            x = []
            y = []
            for line in csv_fp:
                if line[0].strip() == "held_out" or line[1] == "" or line[2] == "":
                    continue
                x.extend([1, 0])
                y.extend([float(line[1]), float(line[2])])
        lm = statslm.OLS(y,
                         statstools.add_constant(
                             np.array(x).reshape(len(x), 1)),
                         missing='raise',
                         hasconst=True)
        results = lm.fit()
        print("Group result for group {} intercept/group effect: {}, t-test p-val: {}"
              .format(groupname, results.params, results.pvalues))
def interaction_effects(data, dep_variable, threshold):
    # remove date variable and booking/click depending on dep_variable
    data = data.iloc[:, 1:].copy()
    if dep_variable == "click_bool":
        data = data.drop("booking_bool", axis=1)
    elif dep_variable == "booking_bool":
        data = data.drop("click_bool", axis=1)

    # choose dependent variable
    X = data.drop(dep_variable, axis=1)
    y = data[dep_variable]

    # generate interaction terms
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    X_interaction = poly.fit_transform(X)

    # get names of these terms
    names = PolynomialFeatureNames(poly.get_feature_names(), X)

    # fit model to check importance of features
    model = linear_model.OLS(y, X_interaction).fit()

    # save results
    with open("output/interaction_effects/modelsummary.csv", "w") as f:
        f.write(model.summary().as_csv())

    # show significant results
    results = pd.read_csv(
        "output/interaction_effects/modelsummary.csv",
        skiprows=10,
        skipfooter=10,
        index_col=0,
    )
    results.index = names
    sign_results = results[results["P>|t| "] < threshold]
    print(sign_results.sort_values(by=" coef ", ascending=False))
def calculate(self, key, return_data=False):
    lob = self._lobagent.get(key).get_data()
    price = self._priceagent.get(key).get_data().price[0]
    lob_buy = lob.loc[lob.buysell == 'B', ['price', 'volume']].sort_values(
        by='price', ascending=False)
    lob_sell = lob.loc[lob.buysell == 'S', ['price', 'volume']].sort_values(
        by='price', ascending=True)
    # cumulative cost of walking the book away from the mid price
    tr_costs_sell = copy.copy(lob_buy)
    tr_costs_buy = copy.copy(lob_sell)
    tr_costs_sell['costs'] = np.cumsum(
        np.abs(tr_costs_sell.price - price) * tr_costs_sell.volume)
    tr_costs_sell.volume = np.cumsum(tr_costs_sell.volume)
    tr_costs_buy['costs'] = np.cumsum(
        np.abs(tr_costs_buy.price - price) * tr_costs_buy.volume)
    tr_costs_buy.volume = -np.cumsum(tr_costs_buy.volume)
    tr_costs = pd.concat(
        (tr_costs_buy, tr_costs_sell)).sort_values(by='volume').reset_index(drop=True)
    tr_costs = tr_costs[['volume', 'costs']]
    if return_data:
        return tr_costs
    try:
        DIV = DIVIDER[key[1]]
    except KeyError:
        DIV = 1
    V = np.array(tr_costs.volume) / DIV
    costs = np.array(tr_costs.costs)
    y = costs
    # fit a quartic polynomial (no intercept) of cost against volume
    X = np.stack([V, V**2, V**3, V**4], axis=1)
    model = linear_model.OLS(y, X)
    results = model.fit()
    params = np.array([results.params])
    covparams = results.cov_params()
    column_names = self._transactioncoststable.get_column_names()
    key_names = self._transactioncoststable.get_key()
    row = pd.DataFrame(columns=column_names)
    row[key_names[0]] = [key[0]]
    row[key_names[1]] = [key[1]]
    row[key_names[2]] = [key[2]]
    row['params'] = [params.tolist()]
    row['cov_params'] = [covparams.tolist()]
    transactioncosts = TransactionCosts()
    transactioncosts.set_key(key)
    transactioncosts.set_data(row)
    return transactioncosts
def get_vif(exog):
    """Variance inflation factor for each column of the exog DataFrame."""
    vif = {}
    for col in exog.columns:
        endog = exog.loc[:, col]
        exog_coef = exog.drop([col], axis=1)
        exog_coef = sm.add_constant(exog_coef)
        model = smlr.OLS(endog, exog_coef).fit()
        vif[col] = 1 / (1 - model.rsquared)
    return vif
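# A minimal usage sketch for get_vif; the aliases below are an assumption
# about what "sm" and "smlr" refer to in the original module:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.regression.linear_model as smlr

rng = np.random.default_rng(2)
df_demo = pd.DataFrame(rng.normal(size=(150, 2)), columns=["x1", "x2"])
df_demo["x3"] = 0.9 * df_demo["x1"] + 0.1 * rng.normal(size=150)
print(get_vif(df_demo))  # x1 and x3 should show clearly inflated VIFs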
def regression_affine(X, Y, details=True):
    Xreg = add_constant(X)  # add a constant to fit an affine model
    model = linear_model.OLS(Y, Xreg)  # linear regression
    results = model.fit()
    [b, a] = results.params  # parameters of the affine curve
    Yreg = a * X + b  # regression curve
    return Yreg, a, b, results.summary()
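# A minimal usage sketch for regression_affine (synthetic data); the
# imports are an assumption about the add_constant / linear_model names
# used in the function body:
import numpy as np
from statsmodels.regression import linear_model
from statsmodels.tools.tools import add_constant

x_line = np.linspace(0, 1, 50)
y_line = 2.0 * x_line + 0.5 + 0.05 * np.random.randn(50)
y_fit, slope, intercept, summary = regression_affine(x_line, y_line)
print(slope, intercept)  # should be close to 2.0 and 0.5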
def calculate_vif(data):
    vif_df = pd.DataFrame(columns=['Var', 'Vif'])
    x_var_names = data.columns
    for i in range(0, x_var_names.shape[0]):
        y = data[x_var_names[i]]
        x = data[x_var_names.drop([x_var_names[i]])]
        r_squared = sm.OLS(y, x).fit().rsquared
        vif = round(1 / (1 - r_squared), 2)
        vif_df.loc[i] = [x_var_names[i], vif]
    return vif_df.sort_values(by='Vif', axis=0, ascending=False, inplace=False)
def stepwise_selection(X, y, initial_list=[], threshold_in=0.01,
                       threshold_out=0.05, verbose=True):
    """ Perform a forward-backward feature selection based on p-values """
    included = list(initial_list)
    while True:
        changed = False
        # forward step
        excluded = list(set(X.columns) - set(included))
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            model = sm.OLS(
                y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed = True
            if verbose:
                print('Add {:30} with p-value {:.6}'.format(
                    best_feature, best_pval))
        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max()  # null if pvalues is empty
        if worst_pval > threshold_out:
            changed = True
            # idxmax returns the column label; argmax would return a position
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(
                    worst_feature, worst_pval))
        if not changed:
            break
    return included
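# A minimal usage sketch for stepwise_selection (synthetic data; assumes
# "sm" is statsmodels.api, matching the sm.OLS / sm.add_constant calls):
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(3)
X_demo = pd.DataFrame(rng.normal(size=(200, 5)),
                      columns=['f0', 'f1', 'f2', 'f3', 'f4'])
y_demo = 3 * X_demo['f1'] - 2 * X_demo['f3'] + rng.normal(size=200)
print(stepwise_selection(X_demo, y_demo))  # expected to keep roughly ['f1', 'f3']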
def backwardElimination(X, sl):
    # note: this relies on a module-level y (the target vector)
    epochs = len(X[0])
    for i in range(0, epochs):
        regressor_OLS = sm.OLS(y, X).fit()
        varWithMaxPValue = max(regressor_OLS.pvalues).astype(float)
        if varWithMaxPValue > sl:
            # was range(0, epochs - 1), which over-indexes pvalues once
            # columns have been deleted
            for j in range(0, epochs - i):
                if regressor_OLS.pvalues[j].astype(float) == varWithMaxPValue:
                    X = np.delete(X, j, 1)
    regressor_OLS.summary()
    return X
def backwardElimination(x, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    x = np.delete(x, j, 1)
    regressor_OLS.summary()
    return x
def CAPM(ticker, benchmark, start='2016-09-27'):
    import statsmodels.regression.linear_model as lm
    import statsmodels.tools.tools as ct
    df = make_portfolio([ticker, benchmark], start=start)
    ticker_df = df[ticker].pct_change()[1:]    # asset returns
    bench_df = df[benchmark].pct_change()[1:]  # market returns
    const = ct.add_constant(bench_df)
    capm = lm.OLS(ticker_df, const)
    results = capm.fit()
    return results.summary()
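# A minimal usage sketch for CAPM. The tickers are placeholders and
# make_portfolio is not defined in this file, so the call is left
# commented out; the coefficient on the benchmark column in the returned
# summary is the CAPM beta:
#
#   print(CAPM('AAPL', 'SPY', start='2018-01-01'))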
def backwardElimination(x, sl):
    numVars = len(x[0])
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        if maxVar > sl:
            for j in range(0, numVars - i):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    x = np.delete(x, j, 1)
    mdl = regressor_OLS.get_robustcov_results(cov_type='HAC', maxlags=1)
    print(mdl.summary())
    return x
def get_regression(x, y) -> RegressionResults:
    y = y.values
    x = x.values
    x = add_constant(x)
    # initialize the model
    model = linear_model.OLS(y, x, missing='drop')
    # fit the regression
    fit_result = model.fit()
    return fit_result
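# A minimal usage sketch for get_regression (synthetic data). The imports
# below are an assumption about where add_constant, linear_model, and
# RegressionResults come from in the original module:
import numpy as np
import pandas as pd
from statsmodels.regression import linear_model
from statsmodels.regression.linear_model import RegressionResults
from statsmodels.tools.tools import add_constant

s_x = pd.Series(np.arange(20, dtype=float))
s_y = 1.5 * s_x + pd.Series(np.random.randn(20))
res = get_regression(s_x, s_y)
print(res.params)  # [intercept, slope]; slope should be near 1.5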
def backwardelimination(x, sl):
    numvar = len(x[0])
    for i in range(0, numvar):
        regressor_ols = lm.OLS(y, x).fit()
        maxpval = max(regressor_ols.pvalues).astype(float)
        if maxpval > sl:
            for j in range(0, numvar - i):
                if regressor_ols.pvalues[j].astype(float) == maxpval:
                    x = np.delete(x, j, 1)
    regressor_ols.summary()
    return x
def backwardElimination(x, SL):
    numVars = len(x[0])
    temp = np.zeros(x.shape).astype(int)  # was hardcoded to (50, 6); sized to x instead
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(y, x.tolist()).fit()
        maxVar = max(regressor_OLS.pvalues).astype(float)
        adjR_before = regressor_OLS.rsquared_adj.astype(float)
        if maxVar > SL:
            for j in range(0, numVars - i):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    temp[:, j] = x[:, j]  # keep a copy so the deletion can be rolled back
                    x = np.delete(x, j, 1)
                    tmp_regressor = sm.OLS(y, x.tolist()).fit()
                    adjR_after = tmp_regressor.rsquared_adj.astype(float)
                    if adjR_before >= adjR_after:
                        # adjusted R-squared got worse: restore the column and stop
                        x_rollback = np.hstack((x, temp[:, [0, j]]))
                        x_rollback = np.delete(x_rollback, j, 1)
                        print(regressor_OLS.summary())
                        return x_rollback
                    else:
                        continue
    regressor_OLS.summary()
    return x
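# A minimal usage sketch for the rollback variant (synthetic data). The
# function reads y from the enclosing scope and assumes "sm" provides OLS,
# so both are set up first (statsmodels.api is an assumed alias):
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(4)
X_demo = np.column_stack([np.ones(50), rng.normal(size=(50, 5))])
y = X_demo @ np.array([1.0, 0.0, 2.0, 0.0, -1.0, 0.0]) + rng.normal(size=50)
X_kept = backwardElimination(X_demo, 0.05)
print(X_kept.shape)  # columns kept after elimination with rollback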
def cohort_one_regression(configs):
    group_idxs = []
    for idx, fpath in enumerate(
            sorted(glob.glob(os.path.join(configs["metagroup_path"], "*.csv")))):
        groupname = fpath.split("/")[-1][:-4]
        group_idxs.append((idx, groupname))
    x = []
    y = []
    for fpath in sorted(
            glob.glob(os.path.join(configs["metagroup_path"], "*.csv"))):
        groupname = fpath.split("/")[-1][:-4]
        print("Group: {}".format(groupname))
        with open(fpath, 'r') as fp:
            csv_fp = csv.reader(fp, delimiter=',')
            next(csv_fp)
            for line in csv_fp:
                if line[1] == "":
                    continue
                x.append([(1 if group_idxs[idx][1] == groupname else 0)
                          for idx in range(len(group_idxs))])
                y.extend([float(line[1])])
    # append a zero row as a reference observation
    y.append(0.9)
    y = np.array(y)
    x.append(np.zeros(len(group_idxs)))
    x = np.array(x)
    print("Size of design matrix: {}".format(x.shape))
    lm = statslm.OLS(y, statstools.add_constant(x), missing='raise',
                     hasconst=None)
    results = lm.fit()
    print("Intercept, Coefficient: {}, p-value: {}".format(
        results.params[0], results.pvalues[0]))
    for idx in range(len(group_idxs)):
        print("Group: {}, Coefficient: {}, p-value: {}".format(
            group_idxs[idx][1], results.params[idx + 1],
            results.pvalues[idx + 1]))
def fit(self, x, y, cens, verbose=False):
    """
    Fit a maximum-likelihood Tobit regression

    :param x: Pandas DataFrame (n_samples, n_features): Data
    :param y: Pandas Series (n_samples,): Target
    :param cens: Pandas Series (n_samples,): -1 indicates left-censored
        samples, 0 for uncensored, 1 for right-censored
    :param verbose: boolean, show info from minimization
    :return:
    """
    x_copy = x.copy()
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        init_reg = sm.OLS(y, x_copy).fit()
    # OLS fit provides the starting values for the likelihood optimization
    b0 = init_reg.params.values
    y_pred = init_reg.predict(x_copy)
    resid = y - y_pred
    resid_var = np.var(resid)
    s0 = np.sqrt(resid_var)
    params0 = np.append(b0, s0)
    xs, ys = split_left_right_censored(x_copy, y, cens)
    result = minimize(
        lambda params: tobit_neg_log_likelihood(xs, ys, params),
        params0,
        method='BFGS',
        jac=lambda params: tobit_neg_log_likelihood_der(xs, ys, params),
        options={'disp': verbose})
    if verbose:
        print(result)
    self.ols_coef_ = b0[1:]
    self.ols_intercept = b0[0]
    if self.fit_intercept:
        self.intercept_ = result.x[1]
        self.coef_ = result.x[1:-1]
    else:
        self.coef_ = result.x[:-1]
        self.intercept_ = 0
    self.sigma_ = result.x[-1]
    self.result = result
    return self
def backwardElimination(x, Y, sl, columns):
    numVars = len(columns.columns)
    numVars2 = len(columns.columns)
    print("\nStarting backward elimination...")
    for i in range(0, numVars):
        regressor_OLS = sm.OLS(endog=Y.astype(float),
                               exog=columns.astype(float)).fit()
        maxVar = float(max(regressor_OLS.pvalues))
        print("Highest P value is: " + str(maxVar))
        if maxVar > sl:
            for j in range(0, numVars2):
                if regressor_OLS.pvalues[j].astype(float) == maxVar:
                    print("Removing column " + str(j))
                    columns = columns.drop(columns.columns[j], axis=1)
                    numVars2 -= 1
    print(regressor_OLS.summary())
    return columns
def polynomial_regression():
    ''' Define the model directly through the design matrix.
    Similar to MATLAB's "regress" command '''

    # Generate the data
    t = np.arange(0, 10, 0.1)
    y = 4 + 3 * t + 2 * t**2 + 5 * np.random.randn(len(t))

    # Make the fit. Note that this is another "OLS" than the one in "model_formulas"!
    M = np.column_stack((np.ones(len(t)), t, t**2))
    res = sm.OLS(y, M).fit()

    # Display the results
    print('Summary:')
    print(res.summary())
    print('The fit parameters are: {0}'.format(str(res.params)))
    print('The confidence intervals are:')
    print(res.conf_int())
def smart_regression(df, ignore_cols):
    """Recursively drop the regressor with the largest VIF (> 10), then fit OLS."""
    endog = df.iloc[:, 0]
    exog = df.iloc[:, 1:]
    exog = exog.drop(ignore_cols, axis=1)
    vifs = get_vif(exog)
    large_vifs = {k: vifs[k] for k in vifs.keys() if vifs[k] > 10}
    coef = {}
    if len(large_vifs) != 0:
        max_vif = max(large_vifs, key=large_vifs.get)
        ignore_cols.append(max_vif)
        coef = smart_regression(df, ignore_cols)
    else:
        exog = sm.add_constant(exog)
        model = smlr.OLS(endog, exog).fit()
        coef = model.params
    return coef
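# A minimal usage sketch for smart_regression, which pairs with the
# get_vif helper above (synthetic data; the first DataFrame column is the
# target, and the sm / smlr aliases are the same assumption as before):
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.regression.linear_model as smlr

rng = np.random.default_rng(5)
frame = pd.DataFrame(rng.normal(size=(300, 3)), columns=["x1", "x2", "x3"])
frame["x4"] = frame["x1"] + 0.01 * rng.normal(size=300)  # collinear with x1
frame.insert(0, "target", 2 * frame["x2"] + rng.normal(size=300))
print(smart_regression(frame, ignore_cols=[]))  # the collinear column is dropped first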
def apply_OLS(self):
    y_fit = np.log(self.data[self.column_y])
    if not hasattr(self, 'X'):
        self.create_X_variables()
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        model = sm.OLS(y_fit, self.X.astype(float)).fit()
        predictions = model.predict(self.X.astype(float))
    print_model = model.summary()
    prstd, iv_l, iv_u = wls_prediction_std(model)

    # store the OLS results
    self.fit_ols_predictions = predictions
    self.fit_ols_stats = print_model
    self.fit_ols_iv_l = iv_l
    self.fit_ols_iv_u = iv_u
    # was a second assignment to fit_ols_stats, which overwrote the summary
    self.fit_ols_prstd = prstd
    return model
def iteration(e0, depth):
    # note: y_star aliases y_data, so the bootstrap writes into y_data in place
    y_star = y_data
    Wald_iter_df = pd.DataFrame(index=range(depth), columns=y_star.columns)
    for d in range(depth):
        e = e0.sample(frac=1).reset_index(drop=True)  # resample the residuals
        for i in range(34):
            ei = e.iloc[:, i]
            y_star_lag = pd.Series(index=y_star.index, dtype=float)
            y_star_lag[1:] = y_star.iloc[:-1, i]
            # generate y-star from the bootstrap residuals and the AR(1) fit
            y_star.iloc[1:, i] = (ei + Params_df.loc['const', i] +
                                  Params_df.loc[0, i] * y_star_lag[1:])
            x_lag1 = pd.Series(index=y_star.index, dtype=float)
            y_lag1 = pd.Series(index=y_star.index, dtype=float)
            x_lag1[1:] = X_data.iloc[:-1, i]
            y_lag1[1:] = y_star.iloc[:-1, i]
            y_reg = y_star.iloc[:, i]
            X_exo = pd.concat([y_lag1, x_lag1], axis=1)
            X_exo = lm.add_constant(X_exo)
            model = lm.OLS(y_reg, X_exo, missing='drop')
            res = model.fit()
            R = np.eye(len(res.params))[2]  # restriction: coefficient on x_lag1 is zero
            try:
                wald_i = res.wald_test(R).fvalue[0][0]
            except ValueError:
                Wald_iter_df.iloc[d, i] = np.nan
                print(d, i, "Appear")
            else:
                Wald_iter_df.iloc[d, i] = wald_i
    return Wald_iter_df
def polynomial_regression():
    ''' Define the model directly through the design matrix.
    Similar to MATLAB's "regress" command '''

    # Generate the data
    # To get reproducible values, I provide a seed value
    np.random.seed(987654321)
    t = np.arange(0, 10, 0.1)
    y = 4 + 3 * t + 2 * t**2 + 5 * np.random.randn(len(t))

    # Make the fit. Note that this is another "OLS" than the one in "model_formulas"!
    M = np.column_stack((np.ones(len(t)), t, t**2))
    res = sm.OLS(y, M).fit()

    # Display the results
    print('Summary:')
    print(res.summary())
    print('The fit parameters are: {0}'.format(str(res.params)))
    print('The confidence intervals are:')
    print(res.conf_int())

    return res.params  # should be [ 4.74244177, 2.60675788, 2.03793634]