from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant


def create_linear_model(X_train, X_test, Y_train, Y_test):
    '''
    TODO...
    - Predict the wine quality using the test set and compare the accuracy
      to the actual quality. Comment.
    - Print the parameter estimates and their 95% confidence intervals in a
      single table. (The assignment suggests R's confint() and cbind(); the
      statsmodels analogue is conf_int().)
    '''
    X_train = add_constant(X_train)
    regressionResult = OLS(Y_train, X_train).fit()
    print(regressionResult.summary())

    # Print various attributes of the OLS fitted model
    # print("R Squared: {}".format(regressionResult.rsquared))
    # print("SSE: {}".format(regressionResult.ess))
    # print("SSR: {}".format(regressionResult.ssr))
    # print("Residual MSE: {}".format(regressionResult.mse_resid))
    # print("Total MSE: {}".format(regressionResult.mse_total))
    # print("Model MSE: {}".format(regressionResult.mse_model))
    # print("F-Value: {}".format(regressionResult.mse_model/regressionResult.mse_resid))
    # print("NOBS: {}".format(regressionResult.nobs))
    # print("Centered TSS: {}".format(regressionResult.centered_tss))
    # print("Uncentered TSS: {}".format(regressionResult.uncentered_tss))
    # print("DF Model: {}".format(regressionResult.df_model))
    # print("DF Resid: {}".format(regressionResult.df_resid))
    # print("Standard Errors: {}".format(regressionResult.bse))
    print("Confidence: {}".format(regressionResult.conf_int()))

    predictions = regressionResult.predict(X_train)
    nobs, p = X_train.shape
    # extractAIC is a helper assumed to be defined elsewhere in this module.
    eaic = extractAIC(nobs, p, Y_train, predictions)
    print("Extract AIC: {}".format(eaic))

    params = regressionResult.params

    # n, p = X_test.shape
    # X_test = add_constant(X_test)
    # predictions = X_test.dot(params).reshape(n, 1)
    # num_matches = 0
    # for i in range(len(Y_test)):
    #     p = int(round(predictions[i][0], 0))
    #     is_match = (Y_test[i] == p)
    #     if is_match:
    #         num_matches += 1
    #     print("Actual: {}, Predictions: {}... Match: {}".format(Y_test[i], p, is_match))
    # print("Number of matches: {}, Total number of Instances: {}".format(num_matches, n))
    # print("Percent correct guesses: {}%".format(round((num_matches / n) * 100, 3)))

    return params

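# Hedged usage sketch for create_linear_model, assuming the semicolon-separated
# UCI red-wine CSV with a 'quality' column and assuming the extractAIC helper
# referenced above is defined. The file name and columns are illustrative
# assumptions, not from the original.
import pandas as pd
from sklearn.model_selection import train_test_split

wine = pd.read_csv("winequality-red.csv", sep=";")  # assumed dataset
X = wine.drop(columns="quality").values
Y = wine["quality"].values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=0)
params = create_linear_model(X_train, X_test, Y_train, Y_test)
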
import numpy as np


def local_fdr(zscores, null_proportion=1.0, null_pdf=None, deg=7, nbins=30):
    """
    Calculate local FDR values for a list of Z-scores.

    Parameters
    ----------
    zscores : array-like
        A vector of Z-scores
    null_proportion : float
        The assumed proportion of true null hypotheses
    null_pdf : function mapping reals to positive reals
        The density of null Z-scores; if None, use standard normal
    deg : integer
        The maximum exponent in the polynomial expansion of the
        density of non-null Z-scores
    nbins : integer
        The number of bins for estimating the marginal density of
        Z-scores.

    Returns
    -------
    fdr : array-like
        A vector of FDR values

    References
    ----------
    B Efron (2008). Microarrays, Empirical Bayes, and the Two-Groups
    Model. Statistical Science 23:1, 1-22.

    Examples
    --------
    Basic use (the null Z-scores are taken to be standard normal):

    >>> from statsmodels.stats.multitest import local_fdr
    >>> import numpy as np
    >>> zscores = np.random.randn(30)
    >>> fdr = local_fdr(zscores)

    Use a Gaussian null distribution estimated from the data:

    >>> null = EmpiricalNull(zscores)
    >>> fdr = local_fdr(zscores, null_pdf=null.pdf)
    """

    from statsmodels.genmod.generalized_linear_model import GLM
    from statsmodels.genmod.generalized_linear_model import families
    from statsmodels.regression.linear_model import OLS

    # Bins for Poisson modeling of the marginal Z-score density
    minz = min(zscores)
    maxz = max(zscores)
    bins = np.linspace(minz, maxz, nbins)

    # Bin counts
    zhist = np.histogram(zscores, bins)[0]

    # Bin centers
    zbins = (bins[:-1] + bins[1:]) / 2

    # The design matrix at bin centers
    dmat = np.vander(zbins, deg + 1)

    # Use this to get starting values for Poisson regression
    md = OLS(np.log(1 + zhist), dmat).fit()

    # Poisson regression
    md = GLM(zhist, dmat, family=families.Poisson()).fit(start_params=md.params)

    # The design matrix for all Z-scores
    dmat_full = np.vander(zscores, deg + 1)

    # The height of the estimated marginal density of Z-scores,
    # evaluated at every observed Z-score.
    fz = md.predict(dmat_full) / (len(zscores) * (bins[1] - bins[0]))

    # The null density.
    if null_pdf is None:
        f0 = np.exp(-0.5 * zscores**2) / np.sqrt(2 * np.pi)
    else:
        f0 = null_pdf(zscores)

    # The local FDR values
    fdr = null_proportion * f0 / fz

    fdr = np.clip(fdr, 0, 1)

    return fdr

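# Hedged demo of local_fdr on a synthetic two-groups mixture: mostly standard
# normal nulls plus a small block of shifted non-nulls. The sample sizes and
# the shift are assumptions chosen for illustration.
rng = np.random.default_rng(0)
z = np.concatenate([rng.normal(0.0, 1.0, size=950),   # true nulls
                    rng.normal(3.0, 1.0, size=50)])   # shifted non-nulls
fdr = local_fdr(z, null_proportion=0.95)
# The shifted block should receive markedly smaller local FDR values.
print("mean fdr (nulls):    ", fdr[:950].mean())
print("mean fdr (non-nulls):", fdr[950:].mean())
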
# Method from a forecasting class; assumes pandas as pd, numpy as np,
# plotly.express as px, datetime.timedelta and statsmodels' OLS are imported
# at module level. Column glossary: "fecha" = date, "fallecimientos" = deaths,
# "Predicciones" = predictions.
def fit_ols(self):
    self.data_lag.loc[self.data_lag.fecha <= "2020-04-04", "days"] = 30
    ts_ols = OLS(
        self.data_lag.iloc[:-1, ].fallecimientos,
        self.data_lag.iloc[:-1, ].drop(["fecha", "fallecimientos"],
                                       axis=1)).fit()
    summary = ts_ols.summary()
    predictions = pd.DataFrame(
        ts_ols.predict(self.forecast.drop("fecha", axis=1)))
    # "Modelo" = model, "Predicción de hoy" = today's prediction,
    # "Error de hoy" = today's error.
    e = pd.DataFrame({
        "Modelo": "OLS",
        "Predicción de hoy": [predictions.iloc[0, 0]],
        "Error de hoy": [
            abs(predictions.iloc[0, 0] -
                self.dt.loc[len(self.dt) - 1, "fallecimientos"])
        ]
    })
    predictions["fecha"] = self.dt.loc[len(self.dt) - 1, "fecha"]
    predictions.columns = ["fallecimientos", "fecha"]
    predictions.reset_index(drop=True, inplace=True)
    # Shift each forecast row i days past the last observed date.
    for i in range(len(self.forecast)):
        predictions.loc[i, "fecha"] = predictions.fecha[i] + timedelta(days=i)
    new = pd.concat(
        (self.dt[["fallecimientos", "fecha"]], predictions.iloc[1:, :]),
        axis=0)
    new["Predicciones"] = np.where(
        new.fecha <= self.dt.loc[len(self.dt) - 1, "fecha"], "Real", "Pred")
    fig = px.bar(
        new,
        x="fecha",
        y="fallecimientos",
        color="Predicciones",
    )
    # predictions.columns = ["Predicciones_Fallecimientos", "fecha"]
    #
    # load = str(self.dt.loc[len(self.dt) - 1, "fecha"] - timedelta(days=1))
    # load = load[0:10] + ".pkl"
    #
    # with open(load, "rb") as file:
    #     historic = pickle.load(file)
    # predictions["Error"] = 0
    # p = pd.concat([predictions.reset_index(drop=True), historic],
    #               ignore_index=True)
    # p = p.loc[p.fecha <= self.dt.loc[len(self.dt) - 1, "fecha"], :]
    # p.reset_index(drop=True, inplace=True)
    # for i in range(0, len(p)):
    #     if self.dt.loc[len(self.dt) - 1, "fecha"] == p.loc[i, "fecha"]:
    #         p.loc[i, "Error"] = np.sqrt(
    #             (self.dt.loc[len(self.dt) - 1, "fallecimientos"] -
    #              p.loc[i, "Predicciones_Fallecimientos"]) ** 2)
    #
    # save = str(self.dt.loc[len(self.dt) - 1, "fecha"])
    # save = save[0:10] + ".pkl"
    #
    # with open(save, "wb") as file:
    #     pickle.dump(p, file)
    return e, fig, summary

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.regression.linear_model import OLS


def ols_sm(X_train, y_train, X_test, y_test):
    # adds col of ones for intercept coefficient in OLS model
    X_train = sm.add_constant(X_train)
    ols = OLS(y_train, X_train).fit()
    # with open('ols_model_summary.csv', 'w') as f:
    #     f.write(ols.summary().as_csv())
    with open('ols_model_summary.txt', 'w') as f:
        f.write(ols.summary().as_text())

    # Plot True vs Predicted values to examine if linear model is a good fit
    fig = plt.figure(figsize=(12, 8))
    X_test = sm.add_constant(X_test)
    plt.scatter(y_test, ols.predict(X_test))
    plt.xlabel('True values')
    plt.ylabel('Predicted values')
    plt.title('True vs Predicted values')
    plt.show()
    plt.close()
    # Add quadratic term to X or take log of y to improve the fit

    # Discern if a linear relationship exists with partial regression plots
    fig = plt.figure(figsize=(12, 8))
    fig = sm.graphics.plot_partregress_grid(ols, fig=fig)
    plt.title('Partial Regression Plots')
    plt.show()
    plt.close()

    # Identify outliers and high leverage points.
    # a. Identify outliers (typically, those data points with studentized
    #    residuals outside of +/- 3 stdev). Temporarily remove these from your
    #    data set and re-run your model. Do your model metrics improve
    #    considerably? Does this give you cause for more confidence in your
    #    model?
    # b. Identify those outliers that are also high-leverage points
    #    (high residual and high leverage --> high influence).
    fig, ax = plt.subplots(figsize=(12, 8))
    fig = sm.graphics.influence_plot(ols, ax=ax, criterion="cooks")
    plt.show()
    fig, ax = plt.subplots(figsize=(8, 6))
    fig = sm.graphics.plot_leverage_resid2(ols, ax=ax)
    plt.show()
    plt.close()

    # Confirm homoscedasticity (i.e., constant variance of residual terms).
    # If residuals exhibit a "funnel shaped" effect, consider transforming
    # your data into logarithmic space.
    # np.asarray keeps this working whether outlier_test() returns an
    # ndarray or a DataFrame.
    studentized_residuals = np.asarray(ols.outlier_test())[:, 0]
    y_pred = ols.fittedvalues
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(y_pred, studentized_residuals)
    ax.axhline(y=0.0, color='k', ls='--')
    ax.set_xlabel('Predicted y')
    ax.set_ylabel('Studentized Residuals')
    plt.show()
    plt.close()

    # Test if residuals are normally distributed with a QQ plot:
    # plots quantiles of the normal distribution against studentized
    # residuals; if sample quantiles are normally distributed, the dots
    # will align with the 45 deg line.
    fig, ax = plt.subplots()
    sm.graphics.qqplot(studentized_residuals, fit=True, line='45', ax=ax)
    plt.show()
    plt.close()

    # Find influential points in the data.
    # DFBETAS - standardized measure of how much each coefficient changes
    # when that observation is left out.
    threshold = 2. / len(X_train) ** .5
    infl = ols.get_influence()
    df = pd.DataFrame(infl.summary_frame().filter(regex="dfb"))
    inf = df[df > threshold].dropna(axis=0, how='all')
    print('Influential points:\n', inf)

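# Hedged usage sketch: run the diagnostics above on a synthetic train/test
# split (the shapes, coefficients and noise level are assumptions chosen for
# illustration).
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X_all = rng.normal(size=(200, 3))
y_all = X_all @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.5, size=200)
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all,
                                                    random_state=0)
ols_sm(X_train, y_train, X_test, y_test)
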
# Fragment of a functional-form test script (converted from Python 2 prints).
# Assumes nobs, sig_e, exog and beta were defined by earlier setup, and that
# np, OLS and smke are imported as in the fuller script below.
y_true = np.dot(exog, beta)
y = y_true + sig_e * np.random.normal(size=nobs)
endog = y

print("DGP")
print("nobs=%d, beta=%r, sig_e=%3.1f" % (nobs, beta, sig_e))

mod_ols = OLS(endog, exog[:, :2])
res_ols = mod_ols.fit()

# 'cv_ls'  [1000, 0.5]  [0.01, 0.45]
tst = smke.TestFForm(
    endog,
    exog[:, :2],
    bw=[0.01, 0.45],
    var_type="cc",
    fform=lambda x, p: mod_ols.predict(p, x),
    estimator=lambda y, x: OLS(y, x).fit().params,
    nboot=1000,
)
print("bw", tst.bw)
print("tst.test_stat", tst.test_stat)
print(tst.sig)
print("tst.boots_results mean, min, max", (
    tst.boots_results.mean(),
    tst.boots_results.min(),
    tst.boots_results.max(),
))
print("lower tail bootstrap p-value", (tst.boots_results < tst.test_stat).mean())
print("upper tail bootstrap p-value", (tst.boots_results >= tst.test_stat).mean())

from scipy import stats

import numpy as np
import pandas as pd
from statsmodels.regression.linear_model import OLS

np.set_printoptions(suppress=True)

data = pd.read_csv('Dataset/dataset.csv')
X = data["Head Size(cm^3)"].values
y = data["Brain Weight(grams)"].values
X = np.array(X, dtype='float64')
y = np.array(y, dtype='float64')
y = np.reshape(y, (len(y), 1))
X = np.column_stack([np.ones(len(X)), X])

# Fit with the statsmodels OLS implementation
res = OLS(y, X).fit()

# Theta values
theta = res.params
print(theta)

# prediction
ols_pred = res.predict()
print(res.summary())

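# Sanity check (an addition, not in the original script): the fitted
# coefficients should match numpy's closed-form least-squares solution.
theta_lstsq, *_ = np.linalg.lstsq(X, y, rcond=None)
print(np.allclose(theta, theta_lstsq.ravel()))  # expected: True
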
# Python 3 version of the same functional-form test script, with the
# data-generating setup included. nobs, sig_e and x are assumed values,
# chosen here so the fragment runs standalone.
import numpy as np
import statsmodels.nonparametric.kernel_regression as smke
from statsmodels.regression.linear_model import OLS

nobs = 200   # assumed sample size
sig_e = 0.5  # assumed noise scale
x = np.sort(np.random.uniform(-2, 2, size=(nobs, 1)), axis=0)  # assumed regressor

order = 3
exog = x ** np.arange(order + 1)
beta = np.array([1, 1, 0.1, 0.0])[:order + 1]  # 1. / np.arange(1, order + 2)
y_true = np.dot(exog, beta)
y = y_true + sig_e * np.random.normal(size=nobs)
endog = y

print('DGP')
print('nobs=%d, beta=%r, sig_e=%3.1f' % (nobs, beta, sig_e))

mod_ols = OLS(endog, exog[:, :2])
res_ols = mod_ols.fit()

# 'cv_ls'  [1000, 0.5]  [0.01, 0.45]
tst = smke.TestFForm(endog, exog[:, :2], bw=[0.01, 0.45], var_type='cc',
                     fform=lambda x, p: mod_ols.predict(p, x),
                     estimator=lambda y, x: OLS(y, x).fit().params,
                     nboot=1000)
print('bw', tst.bw)
print('tst.test_stat', tst.test_stat)
print(tst.sig)
print('tst.boots_results mean, min, max', (tst.boots_results.mean(),
                                           tst.boots_results.min(),
                                           tst.boots_results.max()))
print('lower tail bootstrap p-value', (tst.boots_results < tst.test_stat).mean())
print('upper tail bootstrap p-value', (tst.boots_results >= tst.test_stat).mean())

from scipy import stats

print('asymp. normal p-value (2-sided)', stats.norm.sf(np.abs(tst.test_stat)) * 2)
print('asymp. normal p-value (upper)', stats.norm.sf(tst.test_stat))

# Imports assumed from the surrounding test module, based on the names used
# below (scikit-bio utilities plus the package under test's ols wrapper).
import os
import shutil
import unittest

import numpy as np
import pandas as pd
import pandas.util.testing as pdt
from skbio import TreeNode
from skbio.stats.composition import ilr_inv
from skbio.util import get_data_path
from statsmodels.regression.linear_model import OLS

# ols and balance_basis are assumed to come from the package under test,
# e.g. gneiss:
# from gneiss.regression import ols
# from gneiss.balances import balance_basis


class TestOLS(unittest.TestCase):
    """
    Tests OLS regression with refactored matrix multiplication.
    """

    def setUp(self):
        np.random.seed(0)
        b01, b11, b21 = 1, 2, -3
        b02, b12, b22 = 2, -1, 4
        n = 50
        x1 = np.linspace(0, 10, n)
        x2 = np.linspace(10, 15, n)
        e = np.random.normal(size=n) * 10
        y1 = b01 + b11 * x1 + b21 * x2 + e
        e = np.random.normal(size=n) * 10
        y2 = b02 + b12 * x1 + b22 * x2 + e
        Y = pd.DataFrame(np.vstack((y1, y2)).T, columns=['y1', 'y2'])
        B = pd.DataFrame([[b01, b11, b21], [b02, b12, b22]])
        X = pd.DataFrame(np.vstack((np.ones(n), x1, x2)).T,
                         columns=['Intercept', 'x1', 'x2'])
        self.Y = Y
        self.B = B
        self.X = X
        self.r1_ = OLS(endog=y1, exog=X).fit()
        self.r2_ = OLS(endog=y2, exog=X).fit()
        self.tree = TreeNode.read(['(c, (b,a)y2)y1;'])
        self.results = "results"
        if not os.path.exists(self.results):
            os.mkdir(self.results)

    def tearDown(self):
        shutil.rmtree(self.results)

    def test_ols_immutable(self):
        # test to see if values in table get filtered out
        # and that the original table doesn't change
        table = self.Y
        exp_table = table.copy()
        x = pd.DataFrame(self.X.values, columns=self.X.columns,
                         index=range(100, 100 + len(self.X.index)))
        metadata = pd.concat((self.X, x))
        exp_metadata = metadata.copy()
        ols('x1 + x2', table, metadata)
        self.assertEqual(str(table), str(exp_table))
        self.assertEqual(str(metadata), str(exp_metadata))

    def test_ols_missing_metadata(self):
        # test to see if samples absent from the metadata get filtered out
        # and that the original table doesn't change
        table = self.Y
        y = pd.DataFrame(self.Y.values, columns=self.Y.columns,
                         index=range(100, 100 + len(self.Y.index)))
        table = pd.concat((self.Y, y))
        ids = np.arange(100, 100 + len(self.X.index))
        x = pd.DataFrame([[np.nan] * len(self.X.columns)] * len(ids),
                         columns=self.X.columns, index=ids)
        metadata = pd.concat((self.X, x))

        model = ols('x1 + x2', table, metadata)
        model.fit()

        # test prediction
        exp = pd.DataFrame({'y1': self.r1_.predict(),
                            'y2': self.r2_.predict()},
                           index=self.Y.index)
        res = model.predict()
        pdt.assert_frame_equal(res, exp)

    def test_ols_test(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()

        # test pvalues
        exp = pd.DataFrame({'y1': self.r1_.pvalues, 'y2': self.r2_.pvalues})
        pdt.assert_frame_equal(model.pvalues, exp)

        # test coefficients
        exp = pd.DataFrame({'y1': self.r1_.params, 'y2': self.r2_.params})
        res = model.coefficients()
        pdt.assert_frame_equal(res, exp)

        # test residuals
        exp = pd.DataFrame({'y1': self.r1_.resid, 'y2': self.r2_.resid},
                           index=self.Y.index)
        res = model.residuals()
        pdt.assert_frame_equal(res, exp)

        # test prediction
        exp = pd.DataFrame({'y1': self.r1_.predict(),
                            'y2': self.r2_.predict()},
                           index=self.Y.index)
        res = model.predict()
        pdt.assert_frame_equal(res, exp)

        # make a small prediction
        fx = pd.DataFrame([[1, 1, 1], [1, 1, 2]],
                          columns=['Intercept', 'x1', 'x2'],
                          index=['f1', 'f2'])
        rp1 = self.r1_.predict([[1, 1, 1], [1, 1, 2]])
        rp2 = self.r2_.predict([[1, 1, 1], [1, 1, 2]])
        exp = pd.DataFrame({'y1': rp1, 'y2': rp2}, index=['f1', 'f2'])
        res = model.predict(X=fx)
        pdt.assert_frame_equal(res, exp)

        # test r2
        self.assertAlmostEqual(model.r2, 0.21981627865598752)

    def test_ols_ilr_inv_test(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        basis, _ = balance_basis(self.tree)

        # test pvalues
        exp = pd.DataFrame({'y1': self.r1_.pvalues, 'y2': self.r2_.pvalues})
        pdt.assert_frame_equal(model.pvalues, exp)

        # test coefficients
        exp = pd.DataFrame({'y1': self.r1_.params, 'y2': self.r2_.params})
        exp = pd.DataFrame(ilr_inv(exp, basis),
                           columns=['c', 'b', 'a'],
                           index=self.X.columns)
        res = model.coefficients(tree=self.tree)
        pdt.assert_frame_equal(res, exp)

        # test residuals
        exp = pd.DataFrame({'y1': self.r1_.resid, 'y2': self.r2_.resid},
                           index=self.Y.index)
        exp = pd.DataFrame(ilr_inv(exp, basis),
                           index=self.Y.index,
                           columns=['c', 'b', 'a'])
        res = model.residuals(tree=self.tree)
        pdt.assert_frame_equal(res, exp)

        # test prediction
        exp = pd.DataFrame({'y1': self.r1_.predict(),
                            'y2': self.r2_.predict()},
                           index=self.Y.index)
        exp = pd.DataFrame(ilr_inv(exp, basis),
                           index=self.Y.index,
                           columns=['c', 'b', 'a'])
        res = model.predict(tree=self.tree)
        pdt.assert_frame_equal(res, exp)

    def test_tvalues(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        exp = pd.DataFrame({'y1': self.r1_.tvalues, 'y2': self.r2_.tvalues})
        pdt.assert_frame_equal(model.tvalues, exp)

    def test_mse(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        exp = pd.Series({'y1': self.r1_.mse_resid, 'y2': self.r2_.mse_resid})
        pdt.assert_series_equal(model.mse, exp)

    def test_ess(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        exp = pd.Series({'y1': self.r1_.ess, 'y2': self.r2_.ess})
        pdt.assert_series_equal(model.ess, exp)

    def test_loo(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        res = model.loo()
        exp = pd.read_csv(get_data_path('loo.csv'), index_col=0)
        pdt.assert_frame_equal(res, exp)

    def test_kfold(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        res = model.kfold(9)
        exp = pd.read_csv(get_data_path('kfold.csv'), index_col=0)
        pdt.assert_frame_equal(res, exp)

    def test_lovo(self):
        model = ols('x1 + x2', self.Y, self.X)
        model.fit()
        res = model.lovo()
        exp = pd.read_csv(get_data_path('lovo.csv'), index_col=0)
        pdt.assert_frame_equal(res, exp)

# Interactive exploration log: lag regressions in the style of a Granger
# causality check. dff_e and dff are assumed DataFrames from the surrounding
# session; bare expressions were inspected at the REPL.
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tools.tools import add_constant
from statsmodels.tsa.tsatools import lagmat2ds
from statsmodels.regression.linear_model import OLS, yule_walker

x = np.vstack([dff_e[3], dff[43]]).T
dta = lagmat2ds(x, 1, trim='both', dropex=1)
dtaown = add_constant(dta[:, 1:(1 + 1)], prepend=False)
dtajoint = add_constant(dta[:, 1:], prepend=False)
dtaother = add_constant(dta[:, 2], prepend=False)
dtaother.shape

res2down = OLS(dta[:, 0], dtaown).fit()
res2djoint = OLS(dta[:, 0], dtajoint).fit()
res2dother = OLS(dta[:, 0], dtaother).fit()
res2dother.ssr
res2dother.params
res2down.ssr

plt.plot(dta[:, 0]); plt.plot(res2dother.model.predict(res2dother.params, dtaother))
res2dother.params
dta[:3]
plt.plot(dta[:, 0]); plt.plot(res2dother.params[0] * dta[:, 2])
plt.plot(dta[:, 0]); plt.plot(5 * dta[:, 2])

# Hand comparison of sums of squared errors for a few candidate fits.
np.sum(np.square(dta[:, 0] - 5 * dta[:, 2]))
np.sum(np.square(dta[:, 0] - 5 * dta[:, 2] - res2dother.params[1]))
np.sum(np.square(dta[:, 0] - 1.568331 * dta[:, 2] - res2dother.params[1]))
plt.plot(dta[:, 0]); plt.plot(5 * dta[:, 2])
plt.plot(dta[:, 0]); plt.plot(dta[:, 2] ** 2)
plt.plot(dta[:, 0]); plt.plot(dta[:, 2] ** 2 * 10)
plt.plot(dta[:, 0]); plt.plot(dta[:, 2] ** 2 * 13)
np.sum(np.square(dta[:, 0] - dta[:, 2] ** 2 * 13 - res2dother.params[1]))
np.sum(np.square(dta[:, 0] - dta[:, 2] ** 2 * 13))
np.sum(np.square(dta[:, 0] - dta[:, 2] ** 2 * 13 - dta[:, 2]))
np.sum(np.square(dta[:, 0] - dta[:, 2] ** 2 * 13 - 1.56 * dta[:, 2]))

# Fragment from a model-selection loop: fit OLS for the current feature set,
# record AIC/BIC, and plot in-sample and forecast predictions. births, future,
# columns, original, upper, i, aics and bics are assumed to come from the
# enclosing loop; SS is assumed to alias sklearn's StandardScaler.
ss = SS()
X = ss.fit_transform(births[columns].values.reshape(-1, len(columns)))
X = add_constant(X)
future_X = add_constant(ss.transform(future[columns]))
# X = SS().fit_transform(births[columns].values.reshape(-1, len(columns)))
# model1 = LR().fit(births[columns].values.reshape(-1, len(columns)), births['num_births'])
model2 = OLS(births['num_births'], X).fit()
bics.append(model2.bic)
aics.append(model2.aic)
plt.plot(births.index, model2.predict(X), label=f'{i}:{model2.bic:.3f}')
plt.plot(future.index, model2.predict(future_X), marker='*', color='k',
         linestyle='-')
plt.legend()
plt.plot(original['num_births'], marker='*', linestyle='')
plt.show()

# After the loop: compare information criteria across model orders.
plt.plot(range(2, upper), aics)
plt.plot(range(2, upper), bics)
plt.axvline(2 + np.argmin(bics))
plt.axvline(2 + np.argmin(aics))

def ols(dep_vb, indep_vbs):
    model = OLS(dep_vb, indep_vbs).fit()
    prediction = model.predict()
    residuals = dep_vb - prediction
    return model.params, residuals, prediction

def ols(X, y):
    model = OLS(y, X).fit()
    prediction = model.predict()
    residuals = y - prediction
    return model.params, residuals, prediction

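# Hedged usage sketch for the ols() helper above, on synthetic data; all
# values here are assumptions for illustration.
import numpy as np
from statsmodels.regression.linear_model import OLS

rng = np.random.default_rng(0)
X_demo = np.column_stack([np.ones(100), rng.normal(size=100)])
y_demo = X_demo @ np.array([1.0, 2.0]) + rng.normal(scale=0.1, size=100)
params, residuals, prediction = ols(X_demo, y_demo)
print(params)           # should be close to [1.0, 2.0]
print(residuals.std())  # should be close to the noise scale, 0.1
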
from sklearn.metrics import r2_score
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant


def statsmodel_regression(X_train, X_test, y_train, y_test):
    """Similar to the above, but uses statsmodels."""
    regr = OLS(y_train, add_constant(X_train)).fit()
    # The test design needs the same constant column the model was fit with.
    predictions = regr.predict(add_constant(X_test))
    r2_test = round(r2_score(y_test, predictions), 2)
    return r2_test, regr

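# Hedged usage sketch: feed statsmodel_regression a standard sklearn split
# (the diabetes dataset here is an assumption for illustration).
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
r2_test, regr = statsmodel_regression(X_train, X_test, y_train, y_test)
print("test R^2:", r2_test)
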
# Exploratory regression script; df, df_full_results and df_results are
# assumed to come from the surrounding notebook session.
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Ridge
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.regression.linear_model import OLS

# we use the full df for the regression because we want to weight results by the
# existence of different ads in different neighborhoods, not just unique addresses
X = df[["black_proportion", "log_income", "asian_proportion",
        "latinx_proportion", "log_price"]]
y = df.white_proportion

df_tmp = df.copy()
df_tmp[list(range(30))] = df_tmp[list(range(30))].where(
    df_tmp[list(range(30))] > .1, 0)

# Stray patsy-style formula fragment left over from an earlier draft:
# topic_0 + topic_7 + topic_8 + topic_9 + topic_12 + topic_14 + topic_16 +
# topic_17 + topic_20 + topic_23 + topic_24 + topic_25 + topic_28
X = df[[str(x) for x in [0, 7, 8, 9, 12, 14, 16, 17, 20, 23, 24, 25, 28]] +
       ["black_proportion", "log_income", "log_price", "total_RE"]]
y = np.where(df['white_proportion'] > np.median(df['white_proportion']), 1, 0)
y = df['income']

OLR = OLS(y, X).fit()
OLR.summary()
OLR.predict(exog=X)

df_full_results.params.sort_values()
df_results.params.sort_values()
df_results.summary()

EN = ElasticNet(alpha=.02, l1_ratio=.001)
EN.fit(X, y)
EN.score(X, y)
EN.predict(X)

LinR = LinearRegression()
LinR.fit(X, y)
LinR.score(X, y)

RR = Ridge()
RR.fit(X, y).score(X, y)
pd.Series(RR.coef_)