/
multipleRegressions.py
62 lines (54 loc) · 2.64 KB
/
multipleRegressions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
def runRegression(y,x,data, cov_type='HC0'):
print("Covariance type: %s" %cov_type )
form = '{0} ~ {1}'.format(y,x)
mod = smf.ols(formula=form, data=data)
res = mod.fit(cov_type=cov_type)
return(res)
def runQuantRegression(y,x,data,quant=.5):
form = '{0} ~ {1}'.format(y,x)
mod = smf.quantreg(formula=form, data=data)
res = mod.fit(quant=.5)
return(res)
def runRegressions(y, xList, key_vars, data, specification_names=None, cov_type='HC0', pvalues=True):
"""
y - dependent variable (string)
xList - list of explanatory variable strings (list of Patsy compatible formulas)
key_vars - list of independent variables you wish to see results for
df - a Pandas DataFrame that contains the data (list)
specification_names - list of regression names (list of strings)
cov_type - type of covariance you wish to estimate the regression with (default is robust)
pvalues - whether you want to report parameter estimates with p-values or star notation (default p-values)
Description: runRegressions estimates multiple regression specifications
and returns coefficent and p-values estimates for key_vars and regressions
stats for each specification in a single table.
"""
# if no custom names are given, use numbers
if specification_names is None:
specification_names = [str(i) for i in range(len(xList))]
if len(specification_names) != len(xList):
print('specification names not the same length as independent variables list')
return(None)
# run regressions
regDict = {}
for x, name in zip(xList, specification_names):
print("Estimating specification: %s" %name)
regDict[name] = runRegression(y,x,data, cov_type='HC0')
print("Estimation done")
# extract estimates:
print("Extracting results")
outDict = {}
for key in regDict.keys():
# get regression statistics
outDict[key] = {'nObs':regDict[key].nobs,'R^2 adj':regDict[key].rsquared_adj.round(3), 'cond. num':regDict[key].condition_number}
# loop over variables of interest
if pvalues:
for var in key_vars:
outDict[key][var] = "%.04f (%.03f)" %(regDict[key].params[var], regDict[key].pvalues[var])
if pvalues == False:
outDict[key] = regDict[key].params.round(3).astype(str) + pd.cut(regDict[key].pvalues,bins=[0,.01,.05,.1,1], labels=['***','**','*','']).astype(str)
outdf =pd.DataFrame(outDict).transpose()
outdf.index.name='specification'
return(outdf)