/
linear_regression.py
143 lines (114 loc) · 4.97 KB
/
linear_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# -*- coding: utf-8 -*-
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression as linreg

try:
    # scikit-learn >= 0.18 ships cross_val_score in model_selection;
    # the cross_validation module was removed in 0.20.
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

from utils import read_data, scale_train_data, scale_test_data
def get_plotdir():
    """Return the relative directory path (with trailing slash) for plots."""
    plot_directory = 'linear_regression_plots/'
    return plot_directory
def make_plotdir():
    """Ensure the plot directory exists on disk and return its path.

    As a side effect this also switches seaborn to the "darkgrid" style,
    so every plot drawn afterwards uses it.

    Returns:
        The directory path reported by get_plotdir().
    """
    sns.set_style("darkgrid")
    plotdir = get_plotdir()
    # exist_ok=True replaces the check-then-create sequence, which could
    # fail if the directory appeared between the check and the mkdir call.
    os.makedirs(plotdir, exist_ok=True)
    return plotdir
def load_data(loansData, numeric_vars, test_frac=0.25, random_state=None):
    """Split the loan data into training and test features and targets.

    A random share of the rows is held out as the test set; all remaining
    rows form the training set. The target is the 'Interest.Rate' column.

    Args:
        loansData: DataFrame holding the target column and the feature
            columns named in numeric_vars.
        numeric_vars: column labels to use as features.
        test_frac: fraction of rows held out for testing (default 0.25,
            the value that used to be hard-coded).
        random_state: seed forwarded to DataFrame.sample; pass a fixed
            value to get a reproducible split. None keeps the split random.

    Returns:
        Tuple (train_df, train_y, test_df, test_y).
    """
    target = 'Interest.Rate'
    held_out = loansData.sample(frac=test_frac, random_state=random_state)
    # NOTE(review): drop() removes rows by index label, so this assumes the
    # index labels of loansData are unique -- confirm against read_data().
    remaining = loansData.drop(held_out.index)

    train_y = pd.Series(remaining[target])
    test_y = pd.Series(held_out[target])
    train_df = pd.DataFrame(remaining[numeric_vars])
    test_df = pd.DataFrame(held_out[numeric_vars])
    return train_df, train_y, test_df, test_y
def sort_coefs(cols, coefs, intercept):
    """Rank the fit coefficients by absolute size, print and return them.

    Args:
        cols: variable names, in the same order as coefs.
        coefs: fitted coefficient for each variable.
        intercept: fitted intercept, printed ahead of the ranking.

    Returns:
        pandas Series of coefficients indexed by variable name, ordered
        from largest to smallest absolute value.
    """
    # sorted() is stable, so coefficients of equal magnitude keep their
    # original relative order.
    ranked = sorted(zip(cols, coefs),
                    key=lambda pair: np.abs(pair[1]),
                    reverse=True)
    names = [name for name, _ in ranked]
    values = [value for _, value in ranked]
    ranking = pd.Series(data=values, index=names)

    print('Training Fit:\nIntercept %22s %.6f' % ('', intercept))
    print(ranking)
    return ranking
def get_top_vars(plist, top=5):
    """Return the labels of the first `top` entries of a ranked Series.

    Args:
        plist: pandas Series whose index holds the variable names, already
            ordered by importance (as returned by sort_coefs).
        top: number of leading entries to keep; if the Series is shorter,
            all of its labels are returned.

    Returns:
        List of index labels of the leading entries.
    """
    # Slice the index itself so the cut is always positional; slicing the
    # Series with [] can be label-based for some index types (e.g. floats).
    return list(plist.index[:top])
def cross_validate(clf, train_X, train_y, cv=5, print_out=False):
    """Cross-validate the estimator on the training data.

    Args:
        clf: estimator handed to cross_val_score.
        train_X: training features.
        train_y: training targets.
        cv: cross-validation setting forwarded to cross_val_score.
        print_out: when true, print the mean score with a +- 2 standard
            deviation band, followed by the raw per-fold scores.

    Returns:
        Tuple (mean score, array of individual scores).
    """
    fold_scores = cross_val_score(clf, train_X, train_y, cv=cv)
    mean_score = fold_scores.mean()
    if print_out:
        spread = 2.0 * fold_scores.std()
        print(" CV scores mean %.4f +- %.4f" % (mean_score, spread))
        print(" CV raw scores", fold_scores)
    return mean_score, fold_scores
def run_var_list(new_vars, loansData):
    """Fit and score a linear regression using only the given variables.

    Draws a fresh train/test split via load_data, scales the features,
    fits the model, prints the sorted coefficients and the 10-fold
    cross-validation scores, then prints R^2 on the training and test sets.

    Args:
        new_vars: column labels to use as features.
        loansData: full loan DataFrame handed to load_data.
    """
    train_df, train_y, test_df, test_y = load_data(loansData, new_vars)

    # The scaler returned for the training data is reused on the test data.
    train_X, scaler = scale_train_data(train_df)
    test_X = scale_test_data(scaler, test_df)

    model = linreg()
    model.fit(train_X, train_y)
    sort_coefs(list(train_df.columns), model.coef_, model.intercept_)
    cross_validate(model, train_X, train_y, cv=10, print_out=True)

    fit_r2 = model.score(train_X, train_y)
    print('Regression fit R^2 score %.4f' % fit_r2)
    predict_r2 = model.score(test_X, test_y)
    print('Regression predict R^2 score %.4f' % predict_r2)
def get_numeric_vars():
    """Return the column names used as features for the full fit."""
    numeric_columns = [
        'FICO.Score',
        'Log.Amount.Requested',
        'Home.Type',
        'Revolving.CREDIT.Balance',
        'Log.Monthly.Income',
        'Log.CREDIT.Lines',
        'Debt.To.Income.Ratio',
        'Loan.Length',
        'Loan.Purpose.Score',
        'Amount.Funded.By.Investors',
        'Inquiries.in.the.Last.6.Months',
    ]
    return numeric_columns
def plot_predict_scatter(plotdir, label, pred, test_y):
    """Save a scatter plot of predicted versus actual interest rates.

    The current matplotlib figure is cleared first, then the plot is
    written to "predict_scatter_<label>.png" inside plotdir.

    Args:
        plotdir: directory that receives the image; a trailing path
            separator is optional.
        label: short tag (e.g. "train" or "test") used in the plot title
            and in the output file name.
        pred: predicted values, plotted on the y axis.
        test_y: actual values, plotted on the x axis.
    """
    plt.clf()
    plt.scatter(test_y, pred, alpha=0.5, edgecolors='face')
    plt.xlabel("Actual Interest Rate %")
    plt.ylabel("Predicted Interest Rate %")
    plt.title("Linear Regression Prediction (%s data)" % label)
    # os.path.join yields the same path as before when plotdir ends with a
    # separator, and still works when the caller omits it.
    plt.savefig(os.path.join(plotdir, "predict_scatter_" + label + ".png"))
def main():
    """Run the full analysis: fit, evaluate, plot, then refit on fewer variables.

    Fits a linear regression on all numeric variables, saves train/test
    scatter plots, prints cross-validation and R^2 scores, and finally
    repeats the fit using only the top 5, 4, 3 and 2 ranked variables.
    """
    loans = read_data()
    feature_names = get_numeric_vars()
    train_df, train_y, test_df, test_y = load_data(loans, feature_names)
    print("train_df head\n", train_df[:3])
    print("train_y head\n", train_y[:3])

    plotdir = make_plotdir()

    # Scale the features; the scaler from the training data is reused on
    # the test data.
    train_X, scaler = scale_train_data(train_df)
    test_X = scale_test_data(scaler, test_df)

    model = linreg()
    model.fit(train_X, train_y)
    ranked_coefs = sort_coefs(list(train_df.columns), model.coef_,
                              model.intercept_)

    # Training-set diagnostics.
    train_pred = model.predict(train_X)
    plot_predict_scatter(plotdir, "train", train_pred, train_y)
    cross_validate(model, train_X, train_y, cv=10, print_out=True)
    fit_r2 = model.score(train_X, train_y)
    print('Regression fit R^2 score %.4f' % fit_r2)

    # Test-set diagnostics: mean absolute difference, then R^2.
    test_pred = model.predict(test_X)
    mean_abs_diff = sum(np.abs(test_y - test_pred)) / len(test_y)
    print('Regression predict diff average %.4f' % mean_abs_diff)
    predict_r2 = model.score(test_X, test_y)
    print('Regression predict R^2 score %.4f' % predict_r2)
    plot_predict_scatter(plotdir, "test", test_pred, test_y)

    # Try the fit with fewer top-ranked variables: 5, 4, 3, 2.
    for top in (5, 4, 3, 2):
        new_vars = get_top_vars(ranked_coefs, top)
        print('new_vars', new_vars)
        run_var_list(new_vars, loans)

    # Findings recorded by the original author:
    # - scores are just as good with the top 4 or 5 vars as with all
    #   numeric_vars
    # - scores are almost as good with the top 3 vars as with all
    #   numeric_vars (statistically ok)
    # - scores are not as good with the top 2 vars as with all
    #   numeric_vars (statistically not ok)
# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()