-
Notifications
You must be signed in to change notification settings - Fork 0
/
lin_reg_L2.py
69 lines (50 loc) · 2.2 KB
/
lin_reg_L2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import csv
import numpy as np
import pandas as pd
import matplotlib
import DataPrep
import matplotlib.pyplot as plt
from scipy.stats import skew
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, median_absolute_error, explained_variance_score, r2_score
# Somewhat more advanced regression, performs L1 regularization to eliminate some variables that are not necessary
headless_run = True
def run():
# Data preprocessing
train = DataPrep.prep_data(headless_run)
# Scale data: https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use
target = train.SalePrice
train = train.drop(columns='SalePrice')
X_train, X_test, y_train, y_test = train_test_split(
train, target, test_size=0.25, random_state=0)
# Trying L2 regularization
clf = RidgeCV(cv=5).fit(X_train, y_train)
# print(rmse_cv(clf).mean())
# Lasso gives us an alpha of 0.1231, picks some coefficients and gives the rest a 0 value
coef = pd.Series(clf.coef_, index=X_train.columns)
# Metrics
variance_score = clf.score(X_test, y_test)
MSEscore = mean_squared_error(clf.predict(X_test), y_test)
MAEscore = median_absolute_error(clf.predict(X_test), y_test)
R2score = r2_score(clf.predict(X_test), y_test)
if not headless_run:
print('Variance score: {}'.format(variance_score))
# print("CLF best: {}".format(clf.best_score_)) grid search only
print('MSE score: {}'.format(MSEscore))
print('MAE score: {}'.format(MAEscore))
print('R2 score: {}'.format(R2score))
# Plotting Residuals
plt.scatter(clf.predict(X_train), clf.predict(X_train) - y_train,
color="green", s=10, label='Train data')
plt.scatter(clf.predict(X_test), clf.predict(X_test) - y_test,
color="blue", s=10, label='Test data')
plt.hlines(y=0, xmin=10, xmax=14, linewidth=2)
plt.legend(loc='upper right')
plt.title("Residual errors")
plt.show()
else:
return [variance_score,MSEscore,MAEscore,R2score]
if __name__ == "__main__":
headless_run = False
run()