-
Notifications
You must be signed in to change notification settings - Fork 0
/
fit-predict-persist-house.py
101 lines (81 loc) · 3.24 KB
/
fit-predict-persist-house.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import requests
import json
import csv
import os
import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from mlhelper import readCsvIntoPandasDataframe, preprocessData, preprocessDataWithoutScale, plot_learning_curve
from sklearn import neighbors, cross_validation, metrics, ensemble, svm
from sklearn.cross_validation import KFold, StratifiedKFold
def plotImportances(clf, header, title):
    """Plot the relative feature importances of a fitted estimator.

    The importances are rescaled so that the largest one equals 100 and
    are drawn as a horizontal bar chart, sorted in ascending order.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing a ``feature_importances_`` attribute.
    header : sequence
        Feature names, one per entry of ``clf.feature_importances_``.
        May be a list or a NumPy array.
    title : str
        Figure-level title for the plot.

    Returns
    -------
    tuple
        ``(names, importances)``: the feature names and their relative
        importances (0-100), both sorted by ascending importance.
    """
    importances = np.asarray(clf.feature_importances_, dtype=float)
    # Fancy indexing below needs an array; a plain list would raise.
    names = np.asarray(header)

    # Make importances relative to the max importance. If every importance
    # is zero there is nothing to scale, so avoid a 0/0 that would give NaNs.
    peak = importances.max()
    if peak == 0:
        relative = np.zeros_like(importances)
    else:
        relative = 100.0 * (importances / peak)

    order = np.argsort(relative)
    sorted_names = names[order]
    sorted_importances = relative[order]
    # Centre each bar on a half-integer position.
    pos = np.arange(order.shape[0]) + .5

    fig = plt.figure()
    # Figure-level title: the axes title below is reserved for the fixed
    # 'Variable Importance' caption, so the caller's title goes on the figure.
    fig.suptitle(title)
    plt.subplot(1, 2, 2)
    plt.barh(pos, sorted_importances, align='center')
    plt.yticks(pos, sorted_names)
    plt.xlabel('Relative Importance')
    plt.title('Variable Importance')
    # Non-blocking so that the calling script keeps running.
    plt.show(block=False)
    return sorted_names, sorted_importances
# Input CSV that the models are fitted on.
csvfile = 'HousePriceList_All_Train.csv'
# adding attributes did not make a difference
#csvfile = 'rentList_All_withAllAttr_trimmed.csv'
#csvfile = 'rentList_E_EC_N_final.csv'
data = readCsvIntoPandasDataframe(csvfile)

# preprocessData comes from mlhelper; judging by the names and the files
# written below it returns the feature matrix, the target, the two scalers,
# the column header, an imputer and a vectorizer -- TODO confirm in mlhelper.
X, y, X_scaler, y_scaler, header, imp, vec = preprocessData(data)
# Unscaled alternative kept for reference:
#X, y = preprocessDataWithoutScale(data)

# Make sure the output directory exists before writing into it; otherwise
# the first dump fails after all the preprocessing work has been done.
if not os.path.isdir('pickle-house'):
    os.makedirs('pickle-house')

# Persist the preprocessing objects next to the models so the same
# transformations can be reloaded later.
joblib.dump(X_scaler, 'pickle-house/X_scaler.pkl')
joblib.dump(y_scaler, 'pickle-house/y_scaler.pkl')
joblib.dump(imp, 'pickle-house/Imputer.pkl')
joblib.dump(vec, 'pickle-house/Vector.pkl')
# Regressors to fit. Each entry pairs a short name with an unfitted model.
estimators = [
    # K-Nearest Neighbors
    {
        "name": "KNN",
        "model": neighbors.KNeighborsRegressor(
            n_neighbors=5,
            weights="uniform",
        ),
    },
    # Gradient Boosting Regressor
    {
        "name": "GBR",
        "model": ensemble.GradientBoostingRegressor(
            n_estimators=2100,
            learning_rate=0.02,
            max_depth=6,
            max_features=0.1,
            min_samples_leaf=1,
        ),
    },
    # Random Forest
    {
        "name": "RF",
        "model": ensemble.RandomForestRegressor(
            n_estimators=512,
            max_features=0.1,
        ),
    },
    # Support Vector Machine
    {
        "name": "SVR",
        "model": svm.SVR(
            kernel='poly',
            degree=3,
            C=1,
            gamma=0.1,
            epsilon=0.1,
            cache_size=1000,
        ),
    },
]
#kf = KFold(len(X), n_folds=10, shuffle=True, random_state=17) #3) #70) #48)
#kf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=3) #70) #48)
# Fit every estimator on the full data set and persist the fitted model.
# For GBR and RF, also plot the feature importances and export them to CSV.
for estimator in estimators:
    name = estimator['name']
    model = estimator['model']
    # Single-argument print call: same output on Python 2 and Python 3.
    print("== " + name + " ==")

    start = time.time()
    model.fit(X, y)
    print("Fitting took %.2f seconds." % (time.time() - start))

    joblib.dump(model, 'pickle-house/' + name + '.pkl')

    # Feature importances are only plotted and exported for GBR and RF.
    if name in ('GBR', 'RF'):
        # Use distinct names for the results so the module-level `imp`
        # returned by preprocessData is not overwritten.
        feat_names, feat_scores = plotImportances(model, header, name)
        # Two columns: feature name, relative importance.
        table = np.column_stack((feat_names, feat_scores))
        np.savetxt("pickle-house/FeatImp-" + name + ".csv",
                   table, delimiter=",", fmt="%s")