/
pricer.py
221 lines (177 loc) · 7.15 KB
/
pricer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# NOTE: this module is Python 2 code (print statements, reload(), unicode()).
import MySQLdb
import MySQLdb.cursors
from pandas import DataFrame, Series
import pandas as pd
import pandas.io.sql as sql
import numpy as np
from os import sys  # NOTE(review): os.sys is the real sys module, but plain `import sys` is the conventional spelling
import json
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import utilities
reload(utilities)  # Python 2 builtin: force a fresh re-import of the local helper module during development
import utilities as ut
from scipy.optimize import curve_fit
from sqlalchemy import create_engine
# SECURITY NOTE(review): database credentials are hard-coded in source; move them to
# environment variables or a config file and rotate the exposed password.
# Also, the 'postgres://' URL scheme is rejected by SQLAlchemy >= 1.4 ('postgresql://').
engine = create_engine('postgres://smqdidhwgocwmg:qRi2N64egyMRyHAN9tiQ42Bd0y@ec2-54-225-195-249.compute-1.amazonaws.com:5432/dbjjk6gfc81mbh')
def get_mae(prediction, target):
    '''Return the Mean Absolute Error of prediction relative to target.'''
    errors = np.abs(prediction - target)
    return errors.mean()
def exp_func(x, a, b, c, d):
    '''Sum of two decaying exponentials over the two rows of x.

    x[0] and x[1] are the two feature rows; a/c scale the terms and
    b/d are the corresponding decay rates.
    '''
    first_term = a * np.exp(-b * x[0])
    second_term = c * np.exp(-d * x[1])
    return first_term + second_term
def exp_func_score(features, target, train_idcs, test_idcs,
                   current_year=2014, mile_scale=2000.0):
    '''Get Mean Absolute Error of the sum-of-two-exponentials model.

    Fits exp_func on the training rows and scores it on the test rows.

    Args:
        features - sequence of [year, miles] rows
        target - sequence of prices, same length as features
        train_idcs - indices (or boolean mask) selecting training rows
        test_idcs - indices (or boolean mask) selecting test rows
        current_year - reference year used to turn model year into age
            (default 2014, the previously hard-coded value)
        mile_scale - divisor applied to mileage so the optimizer search is
            well conditioned (default 2000, the previously hard-coded value)

    Returns:
        float Mean Absolute Error on the test rows.
    '''
    # dtype=float guards against integer inputs: under Python 2 an int array
    # would floor-divide on the mileage rescale below. np.array (not asarray)
    # copies, so the caller's data is never mutated in place.
    X = np.array(features, dtype=float).T
    y = np.array(target, dtype=float)
    X[0, :] = current_year - X[0, :]  # convert model year to vehicle age
    X[1, :] = X[1, :] / mile_scale    # rescale mileage for fitting stability
    # Large maxfev because the two-exponential surface can need many steps.
    popt, pcov = curve_fit(
        exp_func, X[:, train_idcs], y[train_idcs], maxfev=50000)
    pred = exp_func(X[:, test_idcs], *popt)
    return get_mae(pred, y[test_idcs])
def color_plot(predictor):
    '''Plots predicted price as function of year and mileage'''
    miles = np.linspace(10000, 200000, 100)
    years = np.arange(1995, 2014)
    # Build the (mileage-major) grid of [year, mile] feature rows.
    grid = [[year, mile] for mile in miles for year in years]
    prediction = np.array(predictor.predict(np.array(grid)))
    # Reshape to an image with mileage on the vertical axis, flipped so the
    # lowest mileage row appears at the bottom.
    image = np.flipud(prediction.reshape((len(miles), len(years))))
    fig = plt.figure()
    plt.imshow(image, interpolation="nearest", aspect='auto')
    plt.show()
    fig.savefig('color.eps')
def line_plot(predictor):
    '''Plots predicted price as function of year'''
    years = np.arange(1995, 2014)
    # Hold mileage fixed at 30,000 and vary only the model year.
    features = np.array([[yr, 30000] for yr in years])
    predicted = np.array(predictor.predict(features))
    fig = plt.figure()
    plt.plot(years, predicted)
    plt.show()
    fig.savefig('line.eps')
def exclude_uni_outliers(df, name, mn, mx):
    '''
    Excludes univariate outliers, keeping rows with mn <= df[name] <= mx.

    Args:
        df - DataFrame
        name - column name (string)
        mn - minimum value to pass (inclusive)
        mx - maximum value to pass (inclusive)

    Returns:
        Filtered copy of df with its index reset to 0..len-1.
    '''
    # .loc with a boolean mask replaces the long-deprecated DataFrame.ix
    # accessor, which was removed in pandas >= 1.0.
    mask = (df[name] >= mn) & (df[name] <= mx)
    df = df.loc[mask, :]
    # reset_index(drop=True) is the idiomatic form of `df.index = range(len(df))`.
    return df.reset_index(drop=True)
def exclude_biv_outliers(df, x, y):
    '''Excludes bivariate outliers in the x/y relationship.

    A row is kept when its y value lies strictly within 1.6 standard
    deviations of the mean y among rows sharing the same x value.

    Args:
        df - DataFrame
        x - grouping column name (e.g. 'year')
        y - value column name (e.g. 'miles')

    Returns:
        Filtered df (original index preserved, as before).
    '''
    # Compute per-group stats once via transform. The original recomputed
    # groupby().mean()/std() for every row (accidental O(n^2)) and grouped by
    # the hard-coded 'year' column instead of the x argument.
    grouped = df.groupby(x)[y]
    center = grouped.transform('mean')
    spread = grouped.transform('std')
    lower = center - 1.6 * spread
    upper = center + 1.6 * spread
    # Strict inequalities match the original. Groups of size 1 have NaN std,
    # so their rows fail both comparisons and are dropped, exactly as before.
    return df[(df[y] > lower) & (df[y] < upper)]
def make_unicode(s):
    '''Decode a UTF-8 byte string to unicode, dropping undecodable bytes.

    Guard: in Python 2, unicode(u, 'utf-8') raises TypeError when *u* is
    already a unicode object ("decoding Unicode is not supported"), so
    already-decoded values are passed through unchanged.
    '''
    if isinstance(s, unicode):
        return s
    return unicode(s, 'utf-8', errors='ignore')
def main():
    # Entry point. Reads scraped car listings, trains one RandomForest price
    # model per car model, and either cross-validates ('xval') or writes
    # predicted savings back to the database ('real').
    # Usage: pricer.py {xval|real}
    # Select all cars of desired models from database
    models = {'accord', 'camry', 'civic','silverado'}
    # NOTE(review): this MySQL connection is opened but never used below --
    # the query actually runs against the module-level Postgres `engine`.
    # Confirm which database is intended; the idle connection also leaks.
    conn = MySQLdb.connect(
        user="root",
        passwd="",
        db="carsdb",
        cursorclass=MySQLdb.cursors.DictCursor)
    read_table_name = "scraped"
    # NOTE(review): SQL built by string concatenation; the inputs are literal
    # constants here, but a parameterized query would be safer.
    cmd = "SELECT model, year, miles, price, url, body, title, date FROM " + \
        read_table_name + \
        " WHERE area='sfbay' AND model in " + str(tuple(models))
    full = pd.io.sql.read_sql(cmd, con=engine)
    print full
    # UTF-8 encoding
    #full['body'] = full['body'].apply(make_unicode)
    #full['title'] = full['title'].apply(make_unicode)
    # Exclude outliers
    #full = exclude_uni_outliers(full, 'year', 1996, 2013)
    #full = exclude_uni_outliers(full, 'miles', 1000, 210000)
    #full = exclude_uni_outliers(full, 'price', 1000, 50000)
    #full = exclude_biv_outliers(full, 'year', 'miles')
    print "we are fine till removing outliers"
    # Only show most recent posts on DealSpotter
    # NOTE(review): DataFrame.sort was removed in pandas >= 0.20; modern
    # pandas requires full.sort_values('date', ascending=False).
    full = full.sort('date', ascending=False)
    num_on_web = 150
    # Mark the num_on_web most recent rows as the ones shown on the site.
    full['on_web'] = [True if i <
                      num_on_web else False for i in range(0, len(full))]
    print full
    # Initialize DataFrame to keep track of savings
    delta_frame = DataFrame(columns=['url', 'delta'])
    # Loop through models (accord, camry, etc) and grow delta_frame
    for i, model in enumerate(models):
        print(model)
        # This model's subset of full dataframe
        df = full[full['model'] == model]
        df = df[['price', 'year', 'miles', 'url', 'date', 'on_web']]
        on_web = df['on_web'] # keep track of indices
        not_on_web = df['on_web'] == False # keep track of indices
        # All training should be on cars not shown on DealSpotter
        #feature_names = ['year', 'miles']
        #features = df.ix[not_on_web, feature_names].values
        #target = df.ix[not_on_web, 'price'].values
        # NOTE(review): contrary to the comment above, training data is taken
        # from the on_web rows -- the same rows predicted on in 'real' mode
        # below, so the model is scored on its own training set. Confirm
        # whether the commented-out not_on_web selection was the intent.
        feature_names = ['year', 'miles']
        features = df.ix[on_web, feature_names].values
        target = df.ix[on_web, 'price'].values
        print "These are the features::::::::::::::::::::"
        print features
        predictor = RandomForestRegressor(
            n_estimators=100,
            min_samples_split=20)
        # If user just wants to do cross-validation on training data
        if sys.argv[1] == 'xval':
            # Exclude true test cases from cross-validation
            (train_idcs, test_idcs) = ut.get_xval_indcs(len(features), .8)
            predictor.fit(features[train_idcs,:], target[train_idcs])
            predictions = np.array(predictor.predict(features[test_idcs,:]))
            print('MAE = ' + str(get_mae(predictions, target[test_idcs])))
        # If user wants to make predictions for real test cars, shown on DealSpotter
        elif sys.argv[1] == 'real':
            # Extract true test data for DealSpotter
            web_features = df.ix[on_web, feature_names].values
            web_target = df.ix[on_web, 'price'].values
            # Fit model, make predictions
            predictor.fit(features, target)
            predictions = np.array(predictor.predict(web_features))
            # delta > 0 means the asking price is below the predicted price
            # (a potential deal).
            delta = predictions - web_target
            model_delta_frame = DataFrame(
                {'url': df.ix[on_web, 'url'].values, 'delta': delta})
            delta_frame = delta_frame.append(model_delta_frame)
        else:
            raise ValueError("Please provide 'xval' or 'real'")
    # If user wants to make predictions for real test cars, shown on DealSpotter
    if sys.argv[1] == 'real':
        print(delta_frame)
        print "inside the secoind real arguemtn"
        # Merge savings information with original data frame
        full = full.merge(delta_frame, on='url', how='inner')
        # Write to database
        write_table_name = 'priced'
        # Recreate the output table from scratch via the utilities helpers,
        # then append the freshly priced rows.
        ut.drop_if_exists(engine, write_table_name)
        ut.prepare_table_w_textcols(
            full, write_table_name, engine, ['body', 'title'])
        pd.io.sql.to_sql(
            full,
            write_table_name,
            engine,
            index=False,
            if_exists="append")
if __name__ == "__main__":
    main()