forked from rlowrance/re-avm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
chart-05.py
348 lines (305 loc) · 12.2 KB
/
chart-05.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
'''create charts showing results of valgbr.py
INVOCATION
python chart-05.py [--data] [--test]
INPUT FILES
INPUT/linval/YYYYMM.pickle
OUTPUT FILES
WORKING/chart-05/[test-]data.pickle
WORKING/chart-05/[test-]YYYY.txt
where
YYYY in {2004 | 2005 | 2006 | 2007 | 2008 | 2009}
'''
from __future__ import division
import cPickle as pickle
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pdb
from pprint import pprint
import random
import sys
from AVM import AVM
from Bunch import Bunch
from columns_contain import columns_contain
from Logger import Logger
from ParseCommandLine import ParseCommandLine
from Path import Path
from valgbr import ResultKey, ResultValue
cc = columns_contain
def usage(msg=None):
print __doc__
if msg is not None:
print msg
sys.exit(1)
def make_control(argv):
# return a Bunch
print argv
if len(argv) not in (1, 2, 3):
usage('invalid number of arguments')
pcl = ParseCommandLine(argv)
arg = Bunch(
base_name='chart-05',
data=pcl.has_arg('--data'),
test=pcl.has_arg('--test'),
)
if len(argv) == 3:
both = arg.data and arg.test
if not both:
usage('there is an extra invocation option')
random_seed = 123
random.seed(random_seed)
dir_working = Path().dir_working()
debug = False
reduced_file_name = ('test-' if arg.test else '') + 'data.pickle'
# assure output directory exists
dir_path = dir_working + arg.base_name + '/'
if not os.path.exists(dir_path):
os.makedirs(dir_path)
return Bunch(
arg=arg,
debug=debug,
path_in_ege=dir_working + 'valgbr/*.pickle',
path_reduction=dir_path + reduced_file_name,
path_chart_base=dir_path,
random_seed=random_seed,
test=arg.test,
)
def make_charts_v1(df, control, ege_control):
'write one txt file for each n_months_back'
def make_subplot(test_period, n_months_back, loss_metric):
'mutate the default axes'
alphas = sorted(set(df.alpha))
l1_ratios = sorted(set(df.l1_ratio))
n_ticks = len(alphas) * len(l1_ratios)
line_index = -1
for units_X in ('natural', 'log'):
for units_y in ('natural', 'log'):
line_index += 1
x_label = []
y = np.empty((n_ticks,), dtype=float)
tick_index = -1
for alpha in alphas:
for l1_ratio in l1_ratios:
tick_index += 1
mask = (
(df.test_period == test_period) &
(df.n_months_back == n_months_back) &
(df.units_X == units_X) &
(df.units_y == units_y) &
(df.alpha == alpha) &
(df.l1_ratio == l1_ratio)
)
subset = df.loc[mask]
assert len(subset) == 1, subset
row = dict(subset.iloc[0])
y[tick_index] = row[loss_metric]
x_label.append('%3.1f-%4.2f' % (row['alpha'], row['l1_ratio']))
plt.plot(y / 1000.0,
label='%3s-%3s' % (units_X, units_y),
linestyle=[':', '-.', '--', '-'][line_index % 4],
color='bgrcmykw'[line_index % 8],
)
plt.xticks(range(len(y)), x_label, size='xx-small', rotation='vertical')
plt.yticks(size='xx-small')
plt.title('yr-mo %s-%s bk %d' % (test_period[:4], test_period[4:], n_months_back),
loc='left',
fontdict={'fontsize': 'xx-small', 'style': 'italic'},
)
def make_figure(year, months):
print 'make_figure', year, months
test_periods_typical = [str(year * 100 + month)
for month in months
]
test_periods = ('200902',) if year == 2009 else test_periods_typical
plt.figure() # new figure
# plt.suptitle('Loss by Test Period, Tree Max Depth, N Trees') # overlays the subplots
loss_metric = 'rmse'
loss_metric = 'mae'
axes_number = 0
n_months_backs = range(1, 7, 1)
last_test_period_index = len(test_periods) - 1
last_n_months_back_index = len(n_months_backs) - 1
for test_period_index, test_period in enumerate(test_periods):
for n_months_back_index, n_months_back in enumerate(n_months_backs):
axes_number += 1 # count across rows
plt.subplot(len(test_periods), len(n_months_backs), axes_number)
make_subplot(test_period, n_months_back, loss_metric)
if test_period_index == last_test_period_index:
# annotate the bottom row only
if n_months_back_index == 0:
plt.xlabel('alpha-l1_ratio')
plt.ylabel('%s x $1000' % loss_metric)
if n_months_back_index == last_n_months_back_index:
plt.legend(loc='best', fontsize=5)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
out_suffix = '-%02d' % months if len(months) == 1 else ''
plt.savefig(control.path_chart_base + str(year) + out_suffix + '.pdf')
plt.close()
for year in (2004, 2005, 2006, 2007, 2008, 2009):
months = (2,) if year == 2009 else (2, 5, 8, 11)
for month in months:
make_figure(year, (month,))
make_figure(year, months)
if control.test:
break
def make_charts(df, control, ege_control):
'write one txt file for each n_months_back'
def make_subplot(year, month, n_months_back):
'mutate the default axes'
test_period = str(year * 100 + month) # yyyymm
mask = (
(df.test_period == test_period) &
(df.n_months_back == n_months_back)
)
subset = df.loc[mask]
y = subset.mae
y.index = subset.learning_rate.values
y = y.sort_index()
plt.plot(y / 1000.0)
x_label = ['%4.2f' % x for x in y.index]
plt.xticks(range(len(y)), x_label, size='xx-small', rotation='vertical')
plt.yticks(size='xx-small')
plt.title('%4d-%02d %1d' % (year, month, n_months_back),
loc='right',
fontdict={'fontsize': 'xx-small', 'style': 'italic'},
)
return
learning_rates = sorted(set(df.learning_rates))
n_ticks = len(learning_rates)
# OLD BELOW ME
l1_ratios = sorted(set(df.l1_ratio))
n_ticks = len(l1_ratios)
line_index = -1
for units_X in ('natural', 'log'):
for units_y in ('natural', 'log'):
line_index += 1
x_label = []
y = np.empty((n_ticks,), dtype=float)
tick_index = -1
for l1_ratio in l1_ratios:
tick_index += 1
mask = (
(df.test_period == test_period) &
(df.n_months_back == n_months_back) &
(df.units_X == units_X) &
(df.units_y == units_y) &
(df.alpha == alpha) &
(df.l1_ratio == l1_ratio)
)
if sum(mask) != 1:
# something is wrong
print 'mask', sum(mask)
print 'test_period', sum(df.test_period == test_period)
print 'n_months_back', sum(df.n_months_back == n_months_back)
print 'units_X', sum(df.units_X == units_X)
print 'units_y', sum(df.units_y == units_y)
print 'alpha', sum(df.alpha == alpha)
print 'l1_ratio', sum(df.l1_ratio == l1_ratio)
print test_period, n_months_back, units_X, units_y, alpha, l1_ratio
pdb.set_trace()
subset = df.loc[mask]
assert len(subset) == 1, subset.shape
row = dict(subset.iloc[0])
y[tick_index] = row[loss_metric]
x_label.append('%4.2f' % row['l1_ratio'])
plt.plot(y / 1000.0,
label='%3s-%3s' % (units_X, units_y),
linestyle=[':', '-.', '--', '-'][line_index % 4],
color='bgrcmykw'[line_index % 8],
)
plt.xticks(range(len(y)), x_label, size='xx-small', rotation='vertical')
plt.yticks(size='xx-small')
plt.title('%4.2f %s-%s %d' % (alpha, test_period[:4], test_period[4:], n_months_back),
loc='left',
fontdict={'fontsize': 'xx-small', 'style': 'italic'},
)
def make_figure(year):
print 'make_figure', year
plt.figure() # new figure
# plt.suptitle('Loss by Test Period, Tree Max Depth, N Trees') # overlays the subplots
axes_number = 0
n_months_backs = (1, 2, 3, 4, 5, 6)
month_seq = (2, 5, 8, 11)
for month in month_seq:
for n_months_back in n_months_backs:
axes_number += 1 # count across rows
plt.subplot(len(month_seq), len(n_months_backs), axes_number)
make_subplot(year, month, n_months_back)
# annotate the bottom row only
if month == 11:
if n_months_back == 1:
plt.xlabel('learning_rate')
plt.ylabel('mae x $1000')
if n_months_back == 6:
plt.legend(loc='best', fontsize=5)
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
plt.savefig(control.path_chart_base + str(year) + '.pdf')
plt.close()
for year in (2004, 2005, 2006, 2007, 2008, 2009):
make_figure(year)
if control.test:
break
return
def make_data(control):
'return data frame, ege_control'
def process_file(path, rows_list):
'mutate rows_list to include gscv object info at path'
print 'reducing', path
with open(path, 'rb') as f:
val_result, ege_control = pickle.load(f)
for k, v in val_result.iteritems():
actuals = v.actuals.values
predictions = v.predictions
if predictions is None:
print k
print 'predictions is missing'
pdb.set_trace()
errors = actuals - predictions
root_mean_squared_error = np.sqrt(np.sum(errors * errors) / (1.0 * len(errors)))
median_absolute_error = np.median(np.abs(errors))
row = {
'n_months_back': k.n_months_back,
'learning_rate': k.learning_rate,
'test_period': str(k.yyyymm),
'rmse': root_mean_squared_error,
'mae': median_absolute_error,
}
rows_list.append(row)
return ege_control # return last ege_control value (they should all be the same)
rows_list = []
for path in glob.glob(control.path_in_ege):
ege_control = process_file(path, rows_list)
if control.test:
break
df = pd.DataFrame(rows_list)
return df, ege_control # return last ege_control, not all
def main(argv):
control = make_control(argv)
sys.stdout = Logger(base_name=control.arg.base_name)
print control
if control.arg.data:
df, ege_control = make_data(control)
with open(control.path_reduction, 'wb') as f:
pickle.dump((df, ege_control, control), f)
else:
with open(control.path_reduction, 'rb') as f:
df, ege_control, data_control = pickle.load(f)
make_charts(df, control, ege_control)
print control
if control.test:
print 'DISCARD OUTPUT: test'
print 'done'
return
if __name__ == '__main__':
if False:
# avoid pyflakes warnings
pdb.set_trace()
pprint()
pd.DataFrame()
np.array()
AVM()
ResultKey
ResultValue
main(sys.argv)