# chart06.py (forked from rlowrance/re-avm)
'''create charts showing results of valavm.py
INVOCATION
python chart06.py FEATURESGROUP-HPS-LOCALITY --data
python chart06.py FEATURESGROUP-HPS-global [--test] [--subset] [--norwalk] [--all]
python chart06.py FEATURESGROUP-HPS-city [--test] [--subset] [--norwalk] [--all] [--trace]
where
FEATURESGROUP is one of {s, sw, swp, swpn}
HPS is one of {all, best1}
LOCALITY is one of {city, global}
FHL is FEATURESGROUP-HPS-LOCALITY
--test means to set control.arg.test to True
--subset means to process 0data-subset, not 0data, the full reduction
--norwalk means to process 0data-norwalk, not 0data, the full reduction
--all means to process all the cities, not just selected cities
--trace start with pdb.set_trace() call, so that we run under the debugger
INPUT FILES
WORKING/chart01/data.pickle
WORKING/valavm/FHL/YYYYMM.pickle
WORKING/samples-train-analysis/transactions.csv has transaction IDs
INPUT AND OUTPUT FILES (build with --data)
WORKING/chart06/FHL/0data.pickle | reduction for everything
WORKING/chart06/FHL/0data-norwalk.pickle | reduction for just Norwalk (for testing); only if locality == city
WORKING/chart06/FHL/0data-subset.pickle | random subset of everything (for testing)
WORKING/chart06/FHL/0all_price_histories.pickle |
OUTPUT FILES
WORKING/chart06/FHL/0data-report.txt | records retained TODO: Decide whether to keep
WORKING/chart06/FHL/a.pdf | range of losses by model (graph)
WORKING/chart06/FHL/b-YYYYMM.pdf | HPs with lowest losses
WORKING/chart06/FHL/b-YYYYMM.txt | HPs with lowest losses
WORKING/chart06/FHL/c.pdf | best model each month
WORKING/chart06/FHL/d.pdf | best & 50th best each month
WORKING/chart06/FHL/e.pdf | best 50 models each month (was chart07)
WORKING/chart06/FHL/best.pickle | dataframe with best choices each month # CHECK
WORKING/chart06/FHL/h.txt
WORKING/chart06/FHL/i.txt
WORKING/chart06/FHL/i.pdf | best value for k when LOCALITY is global
The reduction is a dictionary.
- if LOCALITY is 'global', the type of the reduction is
dict[validation_month] sd
where sd is a sorted dictionary with type
dict[ModelDescription] ModelResults, sorted by increasing ModelResults.mae
- if LOCALITY is 'city', the type of the reduction is
dict[city_name] dict[validation_month] sd
'''
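# Hedged sketch of the reduction's shape (field values here are invented;
# ModelDescription and ModelResults are the types imported from chart06_types):
#   LOCALITY == 'global':
#     {'200612': OrderedDict([(ModelDescription(model='gb', ...),
#                              ModelResults(mae=23456.0, ...)),
#                             ...]),   # entries sorted by increasing mae
#      '200701': OrderedDict([...]),
#      ...}
#   LOCALITY == 'city':
#     {'NORWALK': {'200612': OrderedDict([...]), ...}, ...}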
from __future__ import division
import argparse
import collections
import cPickle as pickle
import glob
import numpy as np
import pandas as pd
import pdb
from pprint import pprint
import random
import sys
import arg_type
from AVM import AVM
from Bunch import Bunch
from chart06_make_chart_a import make_chart_a
from chart06_make_chart_b import make_chart_b
from chart06_make_chart_cd import make_chart_cd
from chart06_make_chart_efgh import make_chart_efgh
from chart06_make_chart_hi import make_chart_hi
from chart06_types import ModelDescription, ModelResults, ColumnDefinitions
from columns_contain import columns_contain
import dirutility
import errors
from Logger import Logger
from Path import Path
from Report import Report
from Timer import Timer
from trace_unless import trace_unless
from valavmtypes import ResultKeyEn, ResultKeyGbr, ResultKeyRfr, ResultValue
cc = columns_contain
def make_control(argv):
'return a Bunch'
print argv
parser = argparse.ArgumentParser()
parser.add_argument('invocation')
parser.add_argument('fhl', type=arg_type.features_hps_locality)
parser.add_argument('--data', action='store_true')
parser.add_argument('--test', action='store_true')
parser.add_argument('--debug', action='store_true')
parser.add_argument('--subset', action='store_true')
parser.add_argument('--norwalk', action='store_true')
parser.add_argument('--all', action='store_true')
parser.add_argument('--trace', action='store_true')
parser.add_argument('--use-samples-train-analysis-test', action='store_true')
arg = parser.parse_args(argv) # arg.__dict__ contains the bindings
arg.base_name = arg.invocation.split('.')[0]
# for now, we only know how to process global files
# local files will probably have a different path in WORKING/valavm/
# details to be determined
arg.features, arg.hsheps, arg.locality = arg.fhl.split('-')
assert arg.locality == 'global' or arg.locality == 'city', arg.fhl
if arg.norwalk:
assert arg.locality == 'city', argv
if arg.trace:
pdb.set_trace()
random_seed = 123
random.seed(random_seed)
# assure output directory exists
dir_working = Path().dir_working()
dir_out_reduction = dirutility.assure_exists(dir_working + arg.base_name) + '/'
dir_out = dirutility.assure_exists(dir_out_reduction + arg.fhl) + '/'
validation_months = (
'200612',
'200701', '200702', '200703', '200704', '200705', '200706',
'200707', '200708', '200709', '200710', '200711',
)
validation_months_long = (
'200512',
'200601', '200602', '200603', '200604', '200605', '200606',
'200607', '200608', '200609', '200610', '200611', '200612',
'200701', '200702', '200703', '200704', '200705', '200706',
'200707', '200708', '200709', '200710', '200711', '200712',
'200801', '200802', '200803', '200804', '200805', '200806',
'200807', '200808', '200809', '200810', '200811', '200812',
'200901', '200902',
)
def all_k_values():
ks = range(1, 31, 1)
ks.extend([40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200])
return ks
return Bunch(
all_k_values=all_k_values(),
arg=arg,
column_definitions=ColumnDefinitions(),
debug=arg.debug,
errors=[],
exceptions=[],
path_in_valavm='%svalavm/%s/*.pickle' % (
dir_working,
arg.fhl,
),
path_in_chart_01_reduction=dir_working + 'chart01/0data.pickle',
path_in_data=dir_out + (
'0data-subset.pickle' if arg.subset else
'0data-norwalk.pickle' if arg.norwalk else
'0data.pickle'
),
path_in_interesting_cities=dir_working + 'interesting_cities.txt',
path_in_transactions=(
dir_working +
'samples-train-analysis%s/transactions.csv' % ('-test' if arg.use_samples_train_analysis_test else '')
),
path_all_price_histories=dir_out + '0all_price_histories.pickle',
path_out_a=dir_out + 'a.pdf' if arg.locality == 'global' else dir_out + 'a-%s.pdf',
path_out_b=dir_out + 'b-%d.txt',
path_out_cd=dir_out + '%s.txt',
path_out_c_pdf=dir_out+'c.pdf',
path_out_b_pdf_subplots=dir_out + 'b.pdf',
path_out_b_pdf=dir_out + 'b-%d.pdf',
path_out_d=dir_out + 'd.txt',
path_out_e_txt=dir_out + 'e-%04d-%6s.txt',
path_out_e_pdf=dir_out + 'e-%04d.pdf',
path_out_f=dir_out + 'f-%04d.txt',
path_out_g=dir_out + 'g.txt',
path_out_h_template=dir_out + ('h-%03d-%6s' if arg.locality == 'global' else 'h-%s-%03d-%6s') + '.txt',
path_out_i_template=dir_out + ('i' if arg.locality == 'global' else 'i-%s') + '.txt',
path_out_i_all_1_only_pdf=dir_out + 'i1-only.pdf',
path_out_i_all_1_skip_pdf=dir_out + 'i1-skip.pdf',
path_out_i_all_12_pdf=dir_out + 'i12-all.pdf',
path_out_i_le_50_12_pdf=dir_out + 'i12-le50.pdf',
path_out_data=dir_out + '0data.pickle',
path_out_data_report=dir_out + '0data-report.txt',
path_out_data_subset=dir_out + '0data-subset.pickle',
path_out_data_norwalk=dir_out + '0data-norwalk.pickle',
path_out_log=dir_out + '0log' + ('-data' if arg.data else '') + '.txt',
random_seed=random_seed,
sampling_rate=0.02,
selected_cities=(
'BEVERLY HILLS', 'CANYON COUNTRY', # low number of transactions; high/low price
'SHERMAN OAKS', 'POMONA', # high number of transactions; high/low price
'LOS ANGELES',
),
test=arg.test,
timer=Timer(),
validation_months=validation_months,
validation_months_long=validation_months_long,
)
def select_and_sort(df, year, month, model):
'return new df containing sorted observations for the specified year, month, model'
yyyymm = str(year * 100 + month)
mask = (
(df.model == model) &
(df.validation_month == yyyymm)
)
subset = df.loc[mask]
if len(subset) == 0:
print 'empty subset'
print year, month, model, sum(df.model == model), sum(df.validation_month == yyyymm)
pdb.set_trace()
return subset.sort_values('mae')
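# Hedged usage sketch: rows for gradient boosting in January 2007, best first
# (column names as used in the mask above):
#   best_gb_200701 = select_and_sort(df, 2007, 1, 'gb')
#   best_gb_200701.iloc[0]  # the row with the lowest mae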
def check_actuals(actuals):
'each should be the same'
k = len(actuals)
assert k > 0, k
first = actuals[0]
for other in actuals:
if collections.Counter(first) != collections.Counter(other):
print collections.Counter(first), collections.Counter(other)
pdb.set_trace()
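# Why Counter: comparing collections.Counter objects tests that both actuals
# vectors hold the same values with the same multiplicities, ignoring order:
#   collections.Counter([1.0, 2.0, 2.0]) == collections.Counter([2.0, 1.0, 2.0])  # True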
def make_ensemble_predictions(predictions, weights):
'return vector of predictions: sum w_i pred_i / sum w_i'
sum_weighted_predictions = np.array(predictions[0])
sum_weighted_predictions.fill(0.0)
for index in xrange(len(weights)):
sum_weighted_predictions = np.add(
sum_weighted_predictions,
np.dot(predictions[index], weights[index]))
sum_weights = np.sum(np.array(weights))
result = sum_weighted_predictions / sum_weights
return result
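# Worked example: with two one-element prediction vectors and weights 1 and 3,
# each ensemble entry is the weighted average (1 * p0 + 3 * p1) / (1 + 3):
#   make_ensemble_predictions([np.array([100.0]), np.array([200.0])], [1.0, 3.0])
#   # -> array([ 175.])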
def check_key_order(d):
keys = d.keys()
for index, key1_key2 in enumerate(zip(keys, keys[1:])):
key1, key2 = key1_key2
# print index, key1, key2
mae1 = d[key1].mae
mae2 = d[key2].mae
trace_unless(mae1 <= mae2, 'should be non-decreasing',
index=index, mae1=mae1, mae2=mae2,
)
# return string describing key features of the model
def short_model_description(model_description):
# build model description
model = model_description.model
if model == 'gb':
description = '%s(%d, %d, %s, %d, %3.2f)' % (
model,
model_description.n_months_back,
model_description.n_estimators,
model_description.max_features,
model_description.max_depth,
model_description.learning_rate,
)
elif model == 'rf':
description = '%s(%d, %d, %s, %d)' % (
model,
model_description.n_months_back,
model_description.n_estimators,
model_description.max_features,
model_description.max_depth,
)
else:
assert model == 'en', model_description
description = '%s(%f, %f)' % (
model,
model_description.alpha,
model_description.l1_ratio,
)
return description
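# Hedged examples of the rendered form (parameter values invented):
#   gb(1, 100, sqrt, 3, 0.10)  # model(n_months_back, n_estimators, max_features, max_depth, learning_rate)
#   en(0.010000, 0.500000)     # model(alpha, l1_ratio)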
def make_charts(reduction, actuals, median_prices, control):
print 'making charts'
make_chart_a(reduction, median_prices, control)
make_chart_hi(reduction, actuals, median_prices, control)
return # charts b - g are obsolete
if control.arg.locality == 'city':
print 'stopping charts after chart a and h, since locality is', control.arg.locality
return
make_chart_b(reduction, control, median_prices)
make_chart_cd(reduction, median_prices, control, (0,), 'c')
for n_best in (5, 100):
report_id = 'd-%0d' % n_best
for validation_month, month_reduction in reduction.iteritems():
n_reductions_per_month = len(month_reduction)
break
detail_lines_d = range(n_best)[:n_reductions_per_month]
make_chart_cd(reduction, median_prices, control, detail_lines_d, report_id)
make_chart_efgh(reduction, actuals, median_prices, control)
def extract_yyyymm(path):
file_name = path.split('/')[-1]
base, suffix = file_name.split('.')
yyyymm = base.split('-')[-1]
return yyyymm
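# Example: extract_yyyymm('WORKING/valavm/swpn-all-global/200612.pickle') -> '200612'
# Note: for a city-style name such as 200612-NORWALK.pickle the final '-' split
# returns the city, not the yyyymm.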
class ReductionIndex(object):
'reduction DataFrame multiindex object'
def __init__(self, city, validation_month, model_description):
self.city = city
self.validation_month = validation_month
self.model_description = model_description
def __hash__(self):
return hash((self.city, self.validation_month, self.model_description))
def __repr__(self):
pattern = 'ReductionIndex(city=%s, validation_month=%s, model_description=%s)'
return pattern % (self.city, self.validation_month, self.model_description)
class ReductionValue(object):
'reduction DataFrame value object'
def __init__(self, mae, model_results, feature_group):
self.mae = mae
self.model_results = model_results
self.feature_group = feature_group
def __hash__(self):
return hash((self.mae, self.model_results, self.feature_group))
def __repr__(self):
pattern = 'ReductionValue(mae = %f, model_results=%s, feature_group=%s)'
return pattern % (self.mae, self.model_results, self.feature_group)
def make_reduction(control):
'''return the reduction dict'''
def path_city(path):
'return city in path to file'
last = path.split('/')[-1]
date, city = last.split('.')[0].split('-')
return city
def process_path(path):
''' return (dict, actuals, counter) for the path where
dict has type dict[ModelDescription] ModelResult
'''
def make_model_description(key):
is_en = isinstance(key, ResultKeyEn)
is_gbr = isinstance(key, ResultKeyGbr)
is_rfr = isinstance(key, ResultKeyRfr)
is_tree = is_gbr or is_rfr
result = ModelDescription(
model='en' if is_en else ('gb' if is_gbr else 'rf'),
n_months_back=key.n_months_back,
units_X=key.units_X if is_en else 'natural',
units_y=key.units_y if is_en else 'natural',
alpha=key.alpha if is_en else None,
l1_ratio=key.l1_ratio if is_en else None,
n_estimators=key.n_estimators if is_tree else None,
max_features=key.max_features if is_tree else None,
max_depth=key.max_depth if is_tree else None,
loss=key.loss if is_gbr else None,
learning_rate=key.learning_rate if is_gbr else None,
)
return result
def make_model_result(value):
rmse, mae, low, high = errors.errors(value.actuals, value.predictions)
result = ModelResults(
rmse=rmse,
mae=mae,
ci95_low=low,
ci95_high=high,
predictions=value.predictions,
)
return result
def update_price_history(
price_history=None,
ids=None,
validation_year=None,
validation_month=None,
valavm_result_value=None,
valavm_key=None,
):
'return (augmented price history, counter)'
def same_apn(df):
return (df.apn == df.iloc[0].apn).all()
def same_actual_price(df):
return (df.actual_price == df.iloc[0].actual_price).all()
def make_data_dict(key, matched):
'return dictionary with the columns we want in the price history'
# common columns
assert isinstance(key, (ResultKeyEn, ResultKeyGbr, ResultKeyRfr)), (key, type(key))
# common results (across all methods)
result = {
'apn': matched.apn,
'year': matched.year,
'month': matched.month,
'day': matched.day,
'sequence_number': matched.sequence_number,
'date': matched.date,
'price_actual': price_actual,
'price_estimated': price_estimated,
'n_months_back': key.n_months_back,
'method': (
'rfr' if isinstance(key, ResultKeyRfr) else
'gbr' if isinstance(key, ResultKeyGbr) else
'en'
),
}
# add columns appropriate for the type of result
if isinstance(key, (ResultKeyRfr, ResultKeyGbr)):
result.update({
'n_estimators': key.n_estimators,
'max_features': key.max_features,
'max_depth': key.max_depth,
})
if isinstance(key, ResultKeyGbr):
result.update({
'loss': key.loss,
'learning_rate': key.learning_rate,
})
if isinstance(key, ResultKeyEn):
result.update({
'units_X': key.units_X,
'units_y': key.units_y,
'alpha': key.alpha,
'l1_ratio': key.l1_ratio,
})
return result
debug = False
verbose = False
if debug:
print validation_year, validation_month,
print len(ids)
print ids[:5]
print key
print 'len(price_history)', 0 if price_history is None else len(price_history)
assert len(valavm_result_value.actuals) == len(valavm_result_value.predictions)
counter = collections.Counter()
counter_key = 'year, month, price matched %d times'
result_price_history = price_history
for price_actual, price_estimated in zip(valavm_result_value.actuals, valavm_result_value.predictions):
# print price_actual, price_estimated
mask1 = ids.year == validation_year
mask2 = ids.month == validation_month
mask3 = ids.actual_price == price_actual
mask = mask1 & mask2 & mask3
if debug:
print 'transaction in validation year: ', sum(mask1)
print 'transactions in validation month: ', sum(mask2)
print 'transactions with same actual price:', sum(mask3)
print 'transactions that match on all: ', sum(mask)
print type(key)
matched_df = ids[mask]
if len(matched_df) == 1:
counter[counter_key % 1] += 1
matched = matched_df.iloc[0]
print 'single match', matched_df.index[0]
data_dict = make_data_dict(key, matched)
df_new = pd.DataFrame(
data=data_dict,
index=[0 if result_price_history is None else len(result_price_history)],
)
# pdb.set_trace()
result_price_history = (
df_new if result_price_history is None else
result_price_history.append(df_new, verify_integrity=True)
)
else:
counter[counter_key % len(matched_df)] += 1
if len(matched_df) > 0:
if verbose:
print 'valavm matched %d training transactions' % len(matched_df)
print matched_df
continue
if control.debug and len(result_price_history) > 2:
print 'DEBUG: breaking out of update_price_history'
break
n_price_histories = 0 if result_price_history is None else len(result_price_history)
print 'identified %d price histories' % n_price_histories
return result_price_history, counter
ids = pd.read_csv(
control.path_in_transactions,
index_col=0,
low_memory=False,
)
print 'reducing', path
model = {}
counter = collections.Counter()
input_record_number = 0
actuals = None
key = None # most recently read key; referenced by the ValueError handler below
updated_price_history = None # mutated in the while True loop below
validation_year_month = int(path.split('/')[-1].split('.')[0])
validation_year = int(validation_year_month / 100)
validation_month = int(validation_year_month - validation_year * 100)
assert validation_year_month == validation_year * 100 + validation_month
with open(path, 'rb') as f:
while True: # process each record in path
counter['attempted to read'] += 1
input_record_number += 1
if control.debug and input_record_number > 10:
print 'DEBUG: breaking out of record read in path', path
break
try:
# model[model_key] = error_analysis, for next model result
record = pickle.load(f)
counter['actually read'] += 1
assert isinstance(record, tuple), type(record)
assert len(record) == 2, len(record)
key, value = record
assert len(value) == 2, len(value)
# NOTE: importances is not used
valavm_result_value, importances = value
# type(valavm_result_value) == namedtuple with fields actuals, predictions
# the fields are parallel, corresponding transaction to transaction
if len(ids) < 100000:
print 'WARNING: TRUNCATED IDS'
# verify that actuals is always the same
if actuals is not None:
assert np.array_equal(actuals, valavm_result_value.actuals)
actuals = valavm_result_value.actuals
# verify that each model_key occurs at most once in the validation month
model_key = make_model_description(key)
updated_price_history, update_counter = update_price_history(
price_history=updated_price_history,
ids=ids,
validation_year=validation_year,
validation_month=validation_month,
valavm_result_value=valavm_result_value,
valavm_key=key,
)
# NOTE: the counters are the same for every path because the actual transactions are the same
if input_record_number == 1:
print 'path update counters', path
for k, v in update_counter.iteritems():
print k, v
if model_key in model:
print '++++++++++++++++++++++'
print path, model_key
print 'duplicate model key'
pdb.set_trace()
print '++++++++++++++++++++++'
model[model_key] = make_model_result(valavm_result_value)
except ValueError as e:
counter['ValueError'] += 1
if key is not None:
print key
print 'ignoring ValueError in record %d: %s' % (input_record_number, e)
except EOFError:
counter['EOFError'] += 1
print 'found EOFError in record %d of path %s' % (input_record_number, path)
print 'continuing'
if input_record_number == 1 and False:
# with locality == city, a file can be empty
control.errors.append('eof record 1; path = %s' % path)
break
except pickle.UnpicklingError as e:
counter['UnpicklingError'] += 1
print 'cPickle.UnpicklingError in record %d: %s' % (input_record_number, e)
return model, actuals, counter, updated_price_history
reduction = collections.defaultdict(dict)
all_actuals = collections.defaultdict(dict)
paths = sorted(glob.glob(control.path_in_valavm))
assert len(paths) > 0, paths
counters = {}
all_price_histories = None
for path in paths:
model, actuals, counter, price_history = process_path(path)
all_price_histories = (
price_history if all_price_histories is None else
all_price_histories.append(price_history, verify_integrity=True, ignore_index=True)
)
# type(model) is dict[ModelDescription] ModelResults
# sort models by increasing ModelResults.mae
sorted_models = collections.OrderedDict(sorted(model.items(), key=lambda t: t[1].mae))
check_key_order(sorted_models)
if control.arg.locality == 'global':
base_name, suffix = path.split('/')[-1].split('.')
validation_month = base_name
reduction[validation_month] = sorted_models
all_actuals[validation_month] = actuals
elif control.arg.locality == 'city':
base_name, suffix = path.split('/')[-1].split('.')
validation_month, city_name = base_name.split('-')
# some file systems create all upper case names
# some create mixed-case names
# we map each to upper case
city_name_used = city_name.upper()
reduction[city_name_used][validation_month] = sorted_models
all_actuals[city_name_used][validation_month] = actuals
else:
print 'unexpected locality', control.arg.locality
pdb.set_trace()
counters[path] = counter
if control.debug and len(counters) > 1:
print 'DEBUG: stopping iteration over paths'
break
if control.test:
break
return reduction, all_actuals, counters, all_price_histories
def make_subset_global(reduction, fraction):
'return a random sample of the reduction stratified by validation_month as an ordereddict'
# use same keys (models) every validation month
# generate candidate for common keys in the subset
subset_common_keys = None
for validation_month, validation_dict in reduction.iteritems():
if len(validation_dict) == 0:
print 'zero length validation dict', validation_month
pdb.set_trace()
keys = validation_dict.keys()
n_to_keep = int(len(keys) * fraction)
subset_common_keys_list = random.sample(keys, n_to_keep)
subset_common_keys = set(subset_common_keys_list)
break
# remove keys from subset_common_keys that are not in each validation_month
print 'n candidate common keys', len(subset_common_keys)
for validation_month, validation_dict in reduction.iteritems():
print 'make_subset', validation_month
validation_keys = set(validation_dict.keys())
# collect missing keys first: mutating the set while iterating over it raises
# a RuntimeError, and set(key) would wrongly expand key into its elements
missing_keys = set(key for key in subset_common_keys if key not in validation_keys)
for key in missing_keys:
print 'not in', validation_month, ': ', key
subset_common_keys -= missing_keys
print 'n final common keys', len(subset_common_keys)
# build reduction subset using the actual common keys
results = {}
for validation_month, validation_dict in reduction.iteritems():
d = {
common_key: validation_dict[common_key]
for common_key in subset_common_keys
}
# sort by MAE, low to high
od = collections.OrderedDict(sorted(d.items(), key=lambda x: x[1].mae))
results[validation_month] = od
return results
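# Hedged sizing note, as wired up in main below via make_subset:
#   subset = make_subset_global(reduction, fraction=0.02)  # control.sampling_rate
# keeps roughly 2% of the model keys sampled from the first validation month,
# drops any of them missing from a later month, and re-sorts each month's
# surviving models by increasing mae.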
def make_subset_city(reduction, path_interesting_cities):
'return reduction for just the interesting cities'
result = {}
if len(reduction) <= 6:
return reduction
with open(path_interesting_cities, 'r') as f:
lines = f.readlines()
no_newlines = [line.rstrip('\n') for line in lines]
for interesting_city in no_newlines:
if interesting_city in reduction:
result[interesting_city] = reduction[interesting_city]
else:
print 'not in reduction', interesting_city
pdb.set_trace()
return result
def make_subset(reduction, fraction, locality, interesting_cities):
'return dict of type type(reduction) but with a randomly chosen subset of size fraction * len(reduction)'
if locality == 'global':
return make_subset_global(reduction, fraction)
elif locality == 'city':
return make_subset_city(reduction, interesting_cities)
else:
print 'bad locality', locality
pdb.set_trace()
def make_norwalk(reduction):
'return dict of type(reduction) with just the Norwalk data items'
city = 'NORWALK'
result = {city: reduction[city]}
return result
def make_median_price(path, cities):
'return dict[Month] median_price or dict[city][Month] median_price'
def median_price(df, month):
in_month = df.month == month
result = df[in_month].price.median()
return result
with open(path, 'rb') as f:
df, reduction_control = pickle.load(f)
all_months = set(df.month)
if cities:
all_cities = set(df.city)
result = collections.defaultdict(dict)
for city in all_cities:
in_city = df.city == city
result[city] = {month: median_price(df[in_city], month) for month in all_months}
else:
result = {month: median_price(df, month) for month in all_months}
return result
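# Hedged shape sketch (months are the values of chart01's df.month column;
# prices invented):
#   cities=False: {month: 550000.0, ...}
#   cities=True:  {'NORWALK': {month: 480000.0, ...}, ...}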
class ReportReduction(object):
def __init__(self, counters):
self._report = self._make_report(counters)
def write(self, path):
self._report.write(path)
def _make_report(self, counters):
r = Report()
r.append('Records retained while reducing input file')
for path, counter in counters.iteritems():
r.append(' ')
r.append('path %s' % path)
for tag, value in counter.iteritems():
r.append('%30s: %d' % (tag, value))
return r
def main(argv):
control = make_control(argv)
control = make_control(argv)
sys.stdout = Logger(logfile_path=control.path_out_log)
print control
lap = control.timer.lap
if control.arg.data:
if not control.debug:
median_price = make_median_price(control.path_in_chart_01_reduction, control.arg.locality == 'city')
lap('make_median_price')
reduction, all_actuals, counters, all_price_histories = make_reduction(control)
lap('make_reduction')
with open(control.path_all_price_histories, 'wb') as f:
print 'len(all_price_histories)', len(all_price_histories)
print 'columns', all_price_histories.columns
pickle.dump(all_price_histories, f)
lap('write all price histories')
if len(control.errors) > 0:
print 'stopping because of errors'
for error in control.errors:
print error
pdb.set_trace()
lap('make_data')
ReportReduction(counters).write(control.path_out_data_report)
subset = make_subset(reduction, control.sampling_rate, control.arg.locality, control.path_in_interesting_cities)
lap('make_subset')
norwalk = make_norwalk(reduction) if control.arg.locality == 'city' else None
# check key order
def check_validation_month_keys(reduction):
for validation_month in reduction.keys():
check_key_order(reduction[validation_month])
if control.arg.locality == 'global':
check_validation_month_keys(reduction)
check_validation_month_keys(subset)
else:
for city in reduction.keys():
check_validation_month_keys(reduction[city])
for city in subset.keys():
check_validation_month_keys(subset[city])
lap('check key order')
output_all = (reduction, all_actuals, median_price, control)
output_samples = (subset, all_actuals, median_price, control)
output_norwalk = (norwalk, all_actuals, median_price, control)
lap('check key order')
with open(control.path_out_data, 'wb') as f:
pickle.dump(output_all, f)
lap('write all data')
with open(control.path_out_data_subset, 'wb') as f:
pickle.dump(output_samples, f)
lap('write samples')
if control.arg.locality == 'city':
with open(control.path_out_data_norwalk, 'wb') as f:
pickle.dump(output_norwalk, f)
lap('write norwalk')
else:
with open(control.path_in_data, 'rb') as f:
print 'reading reduction data file'
reduction, all_actuals, median_price, reduction_control = pickle.load(f)
lap('read input from %s' % control.path_in_data)
# check that the reduction dictionaries are ordered by mae
def check_order_months(d):
for validation_month, ordered_dict in d.iteritems():
check_key_order(ordered_dict)
if control.arg.locality == 'global':
check_order_months(reduction)
elif control.arg.locality == 'city':
for city, month_dict in reduction.iteritems():
check_order_months(month_dict)
make_charts(reduction, all_actuals, median_price, control)
print control
if control.test:
print 'DISCARD OUTPUT: test'
if control.debug:
print 'DISCARD OUTPUT: debug'
if control.arg.subset:
print 'DISCARD OUTPUT: subset'
if len(control.errors) != 0:
print 'DISCARD OUTPUT: ERRORS'
for error in control.errors:
print error
if len(control.exceptions) != 0:
print 'DISCARD OUTPUT: EXCEPTIONS'
for exception in control.exceptions:
print exception
print 'done'
return
if __name__ == '__main__':
if False:
# avoid pyflakes warnings
pdb.set_trace()
pprint()
pd.DataFrame()
np.array()
AVM()
ResultValue
main(sys.argv)