forked from rlowrance/re-avm
-
Notifications
You must be signed in to change notification settings - Fork 0
/
predict.py
183 lines (156 loc) · 5.81 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
'''predict using many fitted models on the real estate data
INVOCATION
python predict.py SAMPLES MODEL TRANSACTIONMONTH NEIGHBORHOOD [--test] [--trace]
where
SAMPLES in {train, all} specifies which data in Working/samples2 to use
MODEL in {en, gb, rf} specifies which model to use
TRANSACTIONMONTH like YYYYMM specifies the last month of training data
NEIGHBORHOOD in {all, city_name} specifies whether to train a model on all cities or just the specified city
which will predict every sample read from the SAMPLES file, using each fitted model of kind MODEL found in the fit directory for the month before TRANSACTIONMONTH
INPUTS
WORKING/samples2/train.csv or WORKING/samples2/all.csv
WORKING/fit/SAMPLES-MODEL-{TRANSACTIONMONTH - 1}-NEIGHBORHOOD/*.pickle the fitted models
OUTPUTS
WORKING/predict[-test]/SAMPLES-MODEL-TRANSACTIONMONTH-NEIGHBORHOOD/predictions.pickle a dict
'''
from __future__ import division
import argparse
import collections
import cPickle as pickle
import datetime
import itertools
import numpy as np
import os
import pandas as pd
import pdb
from pprint import pprint
import random
import sklearn
import sys
import time
import arg_type
import AVM
from Bunch import Bunch
from columns_contain import columns_contain
import dirutility
from Features import Features
import HPs
import layout_transactions
from Logger import Logger
from Month import Month
from Path import Path
from Report import Report
from SampleSelector import SampleSelector
from valavmtypes import ResultKeyEn, ResultKeyGbr, ResultKeyRfr, ResultValue
from Timer import Timer
# from TimeSeriesCV import TimeSeriesCV
# short alias for columns_contain
cc = columns_contain
def make_control(argv):
    '''return a Bunch holding the parsed command line, the file paths, the seed, and a timer

    ARGS
    argv: list of str, the full command line; argv[0] is the script name and
          is consumed by the 'invocation' positional argument

    RETURNS a Bunch with these fields
    arg             : the argparse namespace, augmented with .me and .city
    path_in_fitted  : WORKING/fit/SAMPLES-MODEL-{prior month}-NEIGHBORHOOD/
    path_in_samples : WORKING/samples2/SAMPLES.csv
    path_out_file   : predictions.pickle inside the output directory
    path_out_log    : 0log.txt inside the output directory
    random_seed     : the value used to seed the random module
    timer           : a Timer

    SIDE EFFECTS
    prints argv, seeds the random module, and calls dirutility.assure_exists
    on the output directory
    '''
    print(argv)
    parser = argparse.ArgumentParser()
    parser.add_argument('invocation')
    parser.add_argument('samples', choices=['all', 'train'])
    parser.add_argument('model', choices=['en', 'gb', 'rf'])
    parser.add_argument('transaction_month')
    parser.add_argument('neighborhood')
    parser.add_argument('--test', action='store_true')
    parser.add_argument('--trace', action='store_true')
    arg = parser.parse_args(argv)

    # The output directory is named after the script. Strip any leading
    # directory first so that ./predict.py and src/predict.py both give
    # 'predict' (splitting the raw path on '.' gave '' and 'src/predict').
    arg.me = os.path.basename(arg.invocation).split('.')[0]

    if arg.trace:
        pdb.set_trace()

    # convert arg.neighborhood into arg.city: None means all cities;
    # otherwise underscores in the command-line value stand for spaces
    arg.city = (
        None if arg.neighborhood == 'all' else
        arg.neighborhood.replace('_', ' ')
    )

    random_seed = 123
    random.seed(random_seed)

    # the fitted models are read from the directory for the month before
    # the transaction month; the predictions are written under the
    # transaction month itself
    prior_month = Month(arg.transaction_month).decrement().as_str()
    in_dir = '%s-%s-%s-%s' % (arg.samples, arg.model, prior_month, arg.neighborhood)
    out_dir = '%s-%s-%s-%s' % (arg.samples, arg.model, arg.transaction_month, arg.neighborhood)

    dir_working = Path().dir_working()
    # test runs write to a separate '<me>-test' tree
    output_dir = (
        os.path.join(dir_working, arg.me + '-test', out_dir, '') if arg.test else
        os.path.join(dir_working, arg.me, out_dir, '')
    )
    dirutility.assure_exists(output_dir)

    return Bunch(
        arg=arg,
        path_in_fitted=os.path.join(dir_working, 'fit', in_dir, ''),
        path_in_samples=os.path.join(dir_working, 'samples2', arg.samples + '.csv'),
        path_out_file=os.path.join(output_dir, 'predictions.pickle'),
        path_out_log=os.path.join(output_dir, '0log.txt'),
        random_seed=random_seed,
        timer=Timer(),
    )
def do_work(control):
    '''predict every sample with each fitted model and write the results to a pickle file

    ARGS
    control: the Bunch returned by make_control; this function reads
             control.path_in_samples, control.path_in_fitted,
             control.path_out_file, and control.arg.test

    OUTPUT written to control.path_out_file, a pickled dict with keys
    'apns'            : the layout_transactions.apn column of the samples
    'sale_dates'      : the layout_transactions.sale_date column of the samples
    'hps_predictions' : dict mapping the hyperparameter string (the fitted
                        model's file name without '.pickle') to the
                        predictions for all the samples

    RETURNS None
    '''
    samples = pd.read_csv(
        control.path_in_samples,
        nrows=10 if control.arg.test else None,  # test runs read only 10 rows
        usecols=None,  # TODO: change to columns we actually use
        low_memory=False,
    )
    apns = samples[layout_transactions.apn]
    sale_dates = samples[layout_transactions.sale_date]
    print('read %d rows of samples from file %s' % (len(samples), control.path_in_samples))

    # iterate over the fitted models
    suffix_we_process = '.pickle'
    hps_predictions = {}
    # count the files explicitly: the loop variable filenames is never bound
    # when os.walk yields nothing (for example, a missing directory)
    n_files_walked = 0
    for root, dirnames, filenames in os.walk(control.path_in_fitted):
        assert len(dirnames) == 0, dirnames  # the fitted directory must be flat
        print('%s %s' % (root, len(filenames)))
        n_files_walked += len(filenames)
        for filename in filenames:
            if not filename.endswith(suffix_we_process):
                print('skipping file without a fitted model: %s' % filename)
                continue
            # the file name (without the suffix) encodes the hyperparameters
            hps_string = filename[:-len(suffix_we_process)]
            hps = HPs.from_str(hps_string)
            path_to_file = os.path.join(root, filename)
            # pickle files must be opened in binary mode
            # NOTE: unpickling can execute arbitrary code; only read files
            # produced by our own fit step
            with open(path_to_file, 'rb') as f:
                # each file holds a pair: (True, fitted model) or (False, error message)
                ok, fitted_model = pickle.load(f)
            if not ok:
                print('did not predict samples using fitted model %s; reason: %s' % (
                    filename,
                    fitted_model,  # an error message
                ))
                continue
            print('predicting samples using fitted model %s' % filename)
            # the targets are not needed in order to predict
            X, _ = Features().extract_and_transform(samples, hps['units_X'], hps['units_y'])
            predictions = fitted_model.predict(X)
            assert len(predictions) == len(samples)
            assert hps_string not in hps_predictions
            hps_predictions[hps_string] = predictions

    # have all the predictions for all filenames (= a set of hyperparameters)
    print('walked all %d files' % n_files_walked)
    out = {
        'apns': apns,
        'sale_dates': sale_dates,
        'hps_predictions': hps_predictions,
    }
    with open(control.path_out_file, 'wb') as f:
        pickle.dump(out, f)
    print('wr0te results to %s' % control.path_out_file)
    return
def main(argv):
    '''run the whole job: build the control Bunch, predict, and report

    ARGS
    argv: list of str, the full command line, passed to make_control
    '''
    control = make_control(argv)
    # replace stdout with a Logger built on the log path, so that the print
    # output below also goes to the log file
    sys.stdout = Logger(control.path_out_log)
    print(control)

    record_lap = control.timer.lap
    do_work(control)
    record_lap('work completed')

    if control.arg.test:
        print('DISCARD OUTPUT: test')
    print(control)
    print('done')
if __name__ == '__main__':
    if False:
        # never executed: these references exist only so that pyflakes
        # does not report the corresponding imports as unused
        np.array()
        pd.DataFrame()
        pprint()
        pdb.set_trace()
    # hand the complete command line, script name included, to main
    main(sys.argv)