'''create training and test samples from the transactions

* add certain derived fields
* select only the transactions that contain "reasonable" values

INPUT FILES
 WORKING/transactions-al-g-sfr.csv
OUTPUT FILES
 WORKING/samples-test.csv
 WORKING/samples-train.csv
 WORKING/samples-train-validate.csv
 WORKING/samples-validate.csv
 WORKING/samples-info-reasonable.pickle
'''
from __future__ import division
import cPickle as pickle
import datetime
import numpy as np
import pandas as pd
import pdb
from pprint import pprint
import random
from sklearn import cross_validation
import sys
from Bunch import Bunch
from columns_contain import columns_contain
from Features import Features
from Logger import Logger
from ParseCommandLine import ParseCommandLine
from Path import Path
import layout_transactions as layout
cc = columns_contain

def usage(msg=None):
    if msg is not None:
        print msg
    print 'usage : python samples.py [--test]'
    print ' --test: run in test mode'
    sys.exit(1)

def make_control(argv):
    # return a Bunch
    print argv
    if len(argv) not in (1, 2):
        usage('invalid number of arguments')

    pcl = ParseCommandLine(argv)
    arg = Bunch(
        base_name=argv[0].split('.')[0],
        test=pcl.has_arg('--test'),
    )

    random_seed = 123
    random.seed(random_seed)

    dir_working = Path().dir_working()
    debug = False
    out_file_name_base = ('testing-' if arg.test else '') + arg.base_name

    return Bunch(
        arg=arg,
        debug=debug,
        fraction_test=0.2,
        max_sale_price=85e6,  # according to Wall Street Journal
        path_in=dir_working + 'transactions-al-g-sfr.csv',
        path_out_info_reasonable=dir_working + out_file_name_base + '-info-reasonable.pickle',
        path_out_test=dir_working + out_file_name_base + '-test.csv',
        path_out_train=dir_working + out_file_name_base + '-train.csv',
        path_out_train_validate=dir_working + out_file_name_base + '-train-validate.csv',
        path_out_validate=dir_working + out_file_name_base + '-validate.csv',
        random_seed=random_seed,
        test=arg.test,
    )
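
# Illustrative (not executed): invoked as 'python samples.py', base_name is
# 'samples', so path_out_test is WORKING/samples-test.csv; with --test the
# outputs are prefixed instead, e.g. WORKING/testing-samples-test.csv.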

def report_and_remove(df, keep_masks):
    'return new DataFrame with just the kept rows, plus the removal info that is printed'
    print 'impact of individual masks'
    format_string = '%40s removed %6d samples (%3d%%)'
    info = {}
    for name in sorted(keep_masks.keys()):
        keep_mask = keep_masks[name]
        n_removed = len(df) - sum(keep_mask)
        fraction_removed = n_removed / len(df)
        print format_string % (name, n_removed, 100.0 * fraction_removed)
        info[name] = fraction_removed
    mm = reduce(lambda a, b: a & b, keep_masks.values())
    total_removed = len(df) - sum(mm)
    total_fraction_removed = total_removed / len(df)
    print format_string % ('*** in combination', total_removed, 100.0 * total_fraction_removed)
    r = df[mm]
    return r, info
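
# Illustrative sketch (not executed) of how report_and_remove combines masks:
# a row survives only if every mask keeps it.
#   toy = pd.DataFrame({'price': [0, 5, 9]})
#   masks = {'price > 0': toy['price'] > 0,
#            'price < 9': toy['price'] < 9}
#   combined = reduce(lambda a, b: a & b, masks.values())  # False, True, False
#   toy[combined]  # keeps only the row with price == 5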

def check_never_missing(df, feature_names):
    'verify that each ege feature is always present'
    print
    print 'Each of these fields should never be missing'
    total_missing = 0
    format_string = 'field %40s is missing %7d times'
    for name in feature_names:
        count_missing = sum(pd.isnull(df[name]))
        print format_string % (name, count_missing)
        total_missing += count_missing
    print format_string % ('** total across fields **', total_missing)
    print 'total missing', total_missing
    if total_missing > 0:
        pdb.set_trace()  # there should not be any missing values

def check_no_zeros(df, feature_names):
    'check that only a few expected fields have zero values'
    # TODO: some of the features can legitimately have zero values!
    total_zeros = 0
    format_string = 'field %40s is zero %7d times'
    for name in feature_names:
        if ('_has_' in name) or ('is_' in name):
            continue  # these are 0/1 indicator features
        if name in (layout.building_bedrooms,
                    layout.building_basement_square_feet,
                    layout.building_fireplace_number,
                    layout.census2000_fraction_owner_occupied,
                    layout.has_pool,
                    layout.parking_spaces,
                    ):
            continue  # these fields are also often legitimately zero
        count_zero = sum(df[name] == 0)
        print format_string % (name, count_zero)
        total_zeros += count_zero
    print format_string % ('** total across fields **', total_zeros)
    if total_zeros > 0:
        print 'found some unexpected zero values'
        pdb.set_trace()  # there should not be any zeros

def check_feature_values(df):
    # age features are added just before a model is fitted
    feature_names = sorted([x for x, y in Features().ege() if 'age' not in x])
    check_never_missing(df, feature_names)
    check_no_zeros(df, feature_names)

def reasonable_feature_values(df, control):
    'return new DataFrame containing the samples in df that have "reasonable" values'
    def below(percentile, series):
        quantile_value = series.quantile(percentile / 100.0)
        return series < quantile_value

    # set a mask value in m to True to keep the observation
    m = {}
    m['assessment total > 0'] = df[layout.assessment_total] > 0
    m['assessment land > 0'] = df[layout.assessment_land] > 0
    m['assessment improvement > 0'] = df[layout.assessment_improvement] > 0
    m['baths > 0'] = df[layout.building_baths] > 0
    m['effective year built >= year built'] = df[layout.year_built_effective] >= df[layout.year_built]
    m['full price'] = layout.mask_full_price(df)
    m['latitude known'] = layout.mask_gps_latitude_known(df)
    m['longitude known'] = layout.mask_gps_longitude_known(df)
    m['land size < 99th percentile'] = below(99, df[layout.lot_land_square_feet])
    m['land size > 0'] = df[layout.lot_land_square_feet] > 0
    m['living size < 99th percentile'] = below(99, df[layout.building_living_square_feet])
    m['living square feet > 0'] = df[layout.building_living_square_feet] > 0
    m['median household income > 0'] = df[layout.census2000_median_household_income] > 0
    m['new or resale'] = layout.mask_new_or_resale(df)
    m['one building'] = layout.mask_is_one_building(df)
    m['one APN'] = layout.mask_is_one_parcel(df)
    # m['recording date present'] = ~df[layout.recording_date + '_deed'].isnull()  # ~ => not
    m['price > 0'] = df[layout.price] > 0
    m['price < max'] = df[layout.price] < control.max_sale_price
    m['rooms > 0'] = df[layout.building_rooms] > 0
    m['resale or new construction'] = (
        layout.mask_is_new_construction(df) |
        layout.mask_is_resale(df)
    )
    m['sale date present'] = layout.mask_sale_date_present(df)
    m['sale date valid'] = layout.mask_sale_date_valid(df)
    m['stories > 0'] = df[layout.building_stories] > 0
    m['units == 1'] = df[layout.n_units] == 1
    m['year_built > 0'] = df[layout.year_built] > 0
    print 'effects of reasonable values'
    return report_and_remove(df, m)
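
# Illustrative sketch (not executed) of the below() percentile filter:
#   s = pd.Series([1, 2, 3, 1000])
#   s.quantile(0.99)  # about 970.09 under pandas' linear interpolation
#   below(99, s)      # True, True, True, False; the outlier row is dropped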

def add_features(df, control):
    def split(date):
        'yyyymmdd int --> (year, month, day) ints'
        year = int(date / 10000)
        md = date - year * 10000
        month = int(md / 100)
        day = md - month * 100
        return year, month, day

    def python_date(date):
        'yyyymmdd --> datetime.date(year, month, day)'
        year, month, day = split(date)
        return datetime.date(int(year), int(month), int(day))

    def year(date):
        'yyyymmdd --> year of the sale, as an int'
        year, month, day = split(date)
        return year

    def yyyymm(date):
        'yyyymmdd --> yyyymm int'
        year, month, day = split(date)
        return year * 100 + month
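    # Example (illustrative, not executed): for the yyyymmdd integer 20050731,
    #   split(20050731)       --> (2005, 7, 31)
    #   python_date(20050731) --> datetime.date(2005, 7, 31)
    #   yyyymm(20050731)      --> 200507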

    # create sale-date related features
    def append_column(name, values):
        df.insert(len(df.columns),
                  name,
                  pd.Series(values, index=df.index),
                  )

    sale_date = df[layout.sale_date]
    sale_date_python = sale_date.apply(python_date)
    append_column(layout.sale_date_python, sale_date_python)
    append_column(layout.yyyymm, sale_date.apply(yyyymm))

    # create age and similar fields
    # NOTE: these are ages at the sale date
    sale_year = sale_date.apply(year)
    year_built = df[layout.year_built]
    effective_year_built = df[layout.year_built_effective]
    age = sale_year - year_built
    effective_age = sale_year - effective_year_built
    append_column(layout.age, age)
    append_column(layout.age2, age * age)
    append_column(layout.age_effective, effective_age)
    append_column(layout.age_effective2, effective_age * effective_age)

    # create indicator features
    append_column(layout.is_new_construction, layout.mask_is_new_construction(df))
    append_column(layout.is_resale, layout.mask_is_resale(df))
    append_column(layout.building_has_basement, df[layout.building_basement_square_feet] > 0)
    append_column(layout.building_has_fireplace, df[layout.building_fireplace_number] > 0)
    append_column(layout.has_parking, df[layout.parking_spaces] > 0)
    append_column(layout.has_pool, df[layout.pool_flag] == 'Y')

    # create additional indicators aggregating certain PROPN codes
    def create(new_column_base, ored_column_bases):
        def create2(prefix):
            def ored_name(ored_index):
                return prefix + '_has_' + ored_column_bases[ored_index]
            mask = df[ored_name(0)]
            for index in range(1, len(ored_column_bases)):
                mask = mask | df[ored_name(index)]
            append_column(prefix + '_has_' + new_column_base, mask)
        for prefix in ('census_tract', 'zip5'):
            create2(prefix)

    create('any_commercial', ('commercial', 'commercial_condominium',))
    create('any_industrial', ('industrial', 'industrial_light', 'industrial_heavy',))
    create('any_non_residential', ('amusement', 'any_commercial', 'financial_institution', 'hotel',
                                   'any_industrial', 'medical', 'office_building', 'parking',
                                   'retail', 'service', 'transport', 'utilities', 'warehouse',))
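
# Illustrative (not executed): create('any_commercial', ('commercial',
# 'commercial_condominium')) ORs df['census_tract_has_commercial'] with
# df['census_tract_has_commercial_condominium'] into a new column
# 'census_tract_has_any_commercial', and does the same for the 'zip5' prefix.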

def main(argv):
    control = make_control(argv)
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control

    transactions = pd.read_csv(control.path_in,
                               nrows=100000 if control.arg.test else None,
                               )
    print 'transactions shape', transactions.shape

    after_2000_census_known = transactions[layout.mask_sold_after_2002(transactions)]
    print 'after 2000 census known shape', after_2000_census_known.shape

    subset, info_reasonable = reasonable_feature_values(after_2000_census_known, control)
    print 'subset shape', subset.shape

    add_features(subset, control)
    check_feature_values(subset)

    # split into test and train
    # stratify by yyyymm (month of sale)
    def count_yyyymm(df, yyyymm):
        return sum(df[layout.yyyymm] == yyyymm)

    def split(df, fraction_test):
        sss = cross_validation.StratifiedShuffleSplit(
            y=df.yyyymm,
            n_iter=1,
            test_size=fraction_test,
            train_size=None,
            random_state=control.random_seed,
        )
        assert len(sss) == 1
        for train_index, test_index in sss:
            train = df.iloc[train_index]
            test = df.iloc[test_index]
        return test, train
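
    # Note: StratifiedShuffleSplit (from the pre-0.18 sklearn.cross_validation
    # module) draws the test rows so that each yyyymm stratum appears in the
    # test set in roughly the same proportion as in df, so every sale month is
    # represented on both sides of the split.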

    # split the samples into test and train, stratified on month of sale;
    # then split the training samples again into train_test (a validation
    # set) and train_train
    test, train = split(subset, control.fraction_test)
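    # Arithmetic: with fraction_test = 0.2, the second split takes
    # 0.2 / (1 - 0.2) = 0.25 of the remaining 80%, i.e. another 20% of the
    # original samples, so test and train_test end up about the same size.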
    fraction_test = control.fraction_test / (1 - control.fraction_test)
    train_test, train_train = split(train, fraction_test)

    # write the csv files
    test.to_csv(control.path_out_test)
    train.to_csv(control.path_out_train)
    train_test.to_csv(control.path_out_train_validate)
    train_train.to_csv(control.path_out_validate)

    # count samples in each stratum (= month of sale)
    yyyymms = sorted(set(subset[layout.yyyymm]))
    format_string = '%6d # total %6d # test %6d # train %6d # train_test %6d # train_train %6d'
    for yyyymm in yyyymms:
        c1 = count_yyyymm(subset, yyyymm)
        c2 = count_yyyymm(test, yyyymm)
        c3 = count_yyyymm(train, yyyymm)
        c4 = count_yyyymm(train_test, yyyymm)
        c5 = count_yyyymm(train_train, yyyymm)
        print format_string % (yyyymm, c1, c2, c3, c4, c5)
        if c1 != c2 + c3:
            print 'c1 not exactly split'
            pdb.set_trace()
        if c3 != c4 + c5:
            print 'c3 not exactly split'
            pdb.set_trace()
    print 'totals'
    print format_string % (0, len(subset), len(test), len(train), len(train_test), len(train_train))

    f = open(control.path_out_info_reasonable, 'wb')
    pickle.dump((info_reasonable, control), f)
    f.close()

    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'
    return

if __name__ == '__main__':
    if False:
        # avoid pyflakes warnings
        pdb.set_trace()
        pprint()
        pd.DataFrame()
        np.array()
    main(sys.argv)