def main(argv):
    control = make_control(argv)
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control

    # create dataframes
    parcels_df = parcels.read(control.path, 10000 if control.test else None)
    print 'parcels df shape', parcels_df.shape

    # drop samples without the geographic indicator we will use;
    # add the zip5 field
    if control.arg.geo == 'zip5':
        parcels_df = parcels_df[parcels.mask_parcel_has_zipcode(parcels_df)]
        parcels_df[parcels.zip5] = pd.Series(data=parcels_df[parcels.zipcode] / 10000.0,
                                             dtype=np.int32,
                                             index=parcels_df.index)
    elif control.arg.geo == 'census_tract':
        # drop if no census tract
        parcels_df = parcels_df[parcels.mask_parcel_has_census_tract(parcels_df)]
    else:
        print 'bad control.arg.geo', control.arg.geo
        pdb.set_trace()

    # the computation runs out of memory on 64GB if all columns are retained,
    # so drop all but the columns needed
    parcels_df = just_used(control.arg.geo, parcels_df)

    parcels_sfr_df = parcels_df[parcels.mask_is_sfr(parcels_df)]
    print 'parcels sfr df shape', parcels_sfr_df.shape

    parcels_df.index = parcels_df.geo  # the index must be the geo field
    n_unique_indices = parcels_df.index.nunique()

    has_indicators, occurs = make_has_indicators(parcels_df, control.arg.geo)
    print 'has_indicators shape', has_indicators.shape
    print '# of unique geo codes', n_unique_indices
    assert has_indicators.shape[0] == n_unique_indices
    if control.test:
        print has_indicators

    # write the results
    has_indicators.to_csv(control.path_out_csv)
    f = open(control.path_out_occurs, 'wb')
    pickle.dump((occurs, control), f)
    f.close()

    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'
    return
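# A minimal sketch of the zip5 derivation above, assuming (as the code
# implies) that parcels.zipcode holds a 9-digit ZIP+4 code stored
# numerically, so that dividing by 10000 and truncating to int leaves the
# 5-digit prefix. The column names here are illustrative stand-ins, not
# the names the parcels module actually defines.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'zipcode': [900121234.0, 941033210.0]})
toy['zip5'] = (toy['zipcode'] / 10000.0).astype(np.int32)  # truncates toward zero
print toy  # zip5 is 90012 and 94103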
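# To consume the pickled output elsewhere, unpack the (occurs, control)
# tuple in the order it was dumped. The file name below is a hypothetical
# stand-in for the actual control.path_out_occurs value.
import pickle

f = open('occurs.pickle', 'rb')  # stand-in path, not the real output path
occurs, control = pickle.load(f)
f.close()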
def main(argv):
    control = make_control(argv)
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control
    # NOTE: Organize the computation to minimize memory usage
    # so that this code can run on smaller-memory systems

    def ps(name, value):
        s = value.shape
        print ' %20s shape (%d, %d)' % (name, s[0], s[1])

    # create dataframes
    n_read_if_test = 10000
    deeds_g_al = deeds.read_g_al(
        control.path,
        n_read_if_test if control.test else None,
    )
    parcels_sfr = parcels.read(
        control.path,
        n_read_if_test if control.test else None,
        just_sfr=True,
    )
    ps('original deeds g al', deeds_g_al)
    ps('original parcels sfr', parcels_sfr)

    # augment parcels to include a zip5 field (5-digit zip code);
    # drop samples without a zipcode
    # rationale: we use the zip5 to join the features derived from parcels,
    # and zip5 is derived from zipcode
    zipcode_present = parcels_sfr[parcels.zipcode].notnull()
    parcels_sfr = parcels_sfr[zipcode_present]
    parcels.add_zip5(parcels_sfr)

    # augment parcels and deeds to include a better APN
    print 'adding best apn column for parcels'
    new_column_parcels = best_apn(parcels_sfr, parcels.apn_formatted, parcels.apn_unformatted)
    parcels_sfr.loc[:, parcels.best_apn] = new_column_parcels  # generates an ignorable warning

    print 'adding best apn column for deeds'
    new_column_deeds = best_apn(deeds_g_al, deeds.apn_formatted, deeds.apn_unformatted)
    deeds_g_al.loc[:, deeds.best_apn] = new_column_deeds

    ps('revised deeds_g_al', deeds_g_al)
    ps('revised parcels_sfr', parcels_sfr)

    # join the deeds and parcels files
    print 'starting to merge'
    check_feature_names(deeds_g_al)
    check_feature_names(parcels_sfr)
    m1 = deeds_g_al.merge(parcels_sfr,
                          how='inner',
                          left_on=deeds.best_apn,
                          right_on=parcels.best_apn,
                          suffixes=('_deed', '_parcel'))
    check_feature_names(m1)
    del deeds_g_al
    del parcels_sfr
    ps('m1 merge deed + parcels', m1)

    # add in derived parcels features
    m2 = parcels_derived_features(control, m1)
    check_feature_names(m2)
    ps('m2 added parcels_derived', m2)
    del m1

    # add in census data
    census_features_df = read_census_features(control)
    m3 = m2.merge(
        census_features_df,
        left_on=transactions.census_tract,
        right_on='census_tract',
    )
    assert 'census_tract_x' in m3.columns
    assert 'census_tract_y' in m3.columns
    assert (m3.census_tract_x == m3.census_tract_y).all()
    assert 'census_tract' not in m3.columns
    pu.df_remove_column(m3, 'census_tract_x')
    pu.df_rename_column(m3, 'census_tract_y', 'census_tract')
    check_feature_names(m3)
    del m2
    ps('m3 merged census features', m3)

    # add in GPS coordinates
    geocoding_df = read_geocoding(control)
    m4 = m3.merge(
        geocoding_df,
        left_on='best_apn',
        right_on='G APN',
    )
    del geocoding_df
    del m3
    ps('m4 merged geocoding', m4)

    final = m4
    print 'final columns'
    for c in final.columns:
        print c,
    print
    cc('fraction', final)  # verify that fraction_owner_occupied is in the output
    print 'final shape', final.shape

    # write merged, augmented dataframe
    print 'writing final dataframe to csv file'
    final.to_csv(control.path_out_transactions)

    # write out all the column names
    print 'all column names in final dataframe'
    for name in final.columns:
        print name
        if '_y' in name:
            print 'found strange suffix'
            pdb.set_trace()

    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'
    return
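# best_apn is defined elsewhere in this repo. As a rough sketch only (an
# assumption about its intent, not the actual implementation), such a
# helper might prefer the formatted APN when present and fall back to the
# unformatted one; the column names below are hypothetical.
import pandas as pd

def best_apn_sketch(df, formatted_col, unformatted_col):
    # keep the formatted APN where it exists, otherwise use the unformatted APN
    return df[formatted_col].where(df[formatted_col].notnull(),
                                   df[unformatted_col])

toy = pd.DataFrame({'apn_formatted': ['123-456-789', None],
                    'apn_unformatted': ['123456789', '987654321']})
print best_apn_sketch(toy, 'apn_formatted', 'apn_unformatted')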
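# The census-merge asserts above rely on pandas' default suffixing: when a
# merge result would contain two columns with the same name, pandas keeps
# both copies and suffixes them _x (left) and _y (right). A minimal
# demonstration of that behavior on toy data (illustration only; these are
# not the repo's column names):
import pandas as pd

left = pd.DataFrame({'apn': [1, 2], 'tract': [101, 102]})
right = pd.DataFrame({'apn': [1, 2], 'tract': [101, 102]})
merged = left.merge(right, on='apn')
# the shared non-key column appears twice, with the default suffixes
print list(merged.columns)  # ['apn', 'tract_x', 'tract_y']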