Example #1
# stdlib and third-party imports used below; Logger, make_control, just_used,
# make_has_indicators, and the parcels module are project-local helpers
import pdb
import pickle
import sys

import numpy as np
import pandas as pd


def main(argv):
    control = make_control(argv)
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control

    # create dataframes
    parcels_df = parcels.read(control.path, 10000 if control.test else None)
    print 'parcels df shape', parcels_df.shape

    # drop samples without the geographic indicator we will use
    # add zip5 field
    if control.arg.geo == 'zip5':
        parcels_df = parcels_df[parcels.mask_parcel_has_zipcode(parcels_df)]
        # derive zip5 from the 9-digit zipcode, e.g. 900121234 / 10000 -> 90012
        parcels_df[parcels.zip5] = pd.Series(data=parcels_df[parcels.zipcode] / 10000.0,
                                             dtype=np.int32,
                                             index=parcels_df.index)
    elif control.arg.geo == 'census_tract':
        # drop if no census tract
        parcels_df = parcels_df[parcels.mask_parcel_has_census_tract(parcels_df)]
    else:
        print 'bad control.arg.geo', control.arg.geo
        pdb.set_trace()

    # the computation runs out of memory on 64GB if all columns are retained
    # so drop all but the columns needed
    parcels_df = just_used(control.arg.geo, parcels_df)
    parcels_sfr_df = parcels_df[parcels.mask_is_sfr(parcels_df)]

    print 'parcels sfr df shape', parcels_sfr_df.shape

    parcels_df.index = parcels_df.geo  # the index must be the geo field
    n_unique_indices = parcels_df.index.nunique()
    has_indicators, occurs = make_has_indicators(parcels_df, control.arg.geo)

    print 'has_indicators shape', has_indicators.shape
    print '# of unique geo codes', n_unique_indices
    assert has_indicators.shape[0] == n_unique_indices
    if control.test:
        print has_indicators

    # write the results
    has_indicators.to_csv(control.path_out_csv)
    with open(control.path_out_occurs, 'wb') as f:
        pickle.dump((occurs, control), f)

    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'

    return
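Both examples route stdout through Logger, which is defined elsewhere in this project and not shown. A minimal sketch of the tee-style logger the calls above appear to assume (the base_name parameter comes from the calls; the log-file naming scheme here is a guess, not the project's actual API):

import sys


class Logger(object):
    # hypothetical sketch: tee everything written to stdout into a log file
    # while still echoing it to the terminal
    def __init__(self, base_name):
        self.terminal = sys.stdout
        self.log = open(base_name + '.log', 'w')  # assumed naming scheme

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()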
Example #2
# stdlib imports used below; Logger, make_control, best_apn, cc,
# check_feature_names, parcels_derived_features, read_census_features,
# read_geocoding, and the deeds/parcels/transactions/pu modules are
# project-local helpers
import pdb
import sys


def main(argv):
    control = make_control(argv)
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control

    # NOTE: Organize the computation to minimize memory usage
    # so that this code can run on smaller-memory systems

    def ps(name, value):
        s = value.shape
        print '  %20s shape (%d, %d)' % (name, s[0], s[1])

    # create dataframes
    n_read_if_test = 10000
    deeds_g_al = deeds.read_g_al(
        control.path,
        n_read_if_test if control.test else None,
    )
    parcels_sfr = parcels.read(
        control.path,
        n_read_if_test if control.test else None,
        just_sfr=True,
    )
    ps('original deeds g al', deeds_g_al)
    ps('original parcels sfr', parcels_sfr)

    # augment parcels to include a zip5 field (5-digit zip code)
    # drop samples without a zipcode
    # rationale: we use the zip5 to join the features derived from parcels
    # and zip5 is derived from zipcode
    zipcode_present = parcels_sfr[parcels.zipcode].notnull()
    parcels_sfr = parcels_sfr[zipcode_present]
    parcels.add_zip5(parcels_sfr)

    # augment parcels and deeds to include a better APN
    print 'adding best apn column for parcels'
    new_column_parcels = best_apn(parcels_sfr, parcels.apn_formatted, parcels.apn_unformatted)
    parcels_sfr.loc[:, parcels.best_apn] = new_column_parcels  # generates an ignorable warning

    print 'adding best apn column for deeds'
    new_column_deeds = best_apn(deeds_g_al, deeds.apn_formatted, deeds.apn_unformatted)
    deeds_g_al.loc[:, deeds.best_apn] = new_column_deeds

    ps('revised deeds_g_al', deeds_g_al)
    ps('revised parcels_sfr', parcels_sfr)

    # join the deeds and parcels files
    print 'starting to merge'
    check_feature_names(deeds_g_al)
    check_feature_names(parcels_sfr)
    m1 = deeds_g_al.merge(parcels_sfr,
                          how='inner',
                          left_on=deeds.best_apn,
                          right_on=parcels.best_apn,
                          suffixes=('_deed', '_parcel'))
    check_feature_names(m1)
    del deeds_g_al
    del parcels_sfr
    ps('m1 merge deed + parcels', m1)

    # add in derived parcels features
    m2 = parcels_derived_features(control, m1)
    check_feature_names(m2)
    ps('m2 added parcels_derived', m2)
    del m1

    # add in census data
    census_features_df = read_census_features(control)
    m3 = m2.merge(
        census_features_df,
        left_on=transactions.census_tract,
        right_on="census_tract",
    )
    assert 'census_tract_x' in m3.columns
    assert 'census_tract_y' in m3.columns
    assert (m3.census_tract_x == m3.census_tract_y).all()
    assert 'census_tract' not in m3.columns
    pu.df_remove_column(m3, 'census_tract_x')
    pu.df_rename_column(m3, 'census_tract_y', 'census_tract')
    check_feature_names(m3)
    del m2
    ps('m3 merged census features', m3)

    # add in GPS coordinates
    geocoding_df = read_geocoding(control)
    m4 = m3.merge(
        geocoding_df,
        left_on="best_apn",
        right_on="G APN",
    )
    del geocoding_df
    del m3
    ps('m4 merged geocoding', m4)

    final = m4
    print 'final columns'
    for c in final.columns:
        print c,
    print

    cc('fraction', final)  # verify that fraction_owner_occupied is in the output
    print 'final shape', final.shape

    # write the merged, augmented dataframe
    print 'writing final dataframe to csv file'
    final.to_csv(control.path_out_transactions)

    # write out all the column names
    print 'all column names in final dataframe'
    for name in final.columns:
        print name
        if name.endswith('_y'):  # a merge suffix that should have been renamed away
            print 'found strange suffix'
            pdb.set_trace()

    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'

    return
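best_apn is a project-local helper used above but not shown. A plausible sketch, assuming it prefers the formatted APN where one is present and falls back to the unformatted APN otherwise (the project's actual selection logic may differ):

import numpy as np
import pandas as pd


def best_apn(df, feature_formatted, feature_unformatted):
    # hypothetical sketch: take the formatted APN when it is present,
    # otherwise fall back to the unformatted APN
    formatted = df[feature_formatted]
    unformatted = df[feature_unformatted]
    return pd.Series(np.where(formatted.notnull(), formatted, unformatted),
                     index=df.index)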