Example #1
0
def parcels_derived_features(control, transactions_df):
    'return new df by merging df and the geo features'

    # merge in  census tract features
    census_tract_df = pd.read_csv(
        control.path_in_parcels_features_census_tract, index_col=0)
    check_feature_names(transactions_df)
    check_feature_names(census_tract_df)
    m1 = transactions_df.merge(
        census_tract_df,
        how='inner',
        left_on=transactions_df[transactions.census_tract],
        right_on=census_tract_df.census_tract,
    )
    check_feature_names(m1)
    print 'm1 shape', m1.shape
    cc('commercial', m1)

    # merge in zip5 features
    zip5_df = pd.read_csv(control.path_in_parcels_features_zip5, index_col=0)
    check_feature_names(m1)
    check_feature_names(zip5_df)
    m2 = m1.merge(
        zip5_df,
        how='inner',
        left_on=m1[transactions.zip5],
        right_on=zip5_df.zip5,
    )

    # remove duplicated field zip5_x, zip5_y
    assert 'zip5_x' in m2.columns
    assert 'zip5_y' in m2.columns
    assert (m2.zip5_x == m2.zip5_y).all()
    assert 'zip5' not in m2.columns
    pu.df_remove_column(m2, 'zip5_x')
    pu.df_rename_column(m2, 'zip5_y', 'zip5')
    check_feature_names(m2)

    print 'm2 shape', m2.shape

    return m2
Example #2
0
def parcels_derived_features(control, transactions_df):
    'return new df by merging df and the geo features'

    # merge in  census tract features
    census_tract_df = pd.read_csv(control.path_in_parcels_features_census_tract, index_col=0)
    check_feature_names(transactions_df)
    check_feature_names(census_tract_df)
    m1 = transactions_df.merge(
        census_tract_df,
        how='inner',
        left_on=transactions_df[transactions.census_tract],
        right_on=census_tract_df.census_tract,
    )
    check_feature_names(m1)
    print 'm1 shape', m1.shape
    cc('commercial', m1)

    # merge in zip5 features
    zip5_df = pd.read_csv(control.path_in_parcels_features_zip5, index_col=0)
    check_feature_names(m1)
    check_feature_names(zip5_df)
    m2 = m1.merge(
        zip5_df,
        how='inner',
        left_on=m1[transactions.zip5],
        right_on=zip5_df.zip5,
    )

    # remove duplicated field zip5_x, zip5_y
    assert 'zip5_x' in m2.columns
    assert 'zip5_y' in m2.columns
    assert (m2.zip5_x == m2.zip5_y).all()
    assert 'zip5' not in m2.columns
    pu.df_remove_column(m2, 'zip5_x')
    pu.df_rename_column(m2, 'zip5_y', 'zip5')
    check_feature_names(m2)

    print 'm2 shape', m2.shape

    return m2
Example #3
0
def main(argv):
    control = make_control(argv)
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control

    # NOTE: Organize the computation to minimize memory usage
    # so that this code can run on smaller-memory systems

    def ps(name, value):
        s = value.shape
        print '  %20s shape (%d, %d)' % (name, s[0], s[1])

    # create dataframes
    n_read_if_test = 10000
    deeds_g_al = deeds.read_g_al(
        control.path,
        n_read_if_test if control.test else None,
    )
    parcels_sfr = parcels.read(
        control.path,
        10000 if control.test else None,
        just_sfr=True,
    )
    ps('original deeds g al', deeds_g_al)
    ps('original parcels sfr', parcels_sfr)

    # augment parcels to include a zip5 field (5-digit zip code)
    # drop samples without a zipcode
    # rationale: we use the zip5 to join the features derived from parcels
    # and zip5 is derived from zipcode
    zipcode_present = parcels_sfr[parcels.zipcode].notnull()
    parcels_sfr = parcels_sfr[zipcode_present]
    parcels.add_zip5(parcels_sfr)

    # augment parcels and deeds to include a better APN
    print 'adding best apn column for parcels'
    new_column_parcels = best_apn(parcels_sfr, parcels.apn_formatted,
                                  parcels.apn_unformatted)
    parcels_sfr.loc[:, parcels.
                    best_apn] = new_column_parcels  # generates an ignorable warning

    print 'adding best apn column for deeds'
    new_column_deeds = best_apn(deeds_g_al, deeds.apn_formatted,
                                deeds.apn_unformatted)
    deeds_g_al.loc[:, deeds.best_apn] = new_column_deeds

    ps('revised deeds_g_al', deeds_g_al)
    ps('revised parcels_sfr', parcels_sfr)

    # join the deeds and parcels files
    print 'starting to merge'
    check_feature_names(deeds_g_al)
    check_feature_names(parcels_sfr)
    m1 = deeds_g_al.merge(parcels_sfr,
                          how='inner',
                          left_on=deeds.best_apn,
                          right_on=parcels.best_apn,
                          suffixes=('_deed', '_parcel'))
    check_feature_names(m1)
    del deeds_g_al
    del parcels_sfr
    ps('m1 merge deed + parcels', m1)

    # add in derived parcels features
    m2 = parcels_derived_features(control, m1)
    check_feature_names(m2)
    ps('ms added parcels_derived', m2)
    del m1

    # add in census data
    census_features_df = read_census_features(control)
    m3 = m2.merge(
        census_features_df,
        left_on=transactions.census_tract,
        right_on="census_tract",
    )
    assert 'census_tract_x' in m3.columns
    assert 'census_tract_y' in m3.columns
    assert (m3.census_tract_x == m3.census_tract_y).all()
    assert 'census_tract' not in m3.columns
    pu.df_remove_column(m3, 'census_tract_x')
    pu.df_rename_column(m3, 'census_tract_y', 'census_tract')
    check_feature_names(m3)
    del m2
    ps('m3 merged census features', m3)

    # add in GPS coordinates
    geocoding_df = read_geocoding(control)
    m4 = m3.merge(
        geocoding_df,
        left_on="best_apn",
        right_on="G APN",
    )
    del geocoding_df
    del m3
    ps('m4 merged geocoding', m4)

    final = m4
    print 'final columns'
    for c in final.columns:
        print c,
    print

    cc('fraction',
       final)  # verify that fraction_owner_occupied is in the output
    print 'final shape', final.shape

    # write merged,augmented dataframe
    print 'writing final dataframe to csv file'
    final.to_csv(control.path_out_transactions)

    # write out all the column names
    print 'all column names in final dataframe'
    for name in final.columns:
        print name
        if '_y' in name:
            print 'found strange suffix'
            pdb.set_trace()

    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'

    return
Example #4
0
def main(argv):
    control = make_control(argv)
    sys.stdout = Logger(base_name=control.arg.base_name)
    print control

    # NOTE: Organize the computation to minimize memory usage
    # so that this code can run on smaller-memory systems

    def ps(name, value):
        s = value.shape
        print '  %20s shape (%d, %d)' % (name, s[0], s[1])

    # create dataframes
    n_read_if_test = 10000
    deeds_g_al = deeds.read_g_al(
        control.path,
        n_read_if_test if control.test else None,
    )
    parcels_sfr = parcels.read(
        control.path,
        10000 if control.test else None,
        just_sfr=True,
    )
    ps('original deeds g al', deeds_g_al)
    ps('original parcels sfr', parcels_sfr)

    # augment parcels to include a zip5 field (5-digit zip code)
    # drop samples without a zipcode
    # rationale: we use the zip5 to join the features derived from parcels
    # and zip5 is derived from zipcode
    zipcode_present = parcels_sfr[parcels.zipcode].notnull()
    parcels_sfr = parcels_sfr[zipcode_present]
    parcels.add_zip5(parcels_sfr)

    # augment parcels and deeds to include a better APN
    print 'adding best apn column for parcels'
    new_column_parcels = best_apn(parcels_sfr, parcels.apn_formatted, parcels.apn_unformatted)
    parcels_sfr.loc[:, parcels.best_apn] = new_column_parcels  # generates an ignorable warning

    print 'adding best apn column for deeds'
    new_column_deeds = best_apn(deeds_g_al, deeds.apn_formatted, deeds.apn_unformatted)
    deeds_g_al.loc[:, deeds.best_apn] = new_column_deeds

    ps('revised deeds_g_al', deeds_g_al)
    ps('revised parcels_sfr', parcels_sfr)

    # join the deeds and parcels files
    print 'starting to merge'
    check_feature_names(deeds_g_al)
    check_feature_names(parcels_sfr)
    m1 = deeds_g_al.merge(parcels_sfr, how='inner',
                          left_on=deeds.best_apn, right_on=parcels.best_apn,
                          suffixes=('_deed', '_parcel'))
    check_feature_names(m1)
    del deeds_g_al
    del parcels_sfr
    ps('m1 merge deed + parcels', m1)

    # add in derived parcels features
    m2 = parcels_derived_features(control, m1)
    check_feature_names(m2)
    ps('ms added parcels_derived', m2)
    del m1

    # add in census data
    census_features_df = read_census_features(control)
    m3 = m2.merge(census_features_df,
                  left_on=transactions.census_tract,
                  right_on="census_tract",
                  )
    assert 'census_tract_x' in m3.columns
    assert 'census_tract_y' in m3.columns
    assert (m3.census_tract_x == m3.census_tract_y).all()
    assert 'census_tract' not in m3.columns
    pu.df_remove_column(m3, 'census_tract_x')
    pu.df_rename_column(m3, 'census_tract_y', 'census_tract')
    check_feature_names(m3)
    del m2
    ps('m3 merged census features', m3)

    # add in GPS coordinates
    geocoding_df = read_geocoding(control)
    m4 = m3.merge(geocoding_df,
                  left_on="best_apn",
                  right_on="G APN",
                  )
    del geocoding_df
    del m3
    ps('m4 merged geocoding', m4)

    final = m4
    print 'final columns'
    for c in final.columns:
        print c,
    print

    cc('fraction', final)  # verify that fraction_owner_occupied is in the output
    print 'final shape', final.shape

    # write merged,augmented dataframe
    print 'writing final dataframe to csv file'
    final.to_csv(control.path_out_transactions)

    # write out all the column names
    print 'all column names in final dataframe'
    for name in final.columns:
        print name
        if '_y' in name:
            print 'found strange suffix'
            pdb.set_trace()

    print control
    if control.test:
        print 'DISCARD OUTPUT: test'
    print 'done'

    return