Exemple #1
0
def clean_data(taxi_2013):
    """
    Clean data and merge on census summary statistics.

    Loads the taxi frame from ``data/taxi/yellow_3_2013.pkl`` and the census
    lookup table from ``data/census_lookup_table.pkl``, attaches census
    demographics to every row through its pick-up coordinates, and then, for
    each drop-off coordinate pair, computes the ``cnt_2013``-weighted average
    of those demographics. The result is pickled to ``data/taxi_dems.pkl``.

    Parameters
    ----------
    taxi_2013 : object
        NOTE(review): this argument is overwritten by the frame loaded from
        disk before it is ever read, so its value has no effect. It is kept
        only so that existing callers keep working.

    Returns
    -------
    None
        The output frame (indexed by ``drop_lat``/``drop_long``, columns
        ``p_white``, ``p_black``, ``p_asian``, ``p_other``, ``avg_age``,
        ``avg_inc``) is written to disk rather than returned.
    """
    age_col = 'Estimate; Median age -- - Total:'
    inc_col = ('Estimate; Per capita income in the past 12 months '
               '(in 2013 inflation-adjusted dollars)')

    # (weighted-total column, source column, final averaged column)
    measures = [
        ('tot_white', 'perc_white', 'p_white'),
        ('tot_black', 'perc_black', 'p_black'),
        ('tot_asian', 'perc_asian', 'p_asian'),
        ('tot_other', 'perc_other', 'p_other'),
        ('tot_age', age_col, 'avg_age'),
        ('tot_inc', inc_col, 'avg_inc'),
    ]

    # Pickle streams are bytes: the files must be opened in binary mode.
    with open('data/taxi/yellow_3_2013.pkl', 'rb') as f:
        taxi_2013 = pickle.load(f)

    with open('data/census_lookup_table.pkl', 'rb') as f:
        census_lookup = pickle.load(f)

    # Attach the census GEOID of each pick-up location.
    taxi_geo = taxi_2013.merge(census_lookup,
                               left_on=['pick_lat', 'pick_long'],
                               right_on=['lat3', 'lon3'], how='left')
    # The first 12 characters of the GEOID are the key shared with the
    # census tables ('Id2').
    taxi_geo['Id2'] = taxi_geo.GEOID.apply(lambda x: str(x)[0:12])

    census_dem = cg.make_census_data(['B01002', 'B02001', 'B19301'])
    census_dem = cg.clean_census(census_dem)

    taxi_dem = taxi_geo.merge(census_dem, on=['Id2'], how='left')
    keep_cols = ['pick_lat', 'pick_long', 'drop_lat', 'drop_long',
                 'cnt_2013', age_col, inc_col,
                 'perc_white', 'perc_black', 'perc_asian', 'perc_other']
    # Explicit copy so the column assignments below operate on an
    # independent frame rather than on a selection of the merged one.
    taxi_dem = taxi_dem[keep_cols].dropna().copy()

    # Weight every demographic by the row's count so that sums per drop-off
    # location can be turned back into weighted averages.
    for total_col, source_col, _ in measures:
        taxi_dem[total_col] = taxi_dem[source_col] * taxi_dem['cnt_2013']

    sum_cols = ['cnt_2013'] + [total_col for total_col, _, _ in measures]
    taxi_dem2 = taxi_dem.groupby(['drop_lat', 'drop_long']).sum()[sum_cols]

    # weighted total / total count = weighted average per drop-off location
    for total_col, _, final_col in measures:
        taxi_dem2[final_col] = taxi_dem2[total_col] / taxi_dem2['cnt_2013']
    taxi_dem2 = taxi_dem2[[final_col for _, _, final_col in measures]]

    with open('data/taxi_dems.pkl', 'wb') as f:
        pickle.dump(taxi_dem2, f)
Exemple #2
0
def main():
    """
    Takes raw data and returns metadata dataframe and reviews dataframe for
    restaurants open at least 200 days.

    Steps, in order: build and import the combined review file, add the
    cumulative/Bayes review columns, summarise reviews per restaurant,
    join that summary onto the Yelp metadata, keep only restaurants that
    have a first review date and pass ``limit_open_min(..., 200)``, then
    merge on census data and the taxi demographics stored in
    ``data/taxi_dems.pkl``.

    Returns
    -------
    tuple
        ``(nyc_meta_status, all_reviews)`` -- the filtered, enriched
        metadata frame and the reviews frame as returned by ``make_bayes``.
    """
    create_master('data/cleaned', 'data/cleaned_yelp_combined.txt')
    all_reviews = import_clean('data/cleaned_yelp_combined.txt')

    # Add Bayes
    all_reviews = reviews_cum(all_reviews)
    all_reviews = make_bayes(all_reviews)

    rest_sum = group_sum(all_reviews)
    # day_status is computed before the review-date columns are removed
    # from rest_sum below.
    day_status = days_from(rest_sum, [30, 60, 90, 200])

    del rest_sum['first_review_dt']
    del rest_sum['last_review_dt']

    nyc_meta = YelpDatabase(database_name="yelp_nyc3",
                            cat_filt="restaurants").get_full_df()

    nyc_meta_status = rest_detailed_sum(nyc_meta, day_status).reset_index()

    # Limit dataset to days with reviews and that have been opened > 200 days
    nyc_meta_status = nyc_meta_status[nyc_meta_status.first_review_dt.notnull()]
    nyc_meta_status = limit_open_min(nyc_meta_status, 200).reset_index()

    # Geocode and add census
    census_data = cg.make_census_data(['B01002', 'B02001', 'B19301'])
    census_data = cg.clean_census(census_data)

    nyc_meta_status = add_census(nyc_meta_status,
                                 'data/census_lookup_table.pkl', census_data)

    # Add Taxi
    # Pickle streams are bytes: the file must be opened in binary mode.
    with open("data/taxi_dems.pkl", 'rb') as f:
        taxi_dems = pickle.load(f)

    # taxi_dems is joined through its index, matched against this frame's
    # 'lat3'/'lon3' columns.
    nyc_meta_status = nyc_meta_status.merge(taxi_dems, how='left',
                                            left_on=['lat3', 'lon3'],
                                            right_index=True)

    return nyc_meta_status, all_reviews