def clean_data(taxi_2013):
    """Clean taxi data and merge on census summary statistics.

    Loads the March 2013 yellow-cab trips and a (lat, lon) -> census-tract
    lookup table, joins per-tract demographic estimates onto each trip by
    rounded pickup coordinates, aggregates trip-count-weighted demographics
    per drop-off location, and pickles the result to data/taxi_dems.pkl.

    Parameters
    ----------
    taxi_2013 : unused
        NOTE(review): this argument is immediately shadowed by the pickle
        load below -- confirm whether callers expect it to be honored.

    Side effects
    ------------
    Reads data/taxi/yellow_3_2013.pkl and data/census_lookup_table.pkl;
    writes data/taxi_dems.pkl.
    """
    # Pickle files are binary: text-mode open() corrupts/fails the load
    # (hard error on Python 3, broken on Windows under Python 2).
    with open('data/taxi/yellow_3_2013.pkl', 'rb') as f:
        taxi_2013 = pickle.load(f)
    with open('data/census_lookup_table.pkl', 'rb') as f:
        census_lookup = pickle.load(f)

    # Attach census GEOIDs by rounded pickup lat/long.
    taxi_geo = taxi_2013.merge(census_lookup,
                               left_on=['pick_lat', 'pick_long'],
                               right_on=['lat3', 'lon3'], how='left')
    # First 12 characters of the GEOID form the Id2 key used by the
    # census demographic tables.
    taxi_geo['Id2'] = taxi_geo.GEOID.apply(lambda x: str(x)[0:12])

    # B01002 = median age, B02001 = race, B19301 = per-capita income.
    census_dem = cg.make_census_data(['B01002', 'B02001', 'B19301'])
    census_dem = cg.clean_census(census_dem)
    taxi_dem = taxi_geo.merge(census_dem, on=['Id2'], how='left')

    # Hoist the long census column names; literals kept byte-identical.
    age_col = 'Estimate; Median age -- - Total:'
    inc_col = ('Estimate; Per capita income in the past 12 months '
               '(in 2013 inflation-adjusted dollars)')

    taxi_dem = taxi_dem[['pick_lat', 'pick_long', 'drop_lat', 'drop_long',
                         'cnt_2013', age_col, inc_col,
                         'perc_white', 'perc_black', 'perc_asian',
                         'perc_other']]
    taxi_dem = taxi_dem.dropna()

    # Convert per-pickup-tract rates into trip-count-weighted totals so
    # they can be summed per drop-off location below.
    taxi_dem['tot_white'] = taxi_dem['perc_white'] * taxi_dem['cnt_2013']
    taxi_dem['tot_black'] = taxi_dem['perc_black'] * taxi_dem['cnt_2013']
    taxi_dem['tot_asian'] = taxi_dem['perc_asian'] * taxi_dem['cnt_2013']
    taxi_dem['tot_other'] = taxi_dem['perc_other'] * taxi_dem['cnt_2013']
    taxi_dem['tot_age'] = taxi_dem[age_col] * taxi_dem['cnt_2013']
    taxi_dem['tot_inc'] = taxi_dem[inc_col] * taxi_dem['cnt_2013']

    # Aggregate by drop-off location, then normalize the weighted totals
    # back to weighted means.
    taxi_dem2 = taxi_dem.groupby(['drop_lat', 'drop_long']).sum()[
        ['cnt_2013', 'tot_white', 'tot_black', 'tot_asian', 'tot_other',
         'tot_age', 'tot_inc']]
    taxi_dem2['p_white'] = taxi_dem2['tot_white'] / taxi_dem2['cnt_2013']
    taxi_dem2['p_black'] = taxi_dem2['tot_black'] / taxi_dem2['cnt_2013']
    taxi_dem2['p_asian'] = taxi_dem2['tot_asian'] / taxi_dem2['cnt_2013']
    taxi_dem2['p_other'] = taxi_dem2['tot_other'] / taxi_dem2['cnt_2013']
    taxi_dem2['avg_age'] = taxi_dem2['tot_age'] / taxi_dem2['cnt_2013']
    taxi_dem2['avg_inc'] = taxi_dem2['tot_inc'] / taxi_dem2['cnt_2013']
    taxi_dem2 = taxi_dem2[['p_white', 'p_black', 'p_asian', 'p_other',
                           'avg_age', 'avg_inc']]

    # Binary mode for pickle output as well.
    with open('data/taxi_dems.pkl', 'wb') as f:
        pickle.dump(taxi_dem2, f)
def main():
    """Build the metadata and reviews dataframes for the analysis.

    Takes raw data and returns a metadata dataframe and a reviews
    dataframe for restaurants open at least 200 days.

    Returns
    -------
    (nyc_meta_status, all_reviews)
        Restaurant-level metadata joined with census and taxi
        demographics, and the per-review dataframe with Bayes-adjusted
        cumulative scores.
    """
    create_master('data/cleaned', 'data/cleaned_yelp_combined.txt')
    all_reviews = import_clean('data/cleaned_yelp_combined.txt')

    # Add Bayes-adjusted cumulative review scores.
    all_reviews = reviews_cum(all_reviews)
    all_reviews = make_bayes(all_reviews)

    rest_sum = group_sum(all_reviews)
    day_status = days_from(rest_sum, [30, 60, 90, 200])
    # Drop the raw date columns once day_status has been derived.
    del rest_sum['first_review_dt']
    del rest_sum['last_review_dt']

    nyc_meta = YelpDatabase(database_name="yelp_nyc3",
                            cat_filt="restaurants").get_full_df()
    nyc_meta_status = rest_detailed_sum(nyc_meta, day_status).reset_index()

    # Limit dataset to days with reviews and that have been opened > 200 days.
    nyc_meta_status = nyc_meta_status[
        nyc_meta_status.first_review_dt.notnull()]
    nyc_meta_status = limit_open_min(nyc_meta_status, 200).reset_index()

    # Geocode and add census demographics (same ACS tables as clean_data:
    # B01002 median age, B02001 race, B19301 per-capita income).
    census_data = cg.make_census_data(['B01002', 'B02001', 'B19301'])
    census_data = cg.clean_census(census_data)
    nyc_meta_status = add_census(nyc_meta_status,
                                 'data/census_lookup_table.pkl',
                                 census_data)

    # Add taxi drop-off demographics, keyed by rounded lat/long.
    # Pickle files are binary: text-mode open() breaks the load on
    # Python 3 (and on Windows under Python 2).
    with open("data/taxi_dems.pkl", 'rb') as f:
        taxi_dems = pickle.load(f)
    nyc_meta_status = nyc_meta_status.merge(taxi_dems, how='left',
                                            left_on=['lat3', 'lon3'],
                                            right_index=True)
    return nyc_meta_status, all_reviews