    SUBSTR(b.naics_code, 1, 2) as sector,
    SUM(a.visits) as total_visits
FROM daily_visits a
LEFT JOIN (
    SELECT DISTINCT safegraph_place_id, naics_code, top_category, sub_category
    FROM "safegraph"."core_poi"
    WHERE region = 'NY' AND dt = CAST('{2}' AS DATE)
) b
ON a.safegraph_place_id = b.safegraph_place_id
GROUP BY a.date_current, SUBSTR(a.poi_cbg, 1, 5), a.postal_code, SUBSTR(b.naics_code, 1, 2)
ORDER BY date, borocode, zipcode
"""

# Load the current quarter
quarters = get_quarter()
# quarters = PastQs
for year_qrtr, date_range in quarters.items():  # renamed from `range` to avoid shadowing the built-in
    start, end = date_range
    print(year_qrtr, start, end)
    aws.execute_query(
        query=query.format(start, end, poi_latest_date),
        database="safegraph",
        output=f"output/dev/poi/daily_zip_poivisits_by_sector/daily_zip_poivisits_by_sector_{year_qrtr}.csv.zip"
    )
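# `get_quarter()` is a repo helper whose source is not shown in these excerpts.
# A minimal sketch of the shape the loops here assume, a mapping like
# {"2023_Q1": ("2023-01-01", "2023-03-31")}. This is an assumption for
# illustration only; the real helper may differ.
from datetime import date

def get_quarter(today=None):
    today = today or date.today()
    qtr = (today.month - 1) // 3 + 1                             # 1..4
    start = date(today.year, 3 * qtr - 2, 1)                     # first day of the quarter
    end = date(today.year, 3 * qtr, [31, 30, 30, 31][qtr - 1])   # last day of the quarter
    return {f"{today.year}_Q{qtr}": (start.isoformat(), end.isoformat())}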
GROUP BY year_week, origin, destination
ORDER BY year_week, origin, destination
"""

# Load the current quarter
quarters = get_quarter()
# quarters = PastQs
for year_qrtr, date_range in quarters.items():
    start, end = date_range
    print(year_qrtr, start, end)
    aws.execute_query(
        query=query.format(start, end),
        database="safegraph",
        output=f"output/production/social_distancing/weekly_state_trips/weekly_state_trips_{year_qrtr}.csv.zip"
    )

# Add/update device count table for states and NYC
query = """
SELECT
    SUBSTR(date_range_start, 1, 10) as date,
    CAST(EXTRACT(year_of_week from CAST(SUBSTR(date_range_start, 1, 10) AS DATE)) AS VARCHAR) || '-' ||
        LPAD(CAST(EXTRACT(week from CAST(SUBSTR(date_range_start, 1, 10) AS DATE)) AS VARCHAR), 2, '0') as year_week,
    SUBSTR(origin_census_block_group, 1, 2) as state,
    SUM(CAST(device_count AS INTEGER)) as device_count,
    SUM(CAST(completely_home_device_count AS INTEGER)) as completely_home_device_count
FROM social_distancing
-- exclude the five NYC borough county FIPS codes
WHERE SUBSTR(origin_census_block_group, 1, 5) NOT IN ('36085', '36081', '36061', '36047', '36005')
GROUP BY date_range_start, SUBSTR(origin_census_block_group, 1, 2)
query1 = """ SELECT SUBSTR(date_range_start, 1, 10) as date, CAST(EXTRACT(year from CAST(SUBSTR(date_range_start, 1, 10) AS DATE)) AS VARCHAR)||'-'|| LPAD(CAST(EXTRACT(week from CAST(SUBSTR(date_range_start, 1, 10) AS DATE)) AS VARCHAR),2,'0') as year_week, SUM(number_devices_residing) as number_devices_residing FROM home_panel_summary WHERE SUBSTR(census_block_group, 1, 5) IN ('36085','36081','36061','36047','36005') AND dt >= CAST('2019-01-01' as DATE) GROUP BY SUBSTR(date_range_start, 1, 10) ORDER BY SUBSTR(date_range_start, 1, 10) """ tablename1 = 'weekly_nyc_summary' aws.execute_query( query=query1, database="safegraph", output=f"output/home_panel_summary/{tablename1}/{tablename1}.csv.zip" ) query2 = """ SELECT SUBSTR(date_range_start, 1, 10) as date, CAST(EXTRACT(year from CAST(SUBSTR(date_range_start, 1, 10) AS DATE)) AS VARCHAR)||'-'|| LPAD(CAST(EXTRACT(week from CAST(SUBSTR(date_range_start, 1, 10) AS DATE)) AS VARCHAR),2,'0') as year_week, SUBSTR(census_block_group, 1, 5) as fips_county, (CASE WHEN SUBSTR(census_block_group, 1, 5) = '36005' THEN 'BX' WHEN SUBSTR(census_block_group, 1, 5) = '36047' THEN 'BK' WHEN SUBSTR(census_block_group, 1, 5) = '36061' THEN 'MN' WHEN SUBSTR(census_block_group, 1, 5) = '36081' THEN 'QN' WHEN SUBSTR(census_block_group, 1, 5) = '36085' THEN 'SI' END) as borough,
# roll back one month, handling the January -> December year boundary
# (a bare `cur_month - 1` would produce month 0 in January)
if cur_month == 1:
    pull_year, pull_month = cur_year - 1, 12
else:
    pull_year, pull_month = cur_year, cur_month - 1
# pull number of days in a given month
_, numdays = monthrange(pull_year, pull_month)
# create a string yyyy-mm-01 for the beginning of the month
start = '-'.join([str(pull_year), str(pull_month).zfill(2), '01'])
# create a string yyyy-mm-dd for the end of the month
end = '-'.join([str(pull_year), str(pull_month).zfill(2), str(numdays)])
# create a string yyyy_mm
data_month = '_'.join([str(pull_year), str(pull_month).zfill(2)])
print(f"pulling data for {start} through {end}")
aws.execute_query(
    query=query.format(start, end),
    database="safegraph",
    output=f"output/dev/wp_NYS/weekly_patterns_NYS_{data_month}.csv.zip")

### HISTORICAL WEEKLY PATTERNS FOR NYS
# next_month = '-'.join([str(cur_year), str(cur_month + 1).zfill(2), '01'])
# print(next_month)
# # date range
# years = list(range(2018, 2023))
# months = list(range(1, 13))
# for y in years:
#     for m in months:
#         # pull number of days in a given month
#         _, numdays = monthrange(y, m)
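# An equivalent previous-month window using only the standard library, shown as
# a sketch for comparison (not the pipeline's actual code; `date.today()` stands
# in for the repo's cur_year/cur_month globals):
from datetime import date, timedelta

month_end = date.today().replace(day=1) - timedelta(days=1)  # last day of previous month
month_start = month_end.replace(day=1)                       # first day of previous month
print(f"pulling data for {month_start} through {month_end}")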
import os
import boto3
import pandas as pd
from pathlib import Path

# get dataframe from Athena
query = '''
SELECT *
FROM neighborhood_patterns_202107
WHERE date_range_start = (SELECT MAX(date_range_start) FROM neighborhood_patterns_202107 as np2);
'''
output_csv_path = "output/dev/ops/neighborhood_patterns_US_latest.csv.zip"
aws.execute_query(
    query=query,
    database="safegraph",
    output=output_csv_path
)

# run the query and pull down the resulting CSV
s3 = boto3.resource('s3')
cwd = os.getcwd()
s3.Bucket('recovery-data-partnership').download_file(
    'output/dev/ops/neighborhood_patterns_US_latest.csv.zip',
    str(Path(cwd) / "neighborhood_patterns_US_latest.csv.zip"))
# s3.Bucket('recovery-data-partnership').download_file('output/dev/ops/poi_info/poi_info.csv.zip', str(Path(cwd) / "foo.zip"))

###### taken from query folder #####
df = pd.read_csv(Path(cwd) / "neighborhood_patterns_US_latest.csv.zip")
os.remove(Path(cwd) / "neighborhood_patterns_US_latest.csv.zip")
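# `aws.execute_query` is this repo's Athena wrapper; its source is not shown in
# these excerpts. A minimal sketch of what such a helper typically does with
# boto3. The bucket name, output handling, and polling interval are assumptions,
# not the actual implementation (which also zips results to the exact path):
import time

def execute_query(query, database, output, bucket="recovery-data-partnership"):
    """Run an Athena query and leave results under s3://{bucket}/{output}."""
    athena = boto3.client("athena")
    qid = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={"Database": database},
        ResultConfiguration={"OutputLocation": f"s3://{bucket}/{output}"},
    )["QueryExecutionId"]
    while True:  # poll until the query reaches a terminal state
        state = athena.get_query_execution(QueryExecutionId=qid)["QueryExecution"]["Status"]["State"]
        if state in ("SUCCEEDED", "FAILED", "CANCELLED"):
            return state
        time.sleep(2)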
def my_main(split_chunk):
    s3 = boto3.resource('s3')
    cwd = os.getcwd()
    for my_date in split_chunk:
        latest_date = my_date
        warnings.warn('for date: "{}"'.format(latest_date))
        query = '''
        SELECT *
        FROM weekly_patterns_202107
        WHERE substr(date_range_start, 1, 10) = '{}'
            AND substr(poi_cbg, 1, 5) in ('36005', '36047', '36061', '36081', '36085');
        '''.format(latest_date)
        output_csv_path_2 = 'output/dev/parks/nyc-weekly-patterns/nyc_weekly_patterns_{}.csv.zip'.format(latest_date)
        if is_prod:
            aws.execute_query(
                query=query,
                database="safegraph",
                output=output_csv_path_2
            )
        s3.Bucket('recovery-data-partnership').download_file(
            output_csv_path_2,
            str(Path(cwd) / f'nyc_weekly_patterns_temp_{latest_date}.csv.zip'))

        ##### get multiplier #####
        # we want to include the entire census for multipliers (out-of-state visitors),
        # so there is deliberately no filter like:
        #   AND substr(hps.census_block_group, 1, 5) IN ('36005', '36047', '36061', '36081', '36085')
        query = '''
        SELECT substr(hps.date_range_start, 1, 10) as date_range_start,
            hps.census_block_group as cbg,
            hps.number_devices_residing as devices_residing,
            census.b01001e1 as cbg_pop,
            census.b01001e1 / (CASE WHEN (hps.number_devices_residing = 0) THEN 1
                               ELSE hps.number_devices_residing END) as pop_multiplier
        FROM hps_crawled22 AS hps
        INNER JOIN census ON hps.census_block_group = census.census_block_group
        WHERE substr(hps.date_range_start, 1, 10) = '{}'
        UNION
        SELECT substr(hps2.date_range_start, 1, 10) as date_range_start,
            hps2.census_block_group as cbg,
            hps2.number_devices_residing as devices_residing,
            census.b01001e1 as cbg_pop,
            census.b01001e1 / (CASE WHEN (hps2.number_devices_residing = 0) THEN 1
                               ELSE hps2.number_devices_residing END) as pop_multiplier
        FROM hps_crawledhome_panel_summary_202107 AS hps2
        INNER JOIN census ON hps2.census_block_group = census.census_block_group
        WHERE substr(hps2.date_range_start, 1, 10) = '{}';
        '''.format(latest_date, latest_date)
        output_csv_path = f'output/dev/parks/safegraph-with-population/multipliers/pop_to_device_multiplier_{latest_date}.csv.zip'
        if is_prod:
            aws.execute_query(
                query=query,
                database="safegraph",
                output=output_csv_path
            )
        s3.Bucket('recovery-data-partnership').download_file(
            output_csv_path,
            str(Path(cwd) / f'multiplier_temp_{latest_date}.csv.zip'))
        df_mult = pd.read_csv(Path(cwd) / f'multiplier_temp_{latest_date}.csv.zip', dtype={'cbg': object})
        is_in_nyc = [cbg[:5] in ('36005', '36047', '36061', '36081', '36085') for cbg in df_mult['cbg']]
        df_mult_nyc = df_mult[is_in_nyc].reset_index(drop=True)  # a bare reset_index() discards its result
        sum_pop = df_mult_nyc['cbg_pop'].sum()
        sum_dc = df_mult_nyc['devices_residing'].sum()
        # citywide fallback ratio: people per device across all NYC home CBGs
        macro_multiplier = sum_pop / (sum_dc * 1.0)

        ##### join census to cbg to weekly patterns and multiply #####
        df = pd.read_csv(Path(cwd) / f'nyc_weekly_patterns_temp_{latest_date}.csv.zip')
        if (len(df) == 0) or (len(df_mult) == 0):
            warnings.warn(f"{latest_date} Either the home panel summary or the weekly patterns were not found")
            continue
        multiplier_list = []
        sys.stdout.flush()
        sys.stderr.flush()
        if is_prod:
            # for each row in the dataframe
            for index, row in df.iterrows():
                iter = 0
                pop_count = 0.0
                # no_match_count isn't being used in prod.
                no_match_count = 0
                no_match_count_rows = 0
                sum_cbg_pop = 0
                sum_cbg_devices = 0
                this_json = json.loads(row['visitor_home_cbgs'])
                # for each home CBG in the dictionary, look up its population and device counts
                for key, value in this_json.items():
                    iter = iter + 1
                    # filter df_mult down to this CBG
                    selected_rows = df_mult.iloc[:, df_mult.columns.get_loc('cbg')] == key
                    selected_rows_mult_df = df_mult[selected_rows]
                    try:
                        # take the first row; there should only be one match
                        if len(selected_rows_mult_df[selected_rows_mult_df['cbg'] == key]) > 1:
                            warnings.warn("more than one match for key {}".format(key))
                        cbg_pop = selected_rows_mult_df.iloc[0, selected_rows_mult_df.columns.get_loc('cbg_pop')]
                        # weight by the number of visitors to get a weighted average
                        cbg_pop = cbg_pop * value
                        devices_residing = selected_rows_mult_df.iloc[0, selected_rows_mult_df.columns.get_loc('devices_residing')]
                        devices_residing = devices_residing * value
                    except IndexError:
                        # no multiplier row for this CBG; contribute nothing here and
                        # let the synthetic multiplier below fill the gap
                        no_match_count = no_match_count + value
                        no_match_count_rows = no_match_count_rows + 1
                        cbg_pop = 0
                        devices_residing = 0
                    sum_cbg_pop = sum_cbg_pop + cbg_pop
                    sum_cbg_devices = sum_cbg_devices + devices_residing
                sys.stdout.flush()
                sys.stderr.flush()
                # to fill in the missing values (i.e. Canada), take the average population
                # of the matched cbgs; guard against dividing by zero when nothing matched
                synthetic_mult = sum_cbg_pop / (sum_cbg_devices * 1.0 if sum_cbg_devices > 0 else 1.0)
                multiplier_list.append(synthetic_mult)
            non_zero_multipliers = [x for x in multiplier_list if x > 0]
            # take the average of all the multipliers (reported for diagnostics)
            avg_multiplier = np.mean(non_zero_multipliers)
            # if a multiplier is zero (small visit counts without home block groups),
            # fall back to the citywide macro multiplier
            multiplier_list = [x if x > 0 else macro_multiplier for x in multiplier_list]
            print(f"avg_multiplier: {avg_multiplier}, macro_multiplier: {macro_multiplier}")
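            # Worked example of the weighting above (illustrative numbers, not data):
            # two home CBGs with (pop=1500, devices=50) and (pop=900, devices=30)
            # send 4 and 2 visitors respectively, so
            #   sum_cbg_pop     = 1500*4 + 900*2 = 7800
            #   sum_cbg_devices =   50*4 +  30*2 =  260
            #   synthetic_mult  = 7800 / 260     =  30.0 people per device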
            # convert device counts to population counts based on the multiplier series
            df['pop_multiplier'] = multiplier_list
            df['visits_pop_calc'] = multiplier_list * df['raw_visit_counts']
            df['visitors_pop_calc'] = multiplier_list * df['raw_visitor_counts']
            # visits_by_day / visits_by_hour are stringified lists, so scaling them
            # takes a bit more work
            df['visits_by_day_pop_calc'] = None
            df['visits_by_hour_pop_calc'] = None

            def multiply_list(by_day_list, this_multiplier):
                # parse the stringified list and scale every entry by this row's multiplier
                # (the multiplier is passed in explicitly rather than read from the
                # enclosing loop's `index`)
                visits_by_day = [float(x) for x in literal_eval(by_day_list)]
                return [v * this_multiplier for v in visits_by_day]

            for index, row in df.copy().iterrows():
                df.at[index, 'visits_by_day_pop_calc'] = multiply_list(row['visits_by_day'], multiplier_list[index])
                df.at[index, 'visits_by_hour_pop_calc'] = multiply_list(row['visits_by_each_hour'], multiplier_list[index])

            warnings.warn(str(df.head(5)))
            sys.stdout.flush()
            sys.stderr.flush()
            print(df.info())
            df.to_csv(Path(cwd) / f'poi_weekly_pop_added_{latest_date}.csv')
            s3.Bucket('recovery-data-partnership').upload_file(
                str(Path(cwd) / f'poi_weekly_pop_added_{latest_date}.csv'),
                f"output/dev/parks/safegraph-with-population/poi_with_population_count_{latest_date}.csv")
            sys.stdout.flush()
            sys.stderr.flush()

            # Extract parks data
            df_ans = pd.read_csv(Path(cwd) / f'poi_weekly_pop_added_{latest_date}.csv')
            parks_poi_df = pd.read_csv('nyc_parks_pois_keys_082021.csv')
            df_ans['placekey'] = df_ans['placekey'].astype(str)
            parks_poi_df['placekey'] = parks_poi_df['placekey'].astype(str)
            df_parks = pd.merge(parks_poi_df, df_ans, how='left', on='placekey')
            print(df_parks.head(6))
            print('saving parks slice csv')
            df_parks.to_csv(f'parks_slice_poi_{latest_date}.csv')
            s3.Bucket('recovery-data-partnership').upload_file(
                str(Path(cwd) / f'parks_slice_poi_{latest_date}.csv'),
                f"output/dev/parks/parks-slice-poi/parks_slice_poi_{latest_date}.csv")

            # clean up the per-date temp files
            try:
                os.remove(Path(cwd) / f'parks_slice_poi_{latest_date}.csv')
                os.remove(Path(cwd) / f'nyc_weekly_patterns_temp_{latest_date}.csv.zip')
                os.remove(Path(cwd) / f'multiplier_temp_{latest_date}.csv.zip')
                os.remove(Path(cwd) / f'poi_weekly_pop_added_{latest_date}.csv')
            except FileNotFoundError:
                print("file not found to remove")
            except PermissionError:
                print("couldn't remove because file is in use.")

        print("{} Successfully completed at {}".format(latest_date, datetime.now()))
        sys.stdout.flush()
        sys.stderr.flush()
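# Hypothetical driver for my_main: the `split_chunk` argument suggests the date
# list is split into chunks for parallel workers. A sketch under that assumption;
# the dates, worker count, and orchestration here are placeholders, not the
# repo's actual setup:
if __name__ == '__main__':
    from multiprocessing import Pool

    dates = ['2021-06-07', '2021-06-14', '2021-06-21', '2021-06-28']  # example dates
    n_workers = 2
    chunks = [dates[i::n_workers] for i in range(n_workers)]  # round-robin split
    with Pool(n_workers) as pool:
        pool.map(my_main, chunks)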
ON ST_WITHIN(ST_POINT(a.longitude, a.latitude), ST_POLYGON(b.wkt))
"""

# Load the current quarter
quarters = get_quarter()
# quarters = PastQs
tablename1 = 'weekly_nyc_poivisits'
tablename2 = 'weekly_nyc_poivisits_by_visitor_home_cbg'
tablename3 = 'poi_info'
for year_qrtr, date_range in quarters.items():
    start, end = date_range
    print(year_qrtr, start, end)
    # weekly_nyc_poivisits
    aws.execute_query(
        query=query1.format(start, end),
        database="safegraph",
        output=f"output/dev/ops/{tablename1}/{tablename1}_{year_qrtr}.csv.zip")
    # weekly_nyc_poivisits_by_visitor_home_cbg
    aws.execute_query(
        query=query2.format(start, end),
        database="safegraph",
        output=f"output/dev/ops/{tablename2}/{tablename2}_{year_qrtr}.csv.zip")
    # poi_info
    aws.execute_query(
        query=query3.format(poi_latest_date, geo_latest_date),
        database="safegraph",
        output=f"output/dev/ops/{tablename3}/{tablename3}.csv.zip")
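# The ST_WITHIN join above tags each POI point with the polygon that contains it.
# A minimal local equivalent with shapely, for intuition only (toy geometry, not
# part of the pipeline):
from shapely import wkt
from shapely.geometry import Point

polygon = wkt.loads("POLYGON ((0 0, 0 2, 2 2, 2 0, 0 0))")
poi = Point(1.0, 1.0)         # (longitude, latitude) order, as in ST_POINT
print(polygon.contains(poi))  # True -> this POI falls inside the polygon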
    SELECT DISTINCT safegraph_place_id, naics_code, street_address, latitude, longitude
    FROM "safegraph"."core_poi"
    WHERE region = 'NY' AND dt = CAST('{2}' AS DATE)
) b
ON a.safegraph_place_id = b.safegraph_place_id
GROUP BY EXTRACT(year_of_week from a.date_current), EXTRACT(week from a.date_current),
    a.location_name, b.naics_code, a.poi_cbg, b.street_address, b.latitude, b.longitude
ORDER BY year_week, poi_cbg
"""

# Load the current quarter
quarters = get_quarter()
# quarters = PastQs
for year_qrtr, date_range in quarters.items():
    start, end = date_range
    print(year_qrtr, start, end)
    aws.execute_query(
        query=query.format(start, end, poi_latest_date),
        database="safegraph",
        output=f"output/dev/poi/weekly_nyc_poivisits/weekly_nyc_poivisits_{year_qrtr}.csv.zip"
    )