def read_dilation_index(self, start_date, end_date):
    df = pd.read_csv(config.di_csv_path,
                     usecols=['FIPS', 'Date', 'DI'],
                     dtype={'FIPS': str})

    d = start_date
    output_dfs = []
    while d < end_date:
        output_dfs.append(
            df[df['Date'] == str(d)].set_index('FIPS').drop(columns=['Date']))
        d += timedelta(days=1)

    return TimeDependentFeatures(output_dfs,
                                 'di',
                                 start_date,
                                 timedelta(days=1),
                                 feature_saver=saver_config.dilation_index)
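# Illustrative sketch (not part of the original module): shows the per-day
# slicing pattern used by read_dilation_index on a toy frame. The column names
# mirror the DI CSV; the FIPS codes and values below are made up.
def _example_daily_di_slices():
    import pandas as pd
    from datetime import date, timedelta

    df = pd.DataFrame({
        'FIPS': ['01001', '01003', '01001', '01003'],
        'Date': ['2020-03-01', '2020-03-01', '2020-03-02', '2020-03-02'],
        'DI': [3.1, 2.7, 3.4, 2.9],
    })
    start, end = date(2020, 3, 1), date(2020, 3, 3)
    slices, d = [], start
    while d < end:
        slices.append(
            df[df['Date'] == str(d)].set_index('FIPS').drop(columns=['Date']))
        d += timedelta(days=1)
    return slices  # one FIPS-indexed frame per day, each with a single 'DI' column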
def read_weather_data(self, start_date, end_date):
    county_dfs = []

    # load county data and build a df with the relevant data for each county
    counties = features_config.county_info
    attributes = '&datatypeid='.join(config.weather_attributes)
    for county in counties.index:
        result = requests.get(
            "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?"
            "datasetid=GHCND&locationid=FIPS:{}&startdate={}&enddate={}&limit=1000"
            "&datatypeid={}".format(str(county), str(start_date),
                                    str(end_date), attributes),
            headers={"token": config.weather_token})

        if result.status_code != 200:
            logging.error(
                'Unable to connect and retrieve data from NOAA. '
                'Status code: %s', result.status_code)
            continue

        result_json = result.json()
        if result_json:
            logging.info(f'Received data for county {county}')
            df = pd.json_normalize(result_json, 'results')
            df = df[df['datatype'].isin(config.weather_attributes)]
            df['date'] = df['date'].str[:10]
            df = df.groupby(['date', 'datatype']).agg({
                'value': 'mean'
            }).reset_index()
            df['FIPS'] = county
            county_dfs.append(df)

    # join all county data
    county_dfs = pd.concat(county_dfs, ignore_index=True)

    # filter dfs by day
    dfs_per_day = []
    dates = county_dfs['date'].drop_duplicates().sort_values()
    for d in dates:
        dfs_per_day.append(county_dfs[county_dfs['date'] == d].pivot(
            index='FIPS', columns='datatype', values='value'))

    return TimeDependentFeatures(dfs_per_day,
                                 'weather_data',
                                 start_date,
                                 timedelta(days=1),
                                 feature_saver=saver_config.weather)
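# Illustrative sketch (not part of the original module): shows how the NOAA
# results are reshaped from long format (date, datatype, value) into one
# FIPS-by-datatype frame per day, as read_weather_data does after concatenating
# the per-county frames. The datatypes, FIPS codes, and values are made up.
def _example_weather_pivot():
    import pandas as pd

    county_dfs = pd.DataFrame({
        'date': ['2020-03-01'] * 4,
        'datatype': ['TMAX', 'TMIN', 'TMAX', 'TMIN'],
        'value': [150.0, 40.0, 170.0, 60.0],
        'FIPS': ['01001', '01001', '01003', '01003'],
    })
    day = county_dfs[county_dfs['date'] == '2020-03-01']
    # rows 01001/01003, one column per datatype (TMAX, TMIN)
    return day.pivot(index='FIPS', columns='datatype', values='value')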
def read_sg_social_distancing(self, start_date, end_date):
    output_dfs = []

    files = config.sg_social_distancing_reader.get_files_between(
        start_date, end_date)

    for csv_file, cur_date, _ in files:
        df = pd.read_csv(
            csv_file,
            usecols=[
                'origin_census_block_group', 'date_range_start',
                'date_range_end', 'device_count',
                'distance_traveled_from_home',
                'completely_home_device_count', 'median_home_dwell_time',
                'part_time_work_behavior_devices',
                'full_time_work_behavior_devices'
            ],
            dtype={'origin_census_block_group': str},
        ).set_index('origin_census_block_group')
        logging.info(f'Successfully read {csv_file}')

        # prepare for weighted average
        df['distance_traveled_from_home'] *= df['device_count']
        df['median_home_dwell_time'] *= df['device_count']

        # the first five digits of a census block group are the county FIPS
        df = df.groupby(lambda cbg: cbg[:5]).sum()
        logging.info('Grouped by counties')

        df['completely_home_device_count'] /= df['device_count']
        df['part_time_work_behavior_devices'] /= df['device_count']
        df['full_time_work_behavior_devices'] /= df['device_count']
        df['distance_traveled_from_home'] /= df['device_count']
        df['median_home_dwell_time'] /= df['device_count']

        df = df.drop(['device_count'], axis=1)

        output_dfs.append(df.dropna())

    return TimeDependentFeatures(output_dfs,
                                 'sg_social_distancing',
                                 start_date,
                                 timedelta(days=1),
                                 feature_saver=saver_config.sg_social_distancing)
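# Illustrative sketch (not part of the original module): demonstrates the
# device-count weighting used in read_sg_social_distancing. Multiplying by
# device_count before the county-level sum and dividing afterwards gives a
# device-weighted mean per county; the toy CBG ids and values are made up.
def _example_weighted_county_mean():
    import pandas as pd

    toy = pd.DataFrame(
        {
            'device_count': [10, 30],
            'distance_traveled_from_home': [2.0, 6.0],
        },
        index=['010010201001', '010010201002'],  # two CBGs in county 01001
    )
    toy['distance_traveled_from_home'] *= toy['device_count']
    county = toy.groupby(lambda cbg: cbg[:5]).sum()
    county['distance_traveled_from_home'] /= county['device_count']
    return county  # county 01001: (10*2 + 30*6) / 40 = 5.0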
def read_jsi_OxCGRT(self, start_date, end_date):
    output_dfs = []

    files = config.jsi_OxCGRT_reader.get_files_between(start_date, end_date)

    for csv_file, cur_date, _ in files:
        df = pd.read_csv(
            csv_file,
            usecols=[
                'CountryName', 'CountryCode', 'RegionName', 'RegionCode',
                'Date', 'C1_School closing', 'C2_Workplace closing',
                'C3_Cancel public events', 'C4_Restrictions on gatherings',
                'C5_Close public transport', 'C6_Stay at home requirements',
                'C7_Restrictions on internal movement',
                'C8_International travel controls', 'E1_Income support',
                'E2_Debt/contract relief', 'E3_Fiscal measures',
                'E4_International support', 'H1_Public information campaigns',
                'H2_Testing policy', 'H3_Contact tracing',
                'H4_Emergency investment in healthcare',
                'H5_Investment in vaccines', 'H6_Facial Coverings'
            ],
            index_col='CountryName')

        # Create a new index column from the country and region codes
        df['FIPS'] = df['CountryCode'] + "_" + df['RegionCode'].astype(str)
        df = df.set_index('FIPS')

        # Use a lowercase 'date' column so it matches the other readers
        df.rename(columns={'Date': 'date'}, inplace=True)
        df = df.where(df.notnull(), None)

        output_dfs.append(df)

    return TimeDependentFeatures(output_dfs,
                                 'jsi-OxCGRT',
                                 start_date,
                                 timedelta(days=1),
                                 feature_saver=saver_config.jsi_OxCGRT)
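# Illustrative sketch (not part of the original module): shows the composite
# CountryCode_RegionCode index that read_jsi_OxCGRT builds for OxCGRT rows.
# The rows and policy values below are made up.
def _example_oxcgrt_index():
    import pandas as pd

    df = pd.DataFrame({
        'CountryCode': ['USA', 'USA'],
        'RegionCode': ['US_NY', None],
        'C1_School closing': [3, 2],
    })
    df['FIPS'] = df['CountryCode'] + '_' + df['RegionCode'].astype(str)
    # index values become 'USA_US_NY' and 'USA_None' (country-level row)
    return df.set_index('FIPS')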
def read_num_cases(self, start_date, end_date):
    df = pd.read_csv(config.labels_csv_path,
                     usecols=['date', 'fips', 'cases'],
                     dtype={'fips': str}).dropna().set_index('fips')

    output_dfs = []
    interval = timedelta(days=hyperparams.projection_days)

    cur_date = start_date
    while cur_date < end_date:
        df_old = df[df['date'] == str(cur_date - interval)]
        df_new = df[df['date'] == str(cur_date)]

        cur_df = df_old.merge(df_new,
                              how='right',
                              left_index=True,
                              right_index=True,
                              suffixes=('_start', '_end'))
        cur_df['new_cases'] = cur_df['cases_end'].subtract(
            cur_df['cases_start'], fill_value=0)
        cur_df.drop(['cases_end', 'cases_start', 'date_end', 'date_start'],
                    axis=1,
                    inplace=True)
        cur_df = cur_df[cur_df['new_cases'] >= 0]  # negatives are errors

        output_dfs.append(cur_df)
        logging.info('Processed num cases for ' + str(cur_date))
        cur_date += timedelta(days=1)

    return TimeDependentFeatures(output_dfs,
                                 'new_cases',
                                 start_date,
                                 timedelta(days=1),
                                 feature_saver=saver_config.num_cases)
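# Illustrative sketch (not part of the original module): shows the suffixed
# right-merge that read_num_cases uses to turn cumulative case counts into new
# cases over the projection window. The counties and counts are made up.
def _example_new_cases():
    import pandas as pd

    old = pd.DataFrame({'cases': [100, 50]},
                       index=pd.Index(['01001', '01003'], name='fips'))
    new = pd.DataFrame({'cases': [130, 55]},
                       index=pd.Index(['01001', '01003'], name='fips'))
    merged = old.merge(new,
                       how='right',
                       left_index=True,
                       right_index=True,
                       suffixes=('_start', '_end'))
    merged['new_cases'] = merged['cases_end'].subtract(merged['cases_start'],
                                                       fill_value=0)
    return merged[merged['new_cases'] >= 0][['new_cases']]  # 30 and 5 new cases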
def read_reproduction_number(self, start_date, end_date):
    df = pd.read_csv(config.ri_csv_path, dtype={'fips': str})
    df = df.drop(columns=['state', 'county']).set_index('fips').sort_index()

    output_dfs = []
    d = start_date
    while d < end_date:
        if str(d) in df.columns:
            output_dfs.append(
                df[str(d)].to_frame().rename(columns={str(d): 'rn'}).fillna(0))
        else:
            output_dfs.append(
                pd.DataFrame(index=df.index, columns=['rn']).fillna(0))
        d += timedelta(days=1)

    return TimeDependentFeatures(output_dfs,
                                 'reproduction_index',
                                 start_date,
                                 timedelta(days=1),
                                 feature_saver=saver_config.reproduction_number)
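# Illustrative sketch (not part of the original module): shows how
# read_reproduction_number pulls a single date column out of the wide R_t
# table, falling back to a zero-filled frame when the date is missing.
# The FIPS codes and values are made up.
def _example_daily_rn_column():
    import pandas as pd

    wide = pd.DataFrame({'2020-03-01': [1.2, None]},
                        index=pd.Index(['01001', '01003'], name='fips'))
    day = '2020-03-01'
    if day in wide.columns:
        return wide[day].to_frame().rename(columns={day: 'rn'}).fillna(0)
    return pd.DataFrame(index=wide.index, columns=['rn']).fillna(0)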
def read_sg_patterns_monthly(self, start_date, end_date):
    files = config.sg_patterns_monthly_reader.get_files_between(
        start_date, end_date)

    main_df = pd.DataFrame()

    logging.info('Reading Safegraph Patterns Monthly Data')

    for csv_file, month_start, month_end in files:
        # slice the per-day visit counts to the days of this month that fall
        # inside the requested window
        index_start = month_start.day - 1
        index_end = (month_end - timedelta(1)).day

        df = pd.read_csv(
            csv_file,
            usecols=[
                'safegraph_place_id',
                'visits_by_day'
                # bucketed_dwell_times may be useful to see
                # how long people stayed
            ],
            converters={
                'visits_by_day': (lambda x: np.array([
                    int(s) for s in re.split(r'[,\s]\s*', x.strip('[]'))
                ])[index_start:index_end])
            })
        logging.info(f'Successfully read {csv_file}...')

        # decompose visits by day into different columns
        decomposed_visits_df = pd.DataFrame(
            df['visits_by_day'].values.tolist(),
            columns=self._get_names_starting_with(start_date, month_start,
                                                  month_end, 'visits_day_'))
        for c in decomposed_visits_df.columns:
            df[c] = decomposed_visits_df[c]
        df = df.drop(['visits_by_day'], axis=1)
        logging.info('Decomposed visits per day')

        # find FIPS and category of poi
        df['countyFIPS'] = df['safegraph_place_id'].apply(
            lambda x: self.poi_info[x]['countyFIPS']
            if x in self.poi_info and self.poi_info[x]['countyFIPS']
            else '00000')
        df['top_category'] = df['safegraph_place_id'].apply(
            lambda x: self.poi_info[x]['top_category']
            if x in self.poi_info and self.poi_info[x]['top_category']
            else 'Unknown')
        logging.info('Finished getting categories')

        top_cats = set()
        for k in self.poi_info:
            if isinstance(self.poi_info[k]['top_category'], str):
                top_cats.add(self.poi_info[k]['top_category'])

        # one column per (category, day) pair, counting visits to that category
        for cat in top_cats:
            colname = cat.translate(
                str.maketrans('', '',
                              string.punctuation)).lower().replace(' ', '_')
            for suffix in self._get_names_starting_with(
                    start_date, month_start, month_end, '_visits_day_'):
                df[colname + suffix] = \
                    df[suffix[1:]] * (df['top_category'] == cat)
        logging.info('Finished creating category columns')

        df = df.groupby('countyFIPS').sum()
        logging.info('Finished grouping by FIPS code')

        # merge this month into the running frame, summing any shared columns
        common_cols = main_df.columns.intersection(df.columns)
        main_df = df.merge(main_df,
                           how='outer',
                           suffixes=('_l', '_r'),
                           left_index=True,
                           right_index=True)
        cols_to_remove = []
        for c in common_cols:
            main_df[c] = main_df[c + '_l'].add(main_df[c + '_r'], fill_value=0)
            cols_to_remove.append(c + '_l')
            cols_to_remove.append(c + '_r')
        main_df.drop(cols_to_remove, axis=1, inplace=True)
        logging.info('Finished merging columns')

    # split the wide frame back into one frame per day
    output_dfs = []
    for col_suffix in self._get_names_starting_with(start_date, start_date,
                                                    end_date, 'day_'):
        cols = [c for c in main_df.columns if c.endswith(col_suffix)]
        renamed_cols = {c: c[:-len(col_suffix)] for c in cols}
        output_dfs.append(main_df[cols].rename(columns=renamed_cols))

    return TimeDependentFeatures(output_dfs,
                                 'sg_patterns_monthly',
                                 start_date,
                                 timedelta(days=1),
                                 feature_saver=saver_config.sg_patterns_monthly)
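# Illustrative sketch (not part of the original module): shows the suffix-based
# column split used at the end of read_sg_patterns_monthly to turn one wide
# county frame into per-day frames. The column names and counts below are
# hypothetical; the real suffixes come from self._get_names_starting_with.
def _example_split_by_day_suffix():
    import pandas as pd

    wide = pd.DataFrame({
        'restaurants_visits_day_0': [5, 2],
        'parks_visits_day_0': [1, 0],
        'restaurants_visits_day_1': [7, 3],
        'parks_visits_day_1': [2, 1],
    }, index=['01001', '01003'])

    per_day = []
    for suffix in ['day_0', 'day_1']:
        cols = [c for c in wide.columns if c.endswith(suffix)]
        per_day.append(
            wide[cols].rename(columns={c: c[:-len(suffix)] for c in cols}))
    return per_day  # one frame per day, with the day suffix stripped from columns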