    def read_dilation_index(self, start_date, end_date):
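        """Read the dilation index (DI) CSV and split it into one DataFrame
        per day in [start_date, end_date), indexed by county FIPS."""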
        df = pd.read_csv(config.di_csv_path,
                         usecols=['FIPS', 'Date', 'DI'],
                         dtype={'FIPS': str})
        d = start_date
        output_dfs = []
        while d < end_date:
            output_dfs.append(df[df['Date'] == str(d)].set_index('FIPS').drop(
                columns=['Date']))
            d += timedelta(1)

        return TimeDependentFeatures(output_dfs,
                                     'di',
                                     start_date,
                                     timedelta(days=1),
                                     feature_saver=saver_config.dilation_index)

    def read_weather_data(self, start_date, end_date):
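        """Fetch daily GHCND observations for every county from the NOAA CDO
        API, average them per date and datatype, and return one
        county-indexed DataFrame per day."""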
        county_dfs = []
        # load county data and make df with relevant data for the county
        counties = features_config.county_info
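        # repeat the datatypeid query parameter once per requested attribute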
        attributes = '&datatypeid='.join(config.weather_attributes)
        for county in counties.index:
            result = requests.get(
                "https://www.ncdc.noaa.gov/cdo-web/api/v2/data?"
                "datasetid=GHCND&locationid=FIPS:{}&startdate={}&enddate={}&limit=1000"
                "&datatypeid={}".format(str(county), str(start_date),
                                        str(end_date), attributes),
                headers={"token": config.weather_token})

            if result.status_code != 200:
                logging.error(
                    "Unable to connect and retrieve data from NOAA. "
                    "Status code: %s", result.status_code)
                continue

            result_json = result.json()
            if result_json:
                logging.info(f'Received data for county {county}')

                df = pd.json_normalize(result_json, 'results')
                df = df[df['datatype'].isin(config.weather_attributes)]
                df['date'] = df['date'].str[:10]
                df = df.groupby(['date', 'datatype']).agg({
                    'value': 'mean'
                }).reset_index()
                df['FIPS'] = county
                county_dfs.append(df)

        # join all county data
        county_dfs = pd.concat(county_dfs, ignore_index=True)

        # filter dfs by day
        dfs_per_day = []
        dates = county_dfs['date'].drop_duplicates().sort_values()
        for d in dates:
            dfs_per_day.append(county_dfs[county_dfs['date'] == d].pivot(
                index='FIPS', columns='datatype', values='value'))

        return TimeDependentFeatures(dfs_per_day,
                                     'weather_data',
                                     start_date,
                                     timedelta(1),
                                     feature_saver=saver_config.weather)

    def read_sg_social_distancing(self, start_date, end_date):
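        """Read SafeGraph social distancing files between start_date and
        end_date and aggregate census block group rows to counties,
        weighting the metrics by device_count."""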
        output_dfs = []

        files = config.sg_social_distancing_reader.get_files_between(
            start_date, end_date)

        for csv_file, cur_date, _ in files:
            df = pd.read_csv(
                csv_file,
                usecols=[
                    'origin_census_block_group', 'date_range_start',
                    'date_range_end', 'device_count',
                    'distance_traveled_from_home',
                    'completely_home_device_count', 'median_home_dwell_time',
                    'part_time_work_behavior_devices',
                    'full_time_work_behavior_devices'
                ],
                dtype={
                    'origin_census_block_group': str
                },
            ).set_index('origin_census_block_group')

            logging.info(f'Successfully read {csv_file}')

            # prepare for weighted average
            df['distance_traveled_from_home'] *= df['device_count']
            df['median_home_dwell_time'] *= df['device_count']

            df = df.groupby(lambda cbg: cbg[:5]).sum()
            logging.info('Grouped by counties')

            df['completely_home_device_count'] /= df['device_count']
            df['part_time_work_behavior_devices'] /= df['device_count']
            df['full_time_work_behavior_devices'] /= df['device_count']
            df['distance_traveled_from_home'] /= df['device_count']
            df['median_home_dwell_time'] /= df['device_count']

            df = df.drop(['device_count'], axis=1)

            output_dfs.append(df.dropna())

        return \
            TimeDependentFeatures(output_dfs, 'sg_social_distancing',
                                  start_date, timedelta(days=1),
                                  feature_saver=saver_config.sg_social_distancing)

    def read_jsi_OxCGRT(self, start_date, end_date):
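        """Read OxCGRT policy indicator files between start_date and
        end_date, keyed by a CountryCode_RegionCode identifier for each
        day."""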
        output_dfs = []

        files = config.jsi_OxCGRT_reader.get_files_between(
            start_date, end_date)

        for csv_file, cur_date, _ in files:
            df = pd.read_csv(
                csv_file,
                usecols=[
                    'CountryName', 'CountryCode', 'RegionName', 'RegionCode',
                    'Date', 'C1_School closing', 'C2_Workplace closing',
                    'C3_Cancel public events', 'C4_Restrictions on gatherings',
                    'C5_Close public transport',
                    'C6_Stay at home requirements',
                    'C7_Restrictions on internal movement',
                    'C8_International travel controls', 'E1_Income support',
                    'E2_Debt/contract relief', 'E3_Fiscal measures',
                    'E4_International support',
                    'H1_Public information campaigns', 'H2_Testing policy',
                    'H3_Contact tracing',
                    'H4_Emergency investment in healthcare',
                    'H5_Investment in vaccines', 'H6_Facial Coverings'
                ],
                index_col='CountryName')

            # Create new index col
            df['FIPS'] = df['CountryCode'] + "_" + df['RegionCode'].astype(str)
            df = df.set_index('FIPS')

            # Use lowercase 'date' to stay consistent with the other readers
            df.rename(columns={'Date': 'date'}, inplace=True)

            df = df.where(df.notnull(), None)
            output_dfs.append(df)

        return TimeDependentFeatures(output_dfs,
                                     'jsi-OxCGRT',
                                     start_date,
                                     timedelta(days=1),
                                     feature_saver=saver_config.jsi_OxCGRT)

    def read_num_cases(self, start_date, end_date):
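        """Compute per-county new case counts over the projection window for
        each day in [start_date, end_date)."""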
        df = pd.read_csv(config.labels_csv_path,
                         usecols=['date', 'fips', 'cases'],
                         dtype={
                             'fips': str
                         }).dropna().set_index('fips')

        output_dfs = []

        interval = timedelta(hyperparams.projection_days)

        cur_date = start_date
        while cur_date < end_date:
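            # cumulative counts projection_days apart; their difference is
            # the number of new cases assigned to cur_date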
            df_old = df[df['date'] == str(cur_date - interval)]
            df_new = df[df['date'] == str(cur_date)]

            cur_df = df_old.merge(df_new,
                                  how='right',
                                  left_index=True,
                                  right_index=True,
                                  suffixes=('_start', '_end'))
            cur_df['new_cases'] = cur_df['cases_end'].subtract(
                cur_df['cases_start'], fill_value=0)
            cur_df.drop(['cases_end', 'cases_start', 'date_end', 'date_start'],
                        axis=1,
                        inplace=True)

            cur_df = cur_df[cur_df['new_cases'] >= 0]  # negatives are errors

            output_dfs.append(cur_df)

            logging.info('Processed num cases for ' + str(cur_date))

            cur_date += timedelta(days=1)

        return TimeDependentFeatures(output_dfs,
                                     'new_cases',
                                     start_date,
                                     timedelta(days=1),
                                     feature_saver=saver_config.num_cases)

    def read_reproduction_number(self, start_date, end_date):
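        """Read the reproduction number CSV (one column per date) and return
        a one-column 'rn' DataFrame per day, zero-filled for missing
        dates."""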
        df = pd.read_csv(config.ri_csv_path, dtype={'fips': str})
        df = df.drop(
            columns=['state', 'county']).set_index('fips').sort_index()

        output_dfs = []

        d = start_date
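        # the CSV has one column per date; dates without a column get zeros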
        while d < end_date:
            if str(d) in df.columns:
                output_dfs.append(df[str(d)].to_frame().rename(columns={
                    str(d): 'rn'
                }).fillna(0))
            else:
                output_dfs.append(
                    pd.DataFrame(index=df.index, columns=['rn']).fillna(0))
            d += timedelta(1)

        return TimeDependentFeatures(
            output_dfs,
            'reproduction_index',
            start_date,
            timedelta(days=1),
            feature_saver=saver_config.reproduction_number)

    def read_sg_patterns_monthly(self, start_date, end_date):
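        """Read SafeGraph monthly patterns files, expand visits_by_day into
        per-day, per-category visit counts aggregated by county FIPS, and
        return one DataFrame per day."""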
        files = config.sg_patterns_monthly_reader.get_files_between(
            start_date, end_date)

        main_df = pd.DataFrame()

        logging.info('Reading Safegraph Patterns Monthly Data')

        for csv_file, month_start, month_end in files:
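            # keep only the days of this month that fall in the requested
            # range; visits_by_day is indexed from day 1 of the month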

            index_start = month_start.day - 1
            index_end = (month_end - timedelta(1)).day

            df = pd.read_csv(
                csv_file,
                usecols=[
                    'safegraph_place_id', 'visits_by_day'
                    # bucketed_dwell_times may be useful to see
                    # how long people stayed
                ],
                converters={
                    'visits_by_day': (lambda x: np.array([
                        int(s) for s in re.split(r'[,\s]\s*', x.strip('[]'))
                    ])[index_start:index_end])
                })
            logging.info(f'Successfully read {csv_file}...')

            # decompose visits by day into different columns
            decomposed_visits_df = pd.DataFrame(
                df['visits_by_day'].values.tolist(),
                columns=self._get_names_starting_with(start_date, month_start,
                                                      month_end,
                                                      'visits_day_'))

            for c in decomposed_visits_df.columns:
                df[c] = decomposed_visits_df[c]

            df = df.drop(['visits_by_day'], axis=1)

            logging.info('Decomposed visits per day')

            # find FIPS and category of poi
            df['countyFIPS'] = df['safegraph_place_id'].apply(
                lambda x: self.poi_info[x]['countyFIPS'] if x in self.poi_info
                and self.poi_info[x]['countyFIPS'] else '00000')

            df['top_category'] = df['safegraph_place_id'].apply(
                lambda x: self.poi_info[x]['top_category']
                if x in self.poi_info and self.poi_info[x]['top_category']
                else 'Unknown')
            logging.info('Finished getting categories')
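            # collect every POI top_category so a per-category visit column
            # can be built for each day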

            top_cats = set()
            for k in self.poi_info:
                if isinstance(self.poi_info[k]['top_category'], str):
                    top_cats.add(self.poi_info[k]['top_category'])

            for cat in top_cats:
                colname = cat.translate(
                    str.maketrans('', '', string.punctuation)).lower().replace(
                        ' ', '_')
                for suffix in self._get_names_starting_with(
                        start_date, month_start, month_end, '_visits_day_'):
                    df[colname + suffix] = \
                        df[suffix[1:]] * (df['top_category'] == cat)

            logging.info('Finished creating category columns')

            df = df.groupby('countyFIPS').sum()

            logging.info('Finished grouping by FIPS code')

            common_cols = main_df.columns.intersection(df.columns)

            main_df = df.merge(main_df,
                               how='outer',
                               suffixes=('_l', '_r'),
                               left_index=True,
                               right_index=True)

            cols_to_remove = []
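            # day columns shared with earlier months were split into _l/_r
            # by the merge; add them back together and drop the suffixed pair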
            for c in common_cols:
                main_df[c] = main_df[c + '_l'].add(main_df[c + '_r'],
                                                   fill_value=0)
                cols_to_remove.append(c + '_l')
                cols_to_remove.append(c + '_r')

            main_df.drop(cols_to_remove, axis=1, inplace=True)
            logging.info('Finished merging columns')

        output_dfs = []
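        # split the merged frame into one DataFrame per day, stripping the
        # day suffix from the category column names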
        for col_suffix in self._get_names_starting_with(
                start_date, start_date, end_date, 'day_'):
            cols = [c for c in main_df.columns if c.endswith(col_suffix)]
            renamed_cols = {}
            for c in cols:
                renamed_cols[c] = c[:-len(col_suffix)]
            output_dfs.append(main_df[cols].rename(columns=renamed_cols))

        return \
            TimeDependentFeatures(output_dfs, 'sg_patterns_monthly', start_date,
                                  timedelta(days=1),
                                  feature_saver=saver_config.sg_patterns_monthly)