def replace_dc_county_with_state_data(
    dataset_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Replace DC County data with data from State.

    Args:
        dataset_in: Input dataset.

    Returns:
        Dataset with DC county data replaced to match DC state.
    """
    dc_state_region = pipeline.Region.from_fips(DC_STATE_FIPS)
    dc_county_region = pipeline.Region.from_fips(DC_COUNTY_FIPS)

    dc_map = {dc_state_region: dc_county_region}

    # aggregate_regions only copies number columns. Extract the non-numeric ones and re-add
    # them to the aggregated dataset.
    static_excluding_numbers = dataset_in.get_regions_subset(
        [dc_county_region]
    ).static.select_dtypes(exclude="number")
    dc_county_dataset = timeseries.aggregate_regions(dataset_in, dc_map).add_static_values(
        static_excluding_numbers.reset_index()
    )
    dataset_without_dc_county = dataset_in.remove_regions([dc_county_region])

    return dataset_without_dc_county.append_regions(dc_county_dataset)

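# A minimal illustration (not used by the pipeline) of the select_dtypes split above,
# assuming pandas is available: aggregate_regions carries over only numeric static
# columns, so non-numeric columns such as a county name are pulled out beforehand and
# re-attached with add_static_values. Column names and values here are toy examples,
# not the dataset's real schema.
def _example_static_split():
    import pandas as pd

    toy_static = pd.DataFrame(
        {
            "location_id": ["toy-dc-county"],
            "county": ["District of Columbia"],
            "population": [700000],
        }
    ).set_index("location_id")
    # Keeps only the non-numeric "county" column; the numeric "population" column is
    # dropped because aggregate_regions already handles number columns itself.
    return toy_static.select_dtypes(exclude="number")
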
def update(aggregate_to_country: bool, state: Optional[str], fips: Optional[str]):
    """Updates the latest and timeseries datasets to the currently checked-out covid data public commit."""
    path_prefix = dataset_utils.DATA_DIRECTORY.relative_to(dataset_utils.REPO_ROOT)

    timeseries_field_datasets = load_datasets_by_field(
        ALL_TIMESERIES_FEATURE_DEFINITION, state=state, fips=fips
    )
    static_field_datasets = load_datasets_by_field(
        ALL_FIELDS_FEATURE_DEFINITION, state=state, fips=fips
    )

    multiregion_dataset = timeseries.combined_datasets(
        timeseries_field_datasets, static_field_datasets
    )

    # Filter for stalled cumulative values before deriving NEW_CASES from CASES.
    _, multiregion_dataset = TailFilter.run(multiregion_dataset, CUMULATIVE_FIELDS_TO_FILTER)
    multiregion_dataset = zeros_filter.drop_all_zero_timeseries(
        multiregion_dataset,
        [
            CommonFields.VACCINES_DISTRIBUTED,
            CommonFields.VACCINES_ADMINISTERED,
            CommonFields.VACCINATIONS_COMPLETED,
            CommonFields.VACCINATIONS_INITIATED,
        ],
    )
    multiregion_dataset = timeseries.add_new_cases(multiregion_dataset)
    multiregion_dataset = timeseries.drop_new_case_outliers(multiregion_dataset)
    multiregion_dataset = timeseries.backfill_vaccination_initiated(multiregion_dataset)
    multiregion_dataset = timeseries.drop_regions_without_population(
        multiregion_dataset, KNOWN_LOCATION_ID_WITHOUT_POPULATION, structlog.get_logger()
    )
    multiregion_dataset = timeseries.aggregate_puerto_rico_from_counties(multiregion_dataset)
    multiregion_dataset = custom_aggregations.aggregate_to_new_york_city(multiregion_dataset)
    multiregion_dataset = custom_aggregations.replace_dc_county_with_state_data(
        multiregion_dataset
    )

    aggregator = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
    cbsa_dataset = aggregator.aggregate(
        multiregion_dataset, reporting_ratio_required_to_aggregate=DEFAULT_REPORTING_RATIO
    )
    multiregion_dataset = multiregion_dataset.append_regions(cbsa_dataset)

    if aggregate_to_country:
        country_dataset = timeseries.aggregate_regions(
            multiregion_dataset,
            pipeline.us_states_to_country_map(),
            reporting_ratio_required_to_aggregate=DEFAULT_REPORTING_RATIO,
        )
        multiregion_dataset = multiregion_dataset.append_regions(country_dataset)

    combined_dataset_utils.persist_dataset(multiregion_dataset, path_prefix)

def aggregate(
    self, dataset_in: MultiRegionDataset, reporting_ratio_required_to_aggregate=None
) -> MultiRegionDataset:
    """Returns a dataset of CBSA regions, created by aggregating counties in the input data."""
    region_map = {
        pipeline.Region.from_fips(fips): pipeline.Region.from_cbsa_code(cbsa_code)
        for fips, cbsa_code in self.county_map.items()
    }

    return timeseries.aggregate_regions(
        dataset_in,
        region_map,
        self.aggregations,
        reporting_ratio_required_to_aggregate=reporting_ratio_required_to_aggregate,
    )

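# A minimal usage sketch of the aggregator, mirroring the call in update() above. It
# assumes the local public data checkout is present and that the combined_datasets,
# statistical_areas, and DEFAULT_REPORTING_RATIO names used elsewhere in these excerpts
# are importable here; the helper name itself is hypothetical.
def _example_append_cbsa_regions() -> MultiRegionDataset:
    us_dataset = combined_datasets.load_us_timeseries_dataset()
    aggregator = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
    cbsa_dataset = aggregator.aggregate(
        us_dataset, reporting_ratio_required_to_aggregate=DEFAULT_REPORTING_RATIO
    )
    # CBSA rows are appended alongside, not in place of, the county rows they summarize.
    return us_dataset.append_regions(cbsa_dataset)
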
def aggregate_to_new_york_city(
    ds_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Returns ds_in with an aggregated New York City region appended."""
    nyc_region = pipeline.Region.from_fips(NEW_YORK_CITY_FIPS)

    # Map from borough / county to the region used for aggregated NYC
    nyc_map = {borough_region: nyc_region for borough_region in ALL_NYC_REGIONS}

    # aggregate_regions only copies number columns. Extract the non-numeric ones and re-add
    # them to the aggregated dataset.
    static_excluding_numbers = ds_in.get_regions_subset([nyc_region]).static.select_dtypes(
        exclude="number"
    )
    nyc_dataset = timeseries.aggregate_regions(
        ds_in, nyc_map, reporting_ratio_required_to_aggregate=None
    ).add_static_values(static_excluding_numbers.reset_index())

    return ds_in.append_regions(nyc_dataset)

def aggregate_states_to_country(output_path: pathlib.Path):
    """Aggregates state data into a country-level dataset and writes it to output_path as a CSV."""
    us_timeseries = combined_datasets.load_us_timeseries_dataset()
    country_dataset = timeseries.aggregate_regions(
        us_timeseries, pipeline.us_states_to_country_map(),
    )
    country_dataset.to_csv(output_path)

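# A minimal usage sketch; the output path below is hypothetical. Unlike the country
# aggregation in update(), this helper does not require a minimum reporting ratio.
def _example_write_country_csv() -> None:
    aggregate_states_to_country(pathlib.Path("data/us-country-aggregate.csv"))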