Example #1
0
def replace_dc_county_with_state_data(
    dataset_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Swap the DC county's data for data taken from the DC state.

    Args:
        dataset_in: Dataset containing the DC state and DC county regions.

    Returns: A dataset in which the DC county region has been rebuilt from the DC state.
    """
    state_region = pipeline.Region.from_fips(DC_STATE_FIPS)
    county_region = pipeline.Region.from_fips(DC_COUNTY_FIPS)

    # aggregate_regions only carries over numeric columns, so set the county's non-numeric
    # static values aside here and add them back once the aggregation is done.
    county_static = dataset_in.get_regions_subset([county_region]).static
    non_numeric_static = county_static.select_dtypes(exclude="number")

    # A single-entry map: the state is the only region aggregated into the county.
    state_as_county = timeseries.aggregate_regions(dataset_in, {state_region: county_region})
    rebuilt_county = state_as_county.add_static_values(non_numeric_static.reset_index())

    # Drop the original county before appending its replacement.
    everything_else = dataset_in.remove_regions([county_region])
    return everything_else.append_regions(rebuilt_county)
Example #2
0
def update(aggregate_to_country: bool, state: Optional[str], fips: Optional[str]):
    """Rebuild the combined multi-region dataset and persist it.

    The per-field source datasets are loaded and combined, then run through the filtering,
    derivation and aggregation steps below, in order, before being saved.

    Args:
        aggregate_to_country: When true, also append the result of aggregating with
            `pipeline.us_states_to_country_map()`.
        state: Optional state forwarded to `load_datasets_by_field`.
        fips: Optional FIPS forwarded to `load_datasets_by_field`.
    """
    path_prefix = dataset_utils.DATA_DIRECTORY.relative_to(dataset_utils.REPO_ROOT)

    by_field_timeseries = load_datasets_by_field(
        ALL_TIMESERIES_FEATURE_DEFINITION, state=state, fips=fips
    )
    by_field_static = load_datasets_by_field(ALL_FIELDS_FEATURE_DEFINITION, state=state, fips=fips)
    dataset = timeseries.combined_datasets(by_field_timeseries, by_field_static)

    # The tail filter must run on cumulative values before NEW_CASES is derived from CASES.
    _, dataset = TailFilter.run(dataset, CUMULATIVE_FIELDS_TO_FILTER)

    vaccine_fields = [
        CommonFields.VACCINES_DISTRIBUTED,
        CommonFields.VACCINES_ADMINISTERED,
        CommonFields.VACCINATIONS_COMPLETED,
        CommonFields.VACCINATIONS_INITIATED,
    ]
    dataset = zeros_filter.drop_all_zero_timeseries(dataset, vaccine_fields)

    dataset = timeseries.add_new_cases(dataset)
    dataset = timeseries.drop_new_case_outliers(dataset)
    dataset = timeseries.backfill_vaccination_initiated(dataset)
    dataset = timeseries.drop_regions_without_population(
        dataset, KNOWN_LOCATION_ID_WITHOUT_POPULATION, structlog.get_logger()
    )

    # Region-specific aggregations / replacements.
    dataset = timeseries.aggregate_puerto_rico_from_counties(dataset)
    dataset = custom_aggregations.aggregate_to_new_york_city(dataset)
    dataset = custom_aggregations.replace_dc_county_with_state_data(dataset)

    # Append CBSA regions built from the data assembled so far.
    cbsa_aggregator = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
    cbsa_regions = cbsa_aggregator.aggregate(
        dataset, reporting_ratio_required_to_aggregate=DEFAULT_REPORTING_RATIO
    )
    dataset = dataset.append_regions(cbsa_regions)

    if aggregate_to_country:
        country_regions = timeseries.aggregate_regions(
            dataset,
            pipeline.us_states_to_country_map(),
            reporting_ratio_required_to_aggregate=DEFAULT_REPORTING_RATIO,
        )
        dataset = dataset.append_regions(country_regions)

    combined_dataset_utils.persist_dataset(dataset, path_prefix)
    def aggregate(
        self, dataset_in: MultiRegionDataset, reporting_ratio_required_to_aggregate=None
    ) -> MultiRegionDataset:
        """Aggregate the counties of `dataset_in` into CBSA regions and return that dataset.

        Args:
            dataset_in: Dataset holding the county regions to aggregate.
            reporting_ratio_required_to_aggregate: Forwarded unchanged to
                `timeseries.aggregate_regions`.
        """
        # Translate the FIPS -> CBSA code pairs in self.county_map into Region -> Region pairs.
        county_to_cbsa = {}
        for county_fips, cbsa_code in self.county_map.items():
            county_region = pipeline.Region.from_fips(county_fips)
            county_to_cbsa[county_region] = pipeline.Region.from_cbsa_code(cbsa_code)

        cbsa_dataset = timeseries.aggregate_regions(
            dataset_in,
            county_to_cbsa,
            self.aggregations,
            reporting_ratio_required_to_aggregate=reporting_ratio_required_to_aggregate,
        )
        return cbsa_dataset
Example #4
0
def aggregate_to_new_york_city(
    ds_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Aggregate the regions in ALL_NYC_REGIONS into one NYC region and append it to `ds_in`.

    Args:
        ds_in: Input dataset.

    Returns: `ds_in` with the aggregated NYC region appended.
    """
    nyc = pipeline.Region.from_fips(NEW_YORK_CITY_FIPS)

    # Every borough / county region is aggregated into the single NYC region.
    borough_to_nyc = {}
    for borough in ALL_NYC_REGIONS:
        borough_to_nyc[borough] = nyc

    # aggregate_regions only carries over numeric columns, so take the non-numeric static
    # values of the NYC region from the input and add them to the aggregated result.
    nyc_static = ds_in.get_regions_subset([nyc]).static
    non_numeric_static = nyc_static.select_dtypes(exclude="number")

    aggregated = timeseries.aggregate_regions(
        ds_in, borough_to_nyc, reporting_ratio_required_to_aggregate=None
    )
    nyc_only = aggregated.add_static_values(non_numeric_static.reset_index())

    return ds_in.append_regions(nyc_only)
Example #5
0
def aggregate_states_to_country(output_path: pathlib.Path):
    """Aggregate the US timeseries dataset with the states-to-country map and write it as CSV.

    Args:
        output_path: Path passed to the aggregated dataset's `to_csv`.
    """
    source = combined_datasets.load_us_timeseries_dataset()
    states_to_country = pipeline.us_states_to_country_map()
    aggregated = timeseries.aggregate_regions(source, states_to_country)
    aggregated.to_csv(output_path)