# Example 1
def replace_dc_county_with_state_data(
    dataset_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Replace DC County data with data from State.

    Args:
        dataset_in: Input dataset.

    Returns: Dataset with DC county data replaced to match DC state.
    """
    state_region = pipeline.Region.from_fips(DC_STATE_FIPS)
    county_region = pipeline.Region.from_fips(DC_COUNTY_FIPS)

    # Mapping the DC state region to the DC county region makes
    # aggregate_regions emit a county row whose numbers mirror the state.
    state_to_county = {state_region: county_region}

    with_dc_county, without_dc_county = dataset_in.partition_by_region(
        [county_region])

    # aggregate_regions only copies number columns, so hold on to the
    # county's non-numeric static columns and re-attach them afterwards.
    non_numeric_static = with_dc_county.static.select_dtypes(exclude="number")
    aggregated = region_aggregation.aggregate_regions(dataset_in, state_to_county)
    county_dataset = aggregated.add_static_values(non_numeric_static.reset_index())

    return without_dc_county.append_regions(county_dataset)
# Example 2
 def aggregate(
     self, dataset_in: MultiRegionDataset, reporting_ratio_required_to_aggregate=None
 ) -> MultiRegionDataset:
     """Returns a dataset of CBSA regions, created by aggregating counties in the input data.

     Args:
         dataset_in: Dataset whose county regions are rolled up into CBSAs
             using this aggregator's ``county_to_cbsa_region_map``.
         reporting_ratio_required_to_aggregate: Passed through to
             ``region_aggregation.aggregate_regions``; presumably the minimum
             share of counties that must report before a CBSA value is kept —
             confirm against that function. ``None`` leaves it disabled.

     Returns:
         A ``MultiRegionDataset`` containing the aggregated CBSA regions.
     """
     # Thin wrapper: delegate to aggregate_regions with this instance's
     # pre-configured county->CBSA map and aggregation rules.
     return region_aggregation.aggregate_regions(
         dataset_in,
         self.county_to_cbsa_region_map,
         self.aggregations,
         reporting_ratio_required_to_aggregate=reporting_ratio_required_to_aggregate,
     )
# Example 3
def aggregate_states_to_country(
    input_wide_dates_path=pathlib.Path("data/pre-agg-wide-dates.csv"),
    input_static_path=pathlib.Path("data/pre-agg-static.csv"),
    output_wide_dates_path=pathlib.Path("data/post-agg-wide-dates.csv"),
    output_static_path=pathlib.Path("data/post-agg-static.csv"),
):
    """Aggregate state and territory rows into a country region and write it out.

    Previously the four CSV paths were hard-coded; they are now keyword
    parameters whose defaults preserve the original behavior, so existing
    no-argument callers are unaffected.

    Args:
        input_wide_dates_path: CSV of per-region wide-date timeseries to read.
        input_static_path: CSV of per-region static values to read.
        output_wide_dates_path: Destination CSV for aggregated timeseries.
        output_static_path: Destination CSV for aggregated static values.
    """
    dataset = timeseries.MultiRegionDataset.from_wide_dates_csv(
        input_wide_dates_path).add_static_csv_file(input_static_path)
    # Roll every state/territory up into the country region, subject to the
    # default reporting-ratio threshold.
    dataset = region_aggregation.aggregate_regions(
        dataset,
        pipeline.us_states_and_territories_to_country_map(),
        reporting_ratio_required_to_aggregate=DEFAULT_REPORTING_RATIO,
    )
    dataset.write_to_wide_dates_csv(output_wide_dates_path, output_static_path)
# Example 4
def aggregate_to_new_york_city(
    ds_in: timeseries.MultiRegionDataset, ) -> timeseries.MultiRegionDataset:
    """Return *ds_in* with an aggregated New York City region appended.

    The NYC region is built by summing its borough/county regions.
    """
    nyc_region = pipeline.Region.from_fips(NEW_YORK_CITY_FIPS)
    # Every borough/county region maps to the single aggregated NYC region.
    nyc_map = dict.fromkeys(ALL_NYC_REGIONS, nyc_region)

    # aggregate_regions only copies number columns; grab NYC's non-numeric
    # static columns now and re-attach them to the aggregate afterwards.
    nyc_subset = ds_in.get_regions_subset([nyc_region])
    static_excluding_numbers = nyc_subset.static.select_dtypes(exclude="number")

    aggregated = region_aggregation.aggregate_regions(
        ds_in, nyc_map, reporting_ratio_required_to_aggregate=None)
    nyc_dataset = aggregated.add_static_values(
        static_excluding_numbers.reset_index())

    return ds_in.append_regions(nyc_dataset)
# Example 5
def update(aggregate_to_country: bool, vaccine_estimate: bool,
           state: Optional[str], fips: Optional[str]):
    """Updates latest and timeseries datasets to the current checked out covid data public commit"""
    path_prefix = dataset_utils.DATA_DIRECTORY.relative_to(dataset_utils.REPO_ROOT)

    # Load the per-field source datasets and combine them into a single
    # multi-region dataset.
    timeseries_field_datasets = load_datasets_by_field(
        ALL_TIMESERIES_FEATURE_DEFINITION, state=state, fips=fips)
    static_field_datasets = load_datasets_by_field(
        ALL_FIELDS_FEATURE_DEFINITION, state=state, fips=fips)
    dataset = timeseries.combined_datasets(
        timeseries_field_datasets, static_field_datasets)
    _logger.info("Finished combining datasets")

    # Apply manual overrides (currently only removing timeseries) before
    # aggregation so we don't need to remove CBSAs because they don't exist yet.
    aggregator = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
    region_overrides_config = manual_filter.transform_region_overrides(
        json.load(open(REGION_OVERRIDES_JSON)),
        aggregator.cbsa_to_counties_region_map)
    dataset = manual_filter.run(dataset, region_overrides_config)
    dataset.print_stats("combined")

    # Outlier and stalled-value cleanup.
    dataset = outlier_detection.drop_tail_positivity_outliers(dataset)
    dataset.print_stats("drop_tail")
    # Filter for stalled cumulative values before deriving NEW_CASES from CASES.
    _, dataset = TailFilter.run(dataset, CUMULATIVE_FIELDS_TO_FILTER)
    dataset.print_stats("TailFilter")
    vaccine_fields = [
        CommonFields.VACCINES_DISTRIBUTED,
        CommonFields.VACCINES_ADMINISTERED,
        CommonFields.VACCINATIONS_COMPLETED,
        CommonFields.VACCINATIONS_INITIATED,
    ]
    dataset = zeros_filter.drop_all_zero_timeseries(dataset, vaccine_fields)
    dataset.print_stats("zeros_filter")

    # Vaccination backfills (estimation step is optional).
    dataset = vaccine_backfills.backfill_vaccination_initiated(dataset)
    dataset.print_stats("backfill_vaccination_initiated")
    if vaccine_estimate:
        dataset = vaccine_backfills.estimate_initiated_from_state_ratio(dataset)
        dataset.print_stats("estimate_initiated_from_state_ratio")

    # Derive daily-change series from cumulatives, then drop their outliers.
    dataset = new_cases_and_deaths.add_new_cases(dataset)
    dataset = new_cases_and_deaths.add_new_deaths(dataset)
    dataset = outlier_detection.drop_new_case_outliers(dataset)
    dataset = outlier_detection.drop_new_deaths_outliers(dataset)

    dataset = timeseries.drop_regions_without_population(
        dataset, KNOWN_LOCATION_ID_WITHOUT_POPULATION, structlog.get_logger())
    dataset.print_stats("drop_regions_without_population")

    # Custom aggregations: Puerto Rico from counties, NYC from boroughs,
    # DC county replaced with DC state data.
    dataset = custom_aggregations.aggregate_puerto_rico_from_counties(dataset)
    dataset.print_stats("aggregate_puerto_rico_from_counties")
    dataset = custom_aggregations.aggregate_to_new_york_city(dataset)
    dataset.print_stats("aggregate_to_new_york_city")
    dataset = custom_aggregations.replace_dc_county_with_state_data(dataset)
    dataset.print_stats("replace_dc_county_with_state_data")

    # Append CBSA regions aggregated from counties.
    cbsa_dataset = aggregator.aggregate(
        dataset, reporting_ratio_required_to_aggregate=DEFAULT_REPORTING_RATIO)
    dataset = dataset.append_regions(cbsa_dataset)
    dataset.print_stats("CountyToCBSAAggregator")

    # TODO(tom): Add a clean way to store intermediate values instead of
    #  commenting out code like this:
    # dataset.write_to_wide_dates_csv(
    #     pathlib.Path("data/pre-agg-wide-dates.csv"), pathlib.Path("data/pre-agg-static.csv")
    # )
    if aggregate_to_country:
        country_dataset = region_aggregation.aggregate_regions(
            dataset,
            pipeline.us_states_and_territories_to_country_map(),
            reporting_ratio_required_to_aggregate=DEFAULT_REPORTING_RATIO,
        )
        dataset = dataset.append_regions(country_dataset)
        dataset.print_stats("aggregate_to_country")

    combined_dataset_utils.persist_dataset(dataset, path_prefix)
    dataset.print_stats("persist")