def aggregate_puerto_rico_from_counties( dataset: timeseries.MultiRegionDataset, ) -> timeseries.MultiRegionDataset: """Returns a dataset with NA static values for the state PR aggregated from counties.""" pr_counties = dataset.get_subset(AggregationLevel.COUNTY, state="PR") if pr_counties.location_ids.empty: return dataset aggregated = _aggregate_ignoring_nas( pr_counties.static.select_dtypes(include="number")) pr_location_id = pipeline.Region.from_state("PR").location_id patched_static = dataset.static.copy() for field, aggregated_value in aggregated.items(): if pd.isna(patched_static.at[pr_location_id, field]): patched_static.at[pr_location_id, field] = aggregated_value return dataclasses.replace(dataset, static=patched_static)
def derive_ca_county_vaccine_pct( ds_in: MultiRegionDataset) -> MultiRegionDataset: """Derives vaccination metrics for CA counties based on State 1st vs 2nd dose reporting.""" ca_county_dataset = ds_in.get_subset( aggregation_level=AggregationLevel.COUNTY, state="CA") # Get county level time-series in distribution bucket "all". Keep the bucket in the index so # that the concat at the bottom of this function has the correct labels for each time-series. ca_county_wide = ca_county_dataset.timeseries_bucketed_wide_dates.xs( DemographicBucket.ALL, level=PdFields.DEMOGRAPHIC_BUCKET, drop_level=False) fields_to_check = [ CommonFields.VACCINATIONS_INITIATED, CommonFields.VACCINATIONS_COMPLETED, CommonFields.VACCINATIONS_INITIATED_PCT, CommonFields.VACCINATIONS_COMPLETED_PCT, ] # Assert that possible fields we want to estimate are all NA - if one of these is # not NA, likely do not need to estimate anymore and this methodology can be removed. assert ca_county_wide.loc[(slice(None), fields_to_check), :].isna().all().all() ca_state_wide = ds_in.get_regions_subset([ Region.from_state("CA") ]).timeseries_bucketed_wide_dates.xs(DemographicBucket.ALL, level=PdFields.DEMOGRAPHIC_BUCKET, drop_level=False) # Drop location index because not used to apply to county level data ca_state_wide = ca_state_wide.droplevel(CommonFields.LOCATION_ID) ca_administered = ca_state_wide.loc( axis=0)[CommonFields.VACCINES_ADMINISTERED] initiated_ratio_of_administered = ( ca_state_wide.loc(axis=0)[CommonFields.VACCINATIONS_INITIATED] / ca_administered) completed_ratio_of_administered = ( ca_state_wide.loc(axis=0)[CommonFields.VACCINATIONS_COMPLETED] / ca_administered) county_administered = ca_county_wide.loc( axis=0)[:, CommonFields.VACCINES_ADMINISTERED] estimated_initiated = county_administered * initiated_ratio_of_administered estimated_completed = county_administered * completed_ratio_of_administered vaccines_initiated_pct = (estimated_initiated.div( ca_county_dataset.static.loc[:, CommonFields.POPULATION], level=CommonFields.LOCATION_ID, axis="index", ) * 100) vaccines_initiated_pct = vaccines_initiated_pct.rename( index={ CommonFields.VACCINES_ADMINISTERED: CommonFields.VACCINATIONS_INITIATED_PCT }, level=PdFields.VARIABLE, ) vaccines_completed_pct = (estimated_completed.div( ca_county_dataset.static.loc[:, CommonFields.POPULATION], level=CommonFields.LOCATION_ID, axis="index", ) * 100) vaccines_completed_pct = vaccines_completed_pct.rename( index={ CommonFields.VACCINES_ADMINISTERED: CommonFields.VACCINATIONS_COMPLETED_PCT }, level=PdFields.VARIABLE, ) all_wide = ds_in.timeseries_bucketed_wide_dates # Because we assert that existing dataset does not have CA county VACCINATIONS_COMPLETED_PCT # or VACCINATIONS_INITIATED_PCT we can safely combine the existing rows with new derived rows return ds_in.replace_timeseries_wide_dates( [vaccines_completed_pct, vaccines_initiated_pct, all_wide])