def drop_observations(
    dataset: timeseries.MultiRegionDataset, filter_: Filter
) -> timeseries.MultiRegionDataset:
    """Drops observations according to `filter_` from every region in `dataset`."""
    assert filter_.drop_observations

    ts_selected_fields, ts_not_selected_fields = _partition_by_fields(
        dataset.timeseries_bucketed_wide_dates, filter_.fields_included
    )

    if filter_.start_date:
        ts_filtered, ts_no_real_values_to_drop = _filter_by_date(
            ts_selected_fields, drop_start_date=filter_.start_date
        )
        return dataset.replace_timeseries_wide_dates(
            [ts_not_selected_fields, ts_filtered, ts_no_real_values_to_drop]
        ).add_tag_to_subset(filter_.tag, ts_filtered.index)
    else:
        # When start_date is None all of ts_selected_fields is dropped; only the not
        # selected fields are kept.
        selected_index = ts_selected_fields.index
        return (
            dataset.replace_timeseries_wide_dates([ts_not_selected_fields])
            .remove_tags_from_subset(selected_index)
            .add_tag_to_subset(filter_.tag, selected_index)
        )
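# Illustrative pandas sketch of the date-based split that the `_filter_by_date`
# helper above appears to perform: observations on or after `drop_start_date` are
# removed from a wide-dates frame. The exact helper semantics (including the
# separate bucket for rows with no real values to drop) are assumptions here.
def _sketch_drop_from_start_date():
    import pandas as pd

    dates = pd.date_range("2020-05-01", periods=4, name="date")
    ts = pd.DataFrame([[1.0, 2.0, 3.0, 4.0]], index=["loc1"], columns=dates)
    drop_start_date = pd.Timestamp("2020-05-03")
    # Keep only columns strictly before the drop start date.
    return ts.loc[:, ts.columns < drop_start_date]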
def replace_dc_county_with_state_data(
    dataset_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Replace DC County data with data from State.

    Args:
        dataset_in: Input dataset.

    Returns:
        Dataset with DC county data replaced to match DC state.
    """
    dc_state_region = pipeline.Region.from_fips(DC_STATE_FIPS)
    dc_county_region = pipeline.Region.from_fips(DC_COUNTY_FIPS)
    dc_map = {dc_state_region: dc_county_region}

    # aggregate_regions only copies number columns. Extract them and re-add to the
    # aggregated dataset.
    static_excluding_numbers = dataset_in.get_regions_subset(
        [dc_county_region]
    ).static.select_dtypes(exclude="number")
    dc_county_dataset = timeseries.aggregate_regions(dataset_in, dc_map).add_static_values(
        static_excluding_numbers.reset_index()
    )
    dataset_without_dc_county = dataset_in.remove_regions([dc_county_region])
    return dataset_without_dc_county.append_regions(dc_county_dataset)
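# Small pandas demo of the select_dtypes(exclude="number") step above: it keeps
# only non-numeric static columns (e.g. a name column), which aggregate_regions
# would otherwise drop. Column names and the location_id string are illustrative.
def _sketch_static_excluding_numbers():
    import pandas as pd

    static = pd.DataFrame(
        {"county": ["District of Columbia"], "population": [705_749]},
        index=pd.Index(["iso1:us#fips:11001"], name="location_id"),
    )
    # Returns a frame with just the "county" column; "population" is numeric.
    return static.select_dtypes(exclude="number")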
def persist_dataset(
    dataset: timeseries.MultiRegionDataset,
    data_directory: pathlib.Path,
    data_public_path: pathlib.Path = dataset_utils.LOCAL_PUBLIC_DATA_PATH,
) -> DatasetPointer:
    """Saves dataset and associated pointer in the same data directory.

    Args:
        dataset: Dataset to persist.
        data_directory: Directory to save the dataset and pointer in.
        data_public_path: Path to the covid data public folder.

    Returns:
        DatasetPointer describing the persisted dataset.
    """
    model_git_info = GitSummary.from_repo_path(dataset_utils.REPO_ROOT)
    data_git_info = GitSummary.from_repo_path(data_public_path)

    dataset_type = dataset.dataset_type
    dataset_path = data_directory / f"{dataset_type.value}.csv"
    dataset_pointer = DatasetPointer(
        dataset_type=dataset_type,
        path=dataset_path,
        data_git_info=data_git_info,
        model_git_info=model_git_info,
        updated_at=datetime.datetime.utcnow(),
    )
    dataset.write_to_dataset_pointer(dataset_pointer)
    dataset_pointer.save(data_directory)
    return dataset_pointer
def from_region_and_model_output(
    region: pipeline.Region,
    combined_data_with_test_positivity: MultiRegionDataset,
    rt_data: MultiRegionDataset,
    icu_data: MultiRegionDataset,
) -> "RegionalInput":
    one_region_data = combined_data_with_test_positivity.get_one_region(region)

    # Not all regions have Rt or ICU data due to various filters in pyseir code.
    try:
        rt_data = rt_data.get_one_region(region)
    except timeseries.RegionLatestNotFound:
        rt_data = None
    try:
        icu_data = icu_data.get_one_region(region)
    except timeseries.RegionLatestNotFound:
        icu_data = None

    return RegionalInput(
        region=region,
        _combined_data_with_test_positivity=one_region_data,
        rt_data=rt_data,
        icu_data=icu_data,
    )
def read(output_dir: pathlib.Path) -> "PyseirOutputDatasets":
    icu_data_path = output_dir / SummaryArtifact.ICU_METRIC_COMBINED.value
    icu_data = MultiRegionDataset.from_csv(icu_data_path)

    rt_data_path = output_dir / SummaryArtifact.RT_METRIC_COMBINED.value
    rt_data = MultiRegionDataset.from_csv(rt_data_path)

    return PyseirOutputDatasets(icu=icu_data, infection_rate=rt_data)
def from_pipeline_output(pipelines: List[OneRegionPipeline]) -> "PyseirOutputDatasets":
    infection_rate_metric_df = pd.concat((p.infer_df for p in pipelines), ignore_index=True)
    infection_rate_ds = MultiRegionDataset.from_geodata_timeseries_df(infection_rate_metric_df)

    icu_df = pd.concat((p.icu_data.data for p in pipelines if p.icu_data), ignore_index=True)
    icu_ds = MultiRegionDataset.from_geodata_timeseries_df(icu_df)

    return PyseirOutputDatasets(icu=icu_ds, infection_rate=infection_rate_ds)
def calculate(
    self, dataset: MultiRegionDataset, diff_days: int, most_recent_date: pd.Timestamp
) -> MethodOutput:
    positivity_time_series = {}
    # To replicate the behavior of the old code, a region is considered to have recent
    # positivity when the input timeseries (POSITIVE_TESTS and NEGATIVE_TESTS) are recent.
    # The other subclasses of `Method` filter based on the most recent real value in the
    # output timeseries.
    recent_regions = []
    for region, regional_data in dataset.iter_one_regions():
        data = regional_data.date_indexed
        positive_negative_recent = self._has_recent_data(
            data[CommonFields.POSITIVE_TESTS]
        ) and self._has_recent_data(data[CommonFields.NEGATIVE_TESTS])
        series = calculate_test_positivity(regional_data)
        if not series.empty:
            positivity_time_series[region.location_id] = series
            if positive_negative_recent:
                recent_regions.append(region)

    # Convert dict[location_id, Series] to rows with key as index and value as the row data.
    # See https://stackoverflow.com/a/21005134
    wide_date_df = pd.concat(positivity_time_series, axis=1).T.rename_axis(
        index=CommonFields.LOCATION_ID, columns=CommonFields.DATE
    )

    # Make a dataset with TEST_POSITIVITY for every region where the calculation finished.
    all_output = _make_output_dataset(
        dataset, self.columns, wide_date_df, CommonFields.TEST_POSITIVITY,
    )
    # Make a dataset with the subset of regions having recent input timeseries.
    ds_recent = all_output.get_regions_subset(recent_regions)
    return MethodOutput(all_output=all_output, recent=ds_recent)
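# Pandas sketch of the dict[location_id, Series] -> wide-rows conversion above
# (see https://stackoverflow.com/a/21005134): concat makes each Series a column,
# and .T turns the dict keys into the row index. Values are illustrative.
def _sketch_dict_of_series_to_wide_rows():
    import pandas as pd

    positivity_time_series = {
        "loc1": pd.Series({"2020-05-01": 0.10, "2020-05-02": 0.12}),
        "loc2": pd.Series({"2020-05-01": 0.20}),
    }
    return pd.concat(positivity_time_series, axis=1).T.rename_axis(
        index="location_id", columns="date"
    )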
def test_aggregate():
    df_in = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,m1,date,foo\n"
        "55005,ZZ,county,North County,1,2020-05-01,11\n"
        "55005,ZZ,county,North County,2,2020-05-02,22\n"
        "55005,ZZ,county,North County,3,2020-05-03,33\n"
        "55005,ZZ,county,North County,0,2020-05-04,0\n"
        "55006,ZZ,county,South County,0,2020-05-01,0\n"
        "55006,ZZ,county,South County,0,2020-05-02,0\n"
        "55006,ZZ,county,South County,3,2020-05-03,44\n"
        "55006,ZZ,county,South County,4,2020-05-04,55\n"
        "55,ZZ,state,Grand State,41,2020-05-01,66\n"
        "55,ZZ,state,Grand State,43,2020-05-03,77\n"
    ).reset_index()
    ts_in = MultiRegionDataset.from_fips_timeseries_df(df_in)

    agg = statistical_areas.CountyToCBSAAggregator(
        county_map={"55005": "10001", "55006": "10001"},
        cbsa_title_map={"10001": "Stat Area 1"},
        aggregations=[],
    )
    ts_out = agg.aggregate(ts_in)

    assert ts_out.groupby_region().ngroups == 1

    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10001"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-01"): 1,
        pd.to_datetime("2020-05-02"): 2,
        pd.to_datetime("2020-05-03"): 6,
        pd.to_datetime("2020-05-04"): 4,
    }
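# A minimal sketch of the sum the test above expects, assuming the aggregator
# does a plain unweighted sum per CBSA and date when `aggregations=[]`.
def _sketch_cbsa_sum():
    import pandas as pd

    df = pd.DataFrame(
        {
            "fips": ["55005", "55006", "55005", "55006"],
            "date": ["2020-05-01", "2020-05-01", "2020-05-03", "2020-05-03"],
            "m1": [1, 0, 3, 3],
        }
    )
    county_map = {"55005": "10001", "55006": "10001"}
    df["cbsa"] = df["fips"].map(county_map)
    # 2020-05-01 -> 1 and 2020-05-03 -> 6, matching the assertions above.
    return df.groupby(["cbsa", "date"])["m1"].sum()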
def run(
    dataset: timeseries.MultiRegionDataset, fields: List[FieldName]
) -> Tuple["TailFilter", timeseries.MultiRegionDataset]:
    """Returns a dataset with recent data that looks bad removed from cumulative fields."""
    timeseries_wide_dates = dataset.timeseries_wide_dates()

    fields_mask = timeseries_wide_dates.index.get_level_values(PdFields.VARIABLE).isin(fields)
    to_filter = timeseries_wide_dates.loc[pd.IndexSlice[:, fields_mask], :]
    not_filtered = timeseries_wide_dates.loc[pd.IndexSlice[:, ~fields_mask], :]

    tail_filter = TailFilter()
    filtered = to_filter.apply(tail_filter._filter_one_series, axis=1)

    merged = pd.concat([not_filtered, filtered])
    timeseries_wide_variables = merged.stack().unstack(PdFields.VARIABLE).sort_index()
    # TODO(tom): Find a generic way to return the counts in tail_filter and stop returning
    # the object itself.
    return (
        tail_filter,
        dataclasses.replace(dataset, timeseries=timeseries_wide_variables).append_tag_df(
            tail_filter._annotations.as_dataframe()
        ),
    )
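# Pure-pandas sketch of the reshape at the end of run(): rows keyed by
# (location_id, variable) with date columns become rows keyed by
# (location_id, date) with one column per variable. Names are illustrative.
def _sketch_wide_dates_to_wide_variables():
    import pandas as pd

    index = pd.MultiIndex.from_tuples(
        [("loc1", "cases"), ("loc1", "deaths")], names=["location_id", "variable"]
    )
    columns = pd.Index(["2020-05-01", "2020-05-02"], name="date")
    wide_dates = pd.DataFrame([[1.0, 2.0], [0.0, 1.0]], index=index, columns=columns)
    return wide_dates.stack().unstack("variable").sort_index()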
def aggregate_to_new_york_city(
    ds_in: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    nyc_region = pipeline.Region.from_fips(NEW_YORK_CITY_FIPS)
    # Map from borough / county to the region used for aggregated NYC.
    nyc_map = {borough_region: nyc_region for borough_region in ALL_NYC_REGIONS}

    # aggregate_regions only copies number columns. Extract them and re-add to the
    # aggregated dataset.
    static_excluding_numbers = ds_in.get_regions_subset([nyc_region]).static.select_dtypes(
        exclude="number"
    )
    nyc_dataset = timeseries.aggregate_regions(
        ds_in, nyc_map, reporting_ratio_required_to_aggregate=None
    ).add_static_values(static_excluding_numbers.reset_index())

    return ds_in.append_regions(nyc_dataset)
def load_us_timeseries_dataset(
    pointer_directory: pathlib.Path = dataset_utils.DATA_DIRECTORY,
) -> MultiRegionDataset:
    """Returns all combined data. `load_test_dataset` is more suitable for tests."""
    filename = dataset_pointer.form_filename(DatasetType.MULTI_REGION)
    pointer_path = pointer_directory / filename
    pointer = DatasetPointer.parse_raw(pointer_path.read_text())
    return MultiRegionDataset.read_from_pointer(pointer)
def run(
    dataset_in: MultiRegionDataset,
    methods: Sequence[Method] = TEST_POSITIVITY_METHODS,
    *,
    diff_days: int = 7,
) -> "AllMethods":
    """Runs `methods` on `dataset_in` and returns the results or raises a
    TestPositivityException."""
    relevant_columns = AllMethods._list_columns(
        AllMethods._methods_with_columns_available(methods, dataset_in.timeseries.columns)
    )
    if not relevant_columns:
        raise NoMethodsWithRelevantColumns()

    input_wide = dataset_in.timeseries_wide_dates()
    if input_wide.empty:
        raise NoRealTimeseriesValuesException()
    dates = input_wide.columns.get_level_values(CommonFields.DATE)
    most_recent_date = dates.max()

    methods_with_data = AllMethods._methods_with_columns_available(
        methods, input_wide.index.get_level_values(PdFields.VARIABLE).unique()
    )
    if not methods_with_data:
        raise NoColumnsWithDataException()

    method_map = {method.name: method for method in methods_with_data}
    calculated_dataset_map = {
        method_name: method.calculate(dataset_in, diff_days, most_recent_date)
        for method_name, method in method_map.items()
    }
    calculated_dataset_recent_map = {
        name: method_output.recent for name, method_output in calculated_dataset_map.items()
    }
    calculated_dataset_all_map = {
        name: method_output.all_output
        for name, method_output in calculated_dataset_map.items()
    }
    # HACK: If SmoothedTests is in calculated_dataset_map (that is, the MethodOutput
    # returned by `calculate`) then add it again at the end of the map with the Method's
    # all_output. Remember that dict entries remain in the order inserted. This makes
    # SmoothedTests the final fallback for a location if no other Method has a timeseries
    # for it.
    old_method_output: Optional[MethodOutput] = calculated_dataset_map.get(
        timeseries.DatasetName("SmoothedTests")
    )
    if old_method_output:
        calculated_dataset_recent_map[
            timeseries.DatasetName("SmoothedTestsAll")
        ] = old_method_output.all_output

    # Make a dataset object with one metric, containing for each region the timeseries
    # from the highest priority dataset that has recent data.
    test_positivity = timeseries.combined_datasets(
        {CommonFields.TEST_POSITIVITY: list(calculated_dataset_recent_map.values())}, {}
    )

    return AllMethods(
        all_methods_datasets=calculated_dataset_all_map, test_positivity=test_positivity
    )
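# Plain-Python illustration of the HACK above: dicts preserve insertion order, so
# re-inserting the SmoothedTests output under a new key puts it last, making it
# the lowest-priority fallback when the maps' values are consumed in order.
def _sketch_fallback_ordering():
    recent_map = {"MethodA": "ds_a", "SmoothedTests": "ds_smoothed_recent"}
    recent_map["SmoothedTestsAll"] = "ds_smoothed_all"
    # ['MethodA', 'SmoothedTests', 'SmoothedTestsAll']
    return list(recent_map)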
def make_dataset(cls) -> timeseries.MultiRegionDataset:
    """Default implementation of make_dataset that loads timeseries data from a CSV."""
    assert cls.COMMON_DF_CSV_PATH, f"No path in {cls}"
    data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
    input_path = data_root / cls.COMMON_DF_CSV_PATH
    data = common_df.read_csv(input_path, set_index=False)
    data = cls._check_data(data)
    return MultiRegionDataset.from_fips_timeseries_df(data).add_provenance_all(cls.SOURCE_NAME)
def _latest_sorted_by_location_date(
    ts: timeseries.MultiRegionDataset, drop_na: bool
) -> pd.DataFrame:
    """Returns the latest data, sorted by LOCATION_ID."""
    df = ts.static_and_timeseries_latest_with_fips().sort_values(
        [CommonFields.LOCATION_ID], ignore_index=True
    )
    if drop_na:
        df = df.dropna(axis="columns", how="all")
    return df
def run_and_maybe_join_columns(
    mrts: timeseries.MultiRegionDataset, log
) -> timeseries.MultiRegionDataset:
    """Calculates test positivity and joins it with the input, if successful."""
    try:
        test_positivity_results = AllMethods.run(mrts)
    except TestPositivityException:
        log.exception("test_positivity failed")
        return mrts
    return mrts.join_columns(test_positivity_results.test_positivity)
def _fips_csv_to_one_region(
    csv_str: str, region: Region, latest=None
) -> OneRegionTimeseriesDataset:
    df = read_csv_and_index_fips_date(csv_str).reset_index()
    # from_fips_timeseries_df adds the location_id column needed by get_one_region.
    dataset = MultiRegionDataset.from_fips_timeseries_df(df).get_one_region(region)
    if latest:
        return dataclasses.replace(dataset, latest=latest)
    else:
        return dataset
def _remove_stale_regions(
    self, all_output: MultiRegionDataset, most_recent_date: pd.Timestamp
) -> MethodOutput:
    """Returns a MethodOutput containing `all_output` unchanged plus the subset of its
    regions that have recent data."""
    assert self.recent_days >= 1
    # The oldest date that is considered recent/not-stale. If recent_days is 1 then this
    # is the most recent day in the input.
    recent_date_cutoff = most_recent_date + pd.to_timedelta(1 - self.recent_days, "D")
    return MethodOutput(
        all_output=all_output, recent=all_output.drop_stale_timeseries(recent_date_cutoff)
    )
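# Quick check of the cutoff arithmetic above: with recent_days=1 the cutoff is the
# most recent date itself; with recent_days=14 it is 13 days earlier.
def _sketch_recent_date_cutoff():
    import pandas as pd

    most_recent_date = pd.Timestamp("2020-12-01")
    # [(1, Timestamp('2020-12-01')), (14, Timestamp('2020-11-18'))]
    return [
        (recent_days, most_recent_date + pd.to_timedelta(1 - recent_days, "D"))
        for recent_days in (1, 14)
    ]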
def test_persist_and_load_dataset(tmp_path, nyc_fips):
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = dataset.get_regions_subset([region])

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)

    downloaded_dataset = MultiRegionDataset.read_from_pointer(pointer)
    differ_l = DatasetDiff.make(downloaded_dataset.timeseries)
    differ_r = DatasetDiff.make(timeseries_nyc.timeseries)
    differ_l.compare(differ_r)
    assert not len(differ_l.my_ts)
def calculate(
    self, dataset: MultiRegionDataset, diff_days: int, most_recent_date: pd.Timestamp
) -> MethodOutput:
    numerator_delta = dataset.get_timeseries_not_bucketed_wide_dates(self._numerator).diff(
        periods=diff_days, axis=1
    )
    assert numerator_delta.index.names == [CommonFields.LOCATION_ID]
    assert numerator_delta.columns.names == [CommonFields.DATE]
    denominator_delta = dataset.get_timeseries_not_bucketed_wide_dates(self._denominator).diff(
        periods=diff_days, axis=1
    )
    assert denominator_delta.index.names == [CommonFields.LOCATION_ID]
    assert denominator_delta.columns.names == [CommonFields.DATE]
    # `/` is calculated for each region/state and date.
    wide_date_df = numerator_delta / denominator_delta
    all_output = _make_output_dataset(
        dataset, self.columns, wide_date_df, CommonFields.TEST_POSITIVITY,
    )
    return self._remove_stale_regions(all_output, most_recent_date)
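# Pandas sketch of the delta/ratio computation above: a `diff_days` diff along the
# date axis of cumulative counts, then elementwise division. The first `diff_days`
# columns come out NaN. Values are illustrative.
def _sketch_positivity_from_cumulative():
    import pandas as pd

    dates = pd.date_range("2020-06-01", periods=9, name="date")
    positives = pd.DataFrame([list(range(0, 9))], index=["loc1"], columns=dates)
    totals = pd.DataFrame([list(range(0, 90, 10))], index=["loc1"], columns=dates)
    # 7/70 = 0.1 wherever both diffs are defined.
    return positives.diff(periods=7, axis=1) / totals.diff(periods=7, axis=1)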
def calculate(
    self, dataset: MultiRegionDataset, diff_days: int, most_recent_date: pd.Timestamp
) -> MethodOutput:
    wide_date_df = dataset.get_timeseries_not_bucketed_wide_dates(self._column)
    assert wide_date_df.index.names == [CommonFields.LOCATION_ID]
    assert wide_date_df.columns.names == [CommonFields.DATE]
    # Optional optimization: The following likely adds the variable/field/column name back
    # in to the index which was just taken out. Consider skipping reindexing.
    all_output = _make_output_dataset(
        dataset, self.columns, wide_date_df, CommonFields.TEST_POSITIVITY
    )
    return self._remove_stale_regions(all_output, most_recent_date)
def make_dataset(cls) -> timeseries.MultiRegionDataset:
    """Default implementation of make_dataset that loads data from the parquet file."""
    ccd_dataset = cls._get_covid_county_dataset()
    rows, source_df = ccd_dataset.query_multiple_variables(  # pylint: disable=E1101
        cls.VARIABLES, log_provider_coverage_warnings=True, source_type=cls.SOURCE_TYPE,
    )
    data = rows.unstack(PdFields.VARIABLE)
    data = cls._check_and_removed_unexpected_data(data)
    ds = MultiRegionDataset(timeseries_bucketed=data)
    if not source_df.empty:
        # For each LOCATION_ID-VARIABLE pair keep the source_url row with the last DATE.
        source_tag_df = (
            source_df.sort_values(CommonFields.DATE)
            .groupby([CommonFields.LOCATION_ID, PdFields.VARIABLE], sort=False)
            .last()
            .reset_index()
            .drop(columns=[CommonFields.DATE])
            # copy before calling tag_df_add_all_bucket_in_place, just to be safe.
            .copy()
        )
        timeseries.tag_df_add_all_bucket_in_place(source_tag_df)
        ds = ds.append_tag_df(source_tag_df)
    return ds
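# Pandas sketch of the keep-latest step above: after sorting by date,
# groupby(...).last() keeps one row per key, taking values from the last (most
# recent) row of each group. Column names and URLs are illustrative.
def _sketch_last_source_url():
    import pandas as pd

    source_df = pd.DataFrame(
        {
            "location_id": ["loc1", "loc1"],
            "variable": ["cases", "cases"],
            "date": ["2020-05-01", "2020-05-02"],
            "source_url": ["http://a", "http://b"],
        }
    )
    # One row per (location_id, variable) with source_url "http://b".
    return (
        source_df.sort_values("date")
        .groupby(["location_id", "variable"], sort=False)
        .last()
        .reset_index()
        .drop(columns=["date"])
    )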
def calculate(
    self, dataset: MultiRegionDataset, diff_days: int, most_recent_date: pd.Timestamp
) -> MethodOutput:
    df = dataset.timeseries_wide_dates().reorder_levels(
        [PdFields.VARIABLE, CommonFields.LOCATION_ID]
    )
    assert df.columns.names == [CommonFields.DATE]
    assert df.index.names == [PdFields.VARIABLE, CommonFields.LOCATION_ID]
    # df has the field name as the first level of the index. df.loc[field, :] returns a
    # DataFrame without the field label.
    wide_date_df = df.loc[self._column, :]
    # Optional optimization: The following likely adds the variable/field/column name back
    # in to the index which was just taken out. Consider skipping reindexing.
    all_output = _make_output_dataset(
        dataset, self.columns, wide_date_df, CommonFields.TEST_POSITIVITY
    )
    return self._remove_stale_regions(all_output, most_recent_date)
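# Small pandas demo of the .loc behavior relied on above: selecting one label on
# the first index level returns a frame indexed by the remaining level only.
def _sketch_loc_drops_index_level():
    import pandas as pd

    index = pd.MultiIndex.from_product(
        [["m1", "m2"], ["loc1", "loc2"]], names=["variable", "location_id"]
    )
    df = pd.DataFrame({"2020-05-01": [1, 2, 3, 4]}, index=index)
    return df.loc["m1", :]  # indexed by location_id only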
def aggregate_puerto_rico_from_counties(
    dataset: timeseries.MultiRegionDataset,
) -> timeseries.MultiRegionDataset:
    """Returns a dataset where NA static values of the state PR are filled with values
    aggregated from its counties."""
    pr_counties = dataset.get_subset(AggregationLevel.COUNTY, state="PR")
    if pr_counties.location_ids.empty:
        return dataset
    aggregated = _aggregate_ignoring_nas(pr_counties.static.select_dtypes(include="number"))
    pr_location_id = pipeline.Region.from_state("PR").location_id

    patched_static = dataset.static.copy()
    for field, aggregated_value in aggregated.items():
        if pd.isna(patched_static.at[pr_location_id, field]):
            patched_static.at[pr_location_id, field] = aggregated_value

    return dataclasses.replace(dataset, static=patched_static)
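# Minimal pandas sketch of the NA-patching loop above: only cells that are
# currently NA get the aggregated value; existing state values win. The
# location_id string, field names, and numbers are illustrative.
def _sketch_patch_na_static():
    import numpy as np
    import pandas as pd

    static = pd.DataFrame(
        {"population": [np.nan], "icu_beds": [100.0]},
        index=pd.Index(["iso1:us#iso2:us-pr"], name="location_id"),
    )
    aggregated = pd.Series({"population": 3_193_694.0, "icu_beds": 50.0})
    for field, aggregated_value in aggregated.items():
        if pd.isna(static.at["iso1:us#iso2:us-pr", field]):
            static.at["iso1:us#iso2:us-pr", field] = aggregated_value
    return static  # population filled; icu_beds keeps its existing 100.0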
def test_load_from_local_public_data():
    agg = statistical_areas.CountyToCBSAAggregator.from_local_public_data()
    agg = dataclasses.replace(agg, aggregations=[])  # Disable scaled aggregations

    assert agg.cbsa_title_map["43580"] == "Sioux City, IA-NE-SD"
    assert agg.county_map["48187"] == "41700"

    df_in = read_csv_and_index_fips_date(
        "fips,state,aggregate_level,county,m1,date,foo\n"
        "48059,ZZ,county,North County,3,2020-05-03,33\n"
        "48253,ZZ,county,South County,4,2020-05-03,77\n"
        "48441,ZZ,county,Other County,2,2020-05-03,41\n"
    ).reset_index()
    ts_in = MultiRegionDataset.from_fips_timeseries_df(df_in)
    ts_out = agg.aggregate(ts_in)

    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10180"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-03"): 9,
    }
def make_dataset(cls) -> timeseries.MultiRegionDataset:
    """Default implementation of make_dataset that loads data from the parquet file."""
    assert cls.VARIABLES
    ccd_dataset = CanScraperBase._get_covid_county_dataset()
    data, source_urls_df = ccd_dataset.query_multiple_variables(
        cls.VARIABLES, log_provider_coverage_warnings=True
    )
    data = cls.transform_data(data)
    data = cls._check_data(data)
    ds = MultiRegionDataset.from_fips_timeseries_df(data).add_provenance_all(cls.SOURCE_NAME)
    if not source_urls_df.empty:
        # For each FIPS-VARIABLE pair keep the source_url row with the last DATE.
        source_urls_df = (
            source_urls_df.sort_values(CommonFields.DATE)
            .groupby([CommonFields.FIPS, PdFields.VARIABLE], sort=False)
            .last()
            .reset_index()
            .drop(columns=[CommonFields.DATE])
        )
        source_urls_df[taglib.TagField.TYPE] = taglib.TagType.SOURCE_URL
        ds = ds.append_fips_tag_df(source_urls_df)
    return ds
def _make_output_dataset(
    dataset_in: MultiRegionDataset,
    source_columns: Collection[FieldName],
    wide_date_df: pd.DataFrame,
    output_metric: FieldName,
) -> MultiRegionDataset:
    """Returns a dataset with `wide_date_df` in a column named `output_metric` and
    provenance information copied from `dataset_in`.

    Args:
        dataset_in: original MultiRegionDataset.
        source_columns: columns that were a source for the output. Tags are copied from
            these to the output.
        wide_date_df: the timeseries output, copied to the returned dataset.
        output_metric: wide_date_df is copied to this metric/column in the returned dataset.
    """
    assert wide_date_df.index.names == [CommonFields.LOCATION_ID]
    # Drop all-NA timeseries now, as done in from_timeseries_wide_dates_df. This makes sure
    # `locations` is used to build provenance information for only timeseries in the
    # returned MultiRegionDataset.
    wide_date_df = wide_date_df.dropna(axis="rows", how="all")
    locations = wide_date_df.index.get_level_values(CommonFields.LOCATION_ID)
    _append_variable_index_level(wide_date_df, output_metric)

    assert dataset_in.tag.index.names == [
        TagField.LOCATION_ID,
        TagField.VARIABLE,
        TagField.TYPE,
    ]
    dataset_out = MultiRegionDataset.from_timeseries_wide_dates_df(wide_date_df)
    if source_columns:
        source_tags = dataset_in.tag.loc[locations, list(source_columns)].reset_index()
        source_tags[TagField.VARIABLE] = output_metric
        # When there are two source_columns they usually contain the same provenance
        # content. Only keep one copy of it.
        output_tags = source_tags.drop_duplicates(ignore_index=True)
        dataset_out = dataset_out.append_tag_df(output_tags)
    return dataset_out
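# Pandas sketch of the all-NA row drop above, which keeps `locations` aligned with
# the timeseries that actually survive into the output dataset.
def _sketch_drop_all_na_rows():
    import numpy as np
    import pandas as pd

    wide = pd.DataFrame(
        {"2020-05-01": [1.0, np.nan], "2020-05-02": [2.0, np.nan]},
        index=pd.Index(["loc_a", "loc_b"], name="location_id"),
    )
    wide = wide.dropna(axis="rows", how="all")
    return wide.index.tolist()  # ['loc_a']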
def run(
    dataset: timeseries.MultiRegionDataset, config: Config = CONFIG
) -> timeseries.MultiRegionDataset:
    for filter_ in config.filters:
        filtered_dataset, passed_dataset = dataset.partition_by_region(
            filter_.regions_included, exclude=filter_.regions_excluded
        )
        if filtered_dataset.location_ids.empty:
            # TODO(tom): Find a cleaner way to refer to a filter in logs.
            _logger.info("No locations matched", regions=str(filter_.regions_included))
            continue
        if filter_.drop_observations:
            filtered_dataset = drop_observations(filtered_dataset, filter_)
        else:
            ts_selected_fields, _ = _partition_by_fields(
                filtered_dataset.timeseries_bucketed_wide_dates, filter_.fields_included
            )
            filtered_dataset = filtered_dataset.add_tag_to_subset(
                filter_.tag, ts_selected_fields.index
            )

        dataset = filtered_dataset.append_regions(passed_dataset)

    return dataset
def calculate(
    self, dataset: MultiRegionDataset, diff_days: int, most_recent_date: pd.Timestamp
) -> MethodOutput:
    delta_df = (
        dataset.timeseries_wide_dates()
        .reorder_levels([PdFields.VARIABLE, CommonFields.LOCATION_ID])
        .diff(periods=diff_days, axis=1)
    )
    assert delta_df.columns.names == [CommonFields.DATE]
    assert delta_df.index.names == [PdFields.VARIABLE, CommonFields.LOCATION_ID]
    # delta_df has the field name as the first level of the index. delta_df.loc[field, :]
    # returns a DataFrame without the field label so operators such as `/` are calculated
    # for each region/state and date.
    wide_date_df = delta_df.loc[self._numerator, :] / delta_df.loc[self._denominator, :]
    all_output = _make_output_dataset(
        dataset, self.columns, wide_date_df, CommonFields.TEST_POSITIVITY,
    )
    return self._remove_stale_regions(all_output, most_recent_date)
def drop_all_zero_timeseries(
    ds_in: timeseries.MultiRegionDataset, fields: Collection[CommonFields]
) -> timeseries.MultiRegionDataset:
    """Returns a dataset with `fields` timeseries dropped if they contain only NA and 0.

    When first built this dropped a timeseries in Loving County, TX which has so few people
    that the all-zero timeseries is likely accurate. It may be worth only applying this to
    locations with a population over some threshold. Or perhaps an automatic filter isn't
    worth the trouble after all :-(
    """
    ts_wide = ds_in.timeseries_wide_dates()
    # Separate into timeseries in `fields` and all others.
    variable_mask = ts_wide.index.get_level_values(PdFields.VARIABLE).isin(fields)
    ts_wide_other_variables = ts_wide.loc[~variable_mask]
    ts_wide_variables = ts_wide.loc[variable_mask]
    # Keep rows/timeseries that have at least one value that is not 0 or NA.
    to_keep_mask = ts_wide_variables.replace(pd.NA, 0).any(axis=1)
    to_drop = ts_wide_variables.loc[~to_keep_mask].index
    if not to_drop.empty:
        # Maybe add filtering to not log about the known bad data in OH counties and Loving
        # County Texas using a RegionMask(level=County, state=OH) and some kind of
        # RegionMask representing counties with a small population.
        _log.info(DROPPING_TIMESERIES_WITH_ONLY_ZEROS, dropped=to_drop)
    ts_wide_kept = ts_wide_variables.loc[to_keep_mask]

    ts_wide_out = pd.concat([ts_wide_kept, ts_wide_other_variables])
    # Make a new dataset without the dropped timeseries. This does not drop the tags of the
    # dropped timeseries but keeping the provenance tags doesn't seem to be a problem.
    # Maybe it'd be cleaner to add a method 'MultiRegionDataset.drop_timeseries' similar to
    # 'remove_regions' or move this into 'MultiRegionDataset' similar to
    # 'drop_stale_timeseries'.
    return dataclasses.replace(
        ds_in, timeseries=ts_wide_out.stack().unstack(PdFields.VARIABLE).sort_index()
    )
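# Illustrative sketch of the keep-mask above: a row survives when at least one
# value is neither NA nor 0, since replacing NA with 0 makes .any(axis=1) treat
# both the same way. Row labels and values are illustrative.
def _sketch_all_zero_mask():
    import pandas as pd

    ts_wide = pd.DataFrame(
        {"2020-05-01": [0, 0, 1], "2020-05-02": [pd.NA, 0, pd.NA]},
        index=["na_and_zero", "all_zero", "has_value"],
    )
    # True only for the "has_value" row.
    return ts_wide.replace(pd.NA, 0).any(axis=1)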