def test_make_latest_from_timeseries_simple():
    data = read_csv_and_index_fips_date(
        "fips,county,state,country,date,aggregate_level,m1,m2\n"
        "97123,Smith County,ZZ,USA,2020-04-01,county,1,\n"
        "97123,Smith County,ZZ,USA,2020-04-02,county,,2\n"
    ).reset_index()
    ts = TimeseriesDataset(data)
    assert to_dict(["fips"], ts.latest_values()[["fips", "m1", "m2"]]) == {
        "97123": {"m1": 1, "m2": 2}
    }

def test_get_subset():
    # CSV with a unique FIPS value for every region, even countries. In production
    # countries are removed before TimeseriesDataset is created. A future change may
    # replace FIPS with a more general identifier.
    input_df = pd.read_csv(
        StringIO(
            "city,county,state,fips,country,aggregate_level,date,metric\n"
            "Smithville,,ZZ,97123,USA,city,2020-03-23,smithville-march23\n"
            "New York City,,ZZ,97324,USA,city,2020-03-22,march22-nyc\n"
            "New York City,,ZZ,97324,USA,city,2020-03-24,march24-nyc\n"
            ",North County,ZZ,97001,USA,county,2020-03-23,county-metric\n"
            ",,ZZ,97,USA,state,2020-03-23,mystate\n"
            ",,XY,96,USA,state,2020-03-23,other-state\n"
            ",,,iso2:uk,UK,country,2020-03-23,you-kee\n"
            ",,,iso2:us,US,country,2020-03-23,you-ess-hey\n"
        )
    )
    ts = TimeseriesDataset(input_df)

    assert set(ts.get_subset(AggregationLevel.COUNTRY).data["metric"]) == {
        "you-kee",
        "you-ess-hey",
    }
    assert set(ts.get_subset(AggregationLevel.COUNTRY, country="UK").data["country"]) == {"UK"}
    assert set(ts.get_subset(AggregationLevel.STATE).data["metric"]) == {
        "mystate",
        "other-state",
    }
    assert set(ts.get_subset(state="ZZ", after="2020-03-23").data["metric"]) == {"march24-nyc"}
    assert set(ts.get_subset(state="ZZ", after="2020-03-22").data["metric"]) == {
        "smithville-march23",
        "county-metric",
        "mystate",
        "march24-nyc",
    }
    assert set(ts.get_subset(AggregationLevel.STATE, states=["ZZ", "XY"]).data["metric"]) == {
        "mystate",
        "other-state",
    }
    assert set(ts.get_subset(states=["ZZ"], on="2020-03-23").data["metric"]) == {
        "smithville-march23",
        "county-metric",
        "mystate",
    }
    assert set(ts.get_subset(states=["ZZ"], before="2020-03-23").data["metric"]) == {"march22-nyc"}

def generate_region_timeseries(
    region_summary: RegionSummary,
    timeseries: TimeseriesDataset,
    model_output: Optional[CANPyseirLocationOutput],
) -> RegionSummaryWithTimeseries:
    if not region_summary.intervention:
        # All region summaries here are expected to have actuals values. It's a bit
        # unclear why the actuals value is optional in the first place, but at this
        # point we expect actuals to have been included.
        raise AssertionError("Region summary missing actuals")

    actuals_timeseries = []
    for row in timeseries.yield_records():
        # Timeseries records don't have population.
        row[CommonFields.POPULATION] = region_summary.population
        actual = _generate_actuals(row, region_summary.intervention)
        timeseries_row = ActualsTimeseriesRow(**actual.dict(), date=row[CommonFields.DATE])
        actuals_timeseries.append(timeseries_row)

    model_timeseries = []
    if model_output:
        model_timeseries = [
            _generate_prediction_timeseries_row(row)
            for row in model_output.data.to_dict(orient="records")
        ]

    # Iterating a pydantic model yields (field_name, value) pairs; copy every field
    # of the summary into the combined model.
    region_summary_data = {key: getattr(region_summary, key) for (key, _) in region_summary}
    return RegionSummaryWithTimeseries(
        **region_summary_data,
        timeseries=model_timeseries,
        actualsTimeseries=actuals_timeseries,
    )

def _fips_csv_to_one_region(csv_str: str, region: Region) -> OneRegionTimeseriesDataset:
    # Make a TimeseriesDataset first because it can have a FIPS column without location_id.
    ts = TimeseriesDataset.load_csv(io.StringIO(csv_str))
    # from_timeseries_and_latest adds the location_id column needed by get_one_region.
    return MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, ts.latest_values_object()
    ).get_one_region(region)

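# Illustrative use of the helper above (a sketch, not one of the original tests; the
# FIPS value and "m1" metric column are made up in the style of the other fixtures here):
def _example_one_region() -> OneRegionTimeseriesDataset:
    csv_str = (
        "fips,county,aggregate_level,date,m1\n"
        "97111,Bar County,county,2020-04-01,1\n"
    )
    return _fips_csv_to_one_region(csv_str, Region.from_fips("97111"))
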
def test_aggregate():
    ts = TimeseriesDataset.load_csv(
        io.StringIO(
            "fips,state,aggregate_level,county,m1,date,foo\n"
            "55005,ZZ,county,North County,1,2020-05-01,ab\n"
            "55005,ZZ,county,North County,2,2020-05-02,cd\n"
            "55005,ZZ,county,North County,3,2020-05-03,ef\n"
            "55006,ZZ,county,South County,3,2020-05-03,ef\n"
            "55006,ZZ,county,South County,4,2020-05-04,gh\n"
            "55,ZZ,state,Grand State,41,2020-05-01,ij\n"
            "55,ZZ,state,Grand State,43,2020-05-03,kl\n"
        )
    )
    ts_in = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, ts.latest_values_object()
    )
    agg = statistical_areas.CountyToCBSAAggregator(
        county_map={"55005": "10001", "55006": "10001"},
        cbsa_title_map={"10001": "Stat Area 1"},
    )
    ts_out = agg.aggregate(ts_in)

    assert ts_out.groupby_region().ngroups == 1

    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10001"))
    # County values are summed per date; dates present in only one county pass through.
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-01"): 1,
        pd.to_datetime("2020-05-02"): 2,
        pd.to_datetime("2020-05-03"): 6,
        pd.to_datetime("2020-05-04"): 4,
    }

def get_timeseries_for_state(
    state: str, columns: List = None, min_range_with_some_value: bool = False
) -> TimeseriesDataset:
    """Gets timeseries for a specific state abbreviation.

    Args:
        state: 2-letter state code.
        columns: List of columns, apart from `TimeseriesDataset.INDEX_FIELDS`, to include.
        min_range_with_some_value: If True, removes NaNs that pad values at beginning
            and end of timeseries. Only applicable when columns are specified.

    Returns: Timeseries for state.
    """
    state_ts = load_us_timeseries_dataset().get_subset(AggregationLevel.STATE, state=state)
    if columns:
        subset = state_ts.data.loc[:, TimeseriesDataset.INDEX_FIELDS + columns].reset_index(
            drop=True
        )
        if min_range_with_some_value:
            subset = _remove_padded_nans(subset, columns)
        state_ts = TimeseriesDataset(subset)
    return state_ts

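# A minimal usage sketch for get_timeseries_for_state (illustrative, not part of the
# original module; assumes CommonFields.CASES is present in the loaded dataset):
def _example_state_cases():
    # Trim the leading/trailing all-NaN padding so the series starts and ends with data.
    state_ts = get_timeseries_for_state(
        "NY", columns=[CommonFields.CASES], min_range_with_some_value=True
    )
    return state_ts.data
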
def get_timeseries_for_fips(
    fips: str, columns: List = None, min_range_with_some_value: bool = False
) -> TimeseriesDataset:
    """Gets timeseries for a specific FIPS code.

    Args:
        fips: FIPS code. Can be county (5 character) or state (2 character) code.
        columns: List of columns, apart from `TimeseriesDataset.INDEX_FIELDS`, to include.
        min_range_with_some_value: If True, removes NaNs that pad values at beginning
            and end of timeseries. Only applicable when columns are specified.

    Returns: Timeseries for fips.
    """
    fips_ts = load_us_timeseries_dataset().get_subset(None, fips=fips)
    if columns:
        subset = fips_ts.data.loc[:, TimeseriesDataset.INDEX_FIELDS + columns].reset_index(
            drop=True
        )
        if min_range_with_some_value:
            subset = _remove_padded_nans(subset, columns)
        fips_ts = TimeseriesDataset(subset)
    return fips_ts

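# Companion sketch for the FIPS variant (illustrative; the FIPS code is one used in
# the tests above): a 5-character code selects a county, a 2-character code selects a
# state. The aggregation level is left as None so the FIPS match alone decides.
def _example_county_cases():
    return get_timeseries_for_fips("06045", columns=[CommonFields.CASES]).data
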
def test_unique_timeseries(data_source_cls):
    data_source = data_source_cls.local()
    timeseries = TimeseriesDataset.build_from_data_source(data_source)
    timeseries = combined_datasets.US_STATES_FILTER.apply(timeseries)
    timeseries_data = timeseries.data.set_index(timeseries.INDEX_FIELDS)
    duplicates = timeseries_data.index.duplicated()
    assert not sum(duplicates)

def timeseries(self) -> TimeseriesDataset:
    """Build TimeseriesDataset from this data source."""
    if set(self.INDEX_FIELD_MAP.keys()) != set(TimeseriesDataset.INDEX_FIELDS):
        raise ValueError("Index fields must match")
    return TimeseriesDataset.from_source(
        self, fill_missing_state=self.FILL_MISSING_STATE_LEVEL_DATA
    )

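# Sketch of the index contract checked above (illustrative helper, not in the original
# code): a source can be turned into a TimeseriesDataset only when its INDEX_FIELD_MAP
# keys cover exactly TimeseriesDataset.INDEX_FIELDS.
def _index_fields_match(source) -> bool:
    return set(source.INDEX_FIELD_MAP.keys()) == set(TimeseriesDataset.INDEX_FIELDS)
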
def test_get_subset_and_get_data():
    input_df = pd.read_csv(
        StringIO(
            "city,county,state,fips,country,aggregate_level,date,metric\n"
            "Smithville,,ZZ,97123,USA,city,2020-03-23,smithville-march23\n"
            "New York City,,ZZ,97324,USA,city,2020-03-22,march22-nyc\n"
            "New York City,,ZZ,97324,USA,city,2020-03-24,march24-nyc\n"
            ",North County,ZZ,97001,USA,county,2020-03-23,county-metric\n"
            ",,ZZ,97001,USA,state,2020-03-23,mystate\n"
            ",,XY,96001,USA,state,2020-03-23,other-state\n"
            ",,,,UK,country,2020-03-23,you-kee\n"
            ",,,,US,country,2020-03-23,you-ess-hey\n"
        )
    )
    ts = TimeseriesDataset(input_df)

    assert set(ts.get_subset(AggregationLevel.COUNTRY).data["metric"]) == {
        "you-kee",
        "you-ess-hey",
    }
    assert set(ts.get_subset(AggregationLevel.COUNTRY, country="UK").data["country"]) == {"UK"}
    assert set(ts.get_subset(AggregationLevel.STATE).data["metric"]) == {
        "mystate",
        "other-state",
    }
    assert set(ts.get_data(None, state="ZZ", after="2020-03-23")["metric"]) == {"march24-nyc"}
    assert set(ts.get_data(None, state="ZZ", after="2020-03-22")["metric"]) == {
        "smithville-march23",
        "county-metric",
        "mystate",
        "march24-nyc",
    }
    assert set(ts.get_data(AggregationLevel.STATE, states=["ZZ", "XY"])["metric"]) == {
        "mystate",
        "other-state",
    }
    assert set(ts.get_data(None, states=["ZZ"], on="2020-03-23")["metric"]) == {
        "smithville-march23",
        "county-metric",
        "mystate",
    }
    assert set(ts.get_data(None, states=["ZZ"], before="2020-03-23")["metric"]) == {"march22-nyc"}

def test_unique_timeseries(data_source_cls):
    data_source = data_source_cls.local()
    timeseries = TimeseriesDataset.build_from_data_source(data_source)
    timeseries = combined_datasets.US_STATES_FILTER.apply(timeseries)
    # Check for duplicate rows with the same INDEX_FIELDS. Sort by index so duplicates
    # are next to each other in the message if the assert fails.
    timeseries_data = timeseries.data.set_index(timeseries.INDEX_FIELDS).sort_index()
    duplicates = timeseries_data.index.duplicated(keep=False)
    assert not sum(duplicates), str(timeseries_data.loc[duplicates])

def load_combined_timeseries(
    sources: Dict[str, TimeseriesDataset], timeseries: TimeseriesDataset
) -> TimeseriesDataset:
    timeseries_data = timeseries.data.copy()
    timeseries_data["source"] = "Combined Data"
    combined_timeseries = TimeseriesDataset(
        pd.concat([timeseries_data] + [source.data for source in sources.values()])
    )
    return combined_timeseries

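# Illustrative sketch of what load_combined_timeseries produces (the helper name is
# made up): rows from the base timeseries are tagged source == "Combined Data", while
# rows from each entry in `sources` keep whatever "source" value they already had.
def _example_source_counts(base: TimeseriesDataset, extra: TimeseriesDataset):
    combined = load_combined_timeseries({"extra": extra}, base)
    return combined.data["source"].value_counts(dropna=False)
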
def test_write_csv():
    df = pd.DataFrame(
        {
            CommonFields.DATE: pd.to_datetime(["2020-04-01", "2020-04-02"]),
            CommonFields.FIPS: ["06045", "45123"],
            CommonFields.CASES: [234, 456],
        }
    )
    ts = TimeseriesDataset(df)

    expected_csv = """,,summary,summary,summary,summary,summary,summary,summary,summary,summary,value,value
date,,has_value,min_date,max_date,max_value,min_value,latest_value,num_observations,largest_delta,largest_delta_date,2020-04-01 00:00:00,2020-04-02 00:00:00
fips,variable,,,,,,,,,,,
06045,cases,True,2020-04-01,2020-04-01,234,234,234,1,,,234,
45123,cases,True,2020-04-02,2020-04-02,456,456,456,1,,,,456
"""
    # Call wide_dates_df.write_csv, which writes rows indexed by ["fips", "variable"],
    # the expected normal index.
    with temppathlib.NamedTemporaryFile("w+") as tmp:
        wide_dates_df.write_csv(ts.get_date_columns(), tmp.path)
        assert expected_csv == tmp.file.read()

def latest_case_summaries_by_state(dataset: TimeseriesDataset) -> Iterator[StateCaseSummary]:
    """Builds summary of latest case data by state and county.

    Data is generated for the embeds, which expect a list of records in this format:

        {
            "state": <state>,
            "date": "YYYY-MM-DD",
            "cases": <cases>,
            "deaths": <deaths>,
            "counties": [
                {"fips": <fips code>, "cases": <cases>, "deaths": <deaths>, "date": <date>}
            ]
        }

    Args:
        dataset: Timeseries dataset.

    Returns: Iterator of StateCaseSummary records.
    """
    dataset = dataset.get_subset(None, country="USA")
    latest_state = dataset.latest_values(AggregationLevel.STATE)
    latest_county = dataset.latest_values(AggregationLevel.COUNTY)
    latest_state["date"] = latest_state["date"].dt.strftime("%Y-%m-%d")
    latest_county["date"] = latest_county["date"].dt.strftime("%Y-%m-%d")

    states = latest_state[STATE_EXPORT_FIELDS].to_dict(orient="records")
    for state_data in states:
        state = state_data["state"]
        if len(state) != 2:
            _logger.info(f"Skipping state {state}")
            continue
        county_data = latest_county[latest_county.state == state]
        counties = county_data[COUNTY_EXPORT_FIELDS].to_dict(orient="records")
        state_data.update({"counties": counties})
        yield StateCaseSummary(**state_data)

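# Illustrative consumer sketch (the function name is an assumption; StateCaseSummary is
# assumed to be a pydantic model with .dict(), like the other models in this codebase):
import json

def _write_state_summaries(dataset: TimeseriesDataset, path: str) -> None:
    records = [summary.dict() for summary in latest_case_summaries_by_state(dataset)]
    with open(path, "w") as f:
        # default=str covers any values that are not directly JSON-serializable.
        json.dump(records, f, default=str)
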
def test_wide_dates():
    input_df = read_csv_and_index_fips_date(
        "fips,county,aggregate_level,date,m1,m2\n"
        "97111,Bar County,county,2020-04-01,1,\n"
        "97111,Bar County,county,2020-04-02,2,\n"
        "97222,Foo County,county,2020-04-01,,10\n"
        "97222,Foo County,county,2020-04-03,3,30\n"
    )
    provenance = provenance_wide_metrics_to_series(
        read_csv_and_index_fips_date(
            "fips,date,m1,m2\n"
            "97111,2020-04-01,src11,\n"
            "97111,2020-04-02,src11,\n"
            "97222,2020-04-01,,src22\n"
            "97222,2020-04-03,src21,src22\n"
        ),
        structlog.get_logger(),
    )

    ts = TimeseriesDataset(input_df.reset_index(), provenance=provenance)
    date_columns = ts.get_date_columns()

    assert to_dict(["fips", "variable"], date_columns["value"]) == {
        ("97111", "m1"): {
            pd.to_datetime("2020-04-01"): 1.0,
            pd.to_datetime("2020-04-02"): 2.0,
        },
        ("97222", "m1"): {pd.to_datetime("2020-04-03"): 3.0},
        ("97222", "m2"): {
            pd.to_datetime("2020-04-01"): 10.0,
            pd.to_datetime("2020-04-03"): 30.0,
        },
    }
    assert to_dict(["fips", "variable"], date_columns["provenance"]) == {
        ("97111", "m1"): {"value": "src11"},
        ("97222", "m1"): {"value": "src21"},
        ("97222", "m2"): {"value": "src22"},
    }

def build_from_data_source(cls, source):
    from libs.datasets.timeseries import TimeseriesDataset

    if set(source.INDEX_FIELD_MAP.keys()) == set(TimeseriesDataset.INDEX_FIELDS):
        timeseries = TimeseriesDataset.build_from_data_source(source)
        return timeseries.to_latest_values_dataset()

    if set(source.INDEX_FIELD_MAP.keys()) != set(cls.INDEX_FIELDS):
        raise ValueError("Index fields must match")
    return cls.from_source(source)

def test_make_latest_from_timeseries_dont_touch_county():
    data = read_csv_and_index_fips_date(
        "fips,county,state,country,date,aggregate_level,m1,m2\n"
        "95123,Smith Countyy,YY,USA,2020-04-01,county,1,\n"
        "97123,Smith Countzz,ZZ,USA,2020-04-01,county,2,\n"
        "97,,ZZ,USA,2020-04-01,state,3,\n"
    ).reset_index()
    ts = TimeseriesDataset(data)
    assert to_dict(["fips"], ts.latest_values()[["fips", "county", "m1", "m2"]]) == {
        "95123": {"m1": 1, "county": "Smith Countyy"},
        "97123": {"m1": 2, "county": "Smith Countzz"},
        "97": {"m1": 3},
    }

def get_hospitalization_data():
    data = combined_datasets.build_us_timeseries_with_all_fields().data
    # Since we're using this data for hospitalization metrics only, return only rows
    # with hospitalization data. As the use cases of this data source expand, we may
    # not want to drop rows. For context, as of 4/8, 607/1821 rows contained
    # hospitalization data.
    has_current_hospital = data[TimeseriesDataset.Fields.CURRENT_HOSPITALIZED].notnull()
    has_cumulative_hospital = data[TimeseriesDataset.Fields.CUMULATIVE_HOSPITALIZED].notnull()
    return TimeseriesDataset(data[has_current_hospital | has_cumulative_hospital])

def test_persist_and_load_dataset(tmp_path, nyc_fips):
    region = Region.from_fips(nyc_fips)
    dataset = combined_datasets.load_us_timeseries_dataset()
    timeseries_nyc = TimeseriesDataset(dataset.get_one_region(region).data)

    pointer = combined_dataset_utils.persist_dataset(timeseries_nyc, tmp_path)

    downloaded_dataset = pointer.load_dataset()
    differ_l = DatasetDiff.make(downloaded_dataset.data)
    differ_r = DatasetDiff.make(timeseries_nyc.data)
    differ_l.compare(differ_r)
    assert not len(differ_l.my_ts)

def build_from_data_source(cls, source):
    from libs.datasets.timeseries import TimeseriesDataset

    # Timeseries-shaped sources are first built as a TimeseriesDataset and then
    # collapsed to their latest values.
    if set(source.INDEX_FIELD_MAP.keys()) == set(TimeseriesDataset.INDEX_FIELDS):
        timeseries = TimeseriesDataset.build_from_data_source(source)
        return timeseries.to_latest_values_dataset()

    if set(source.INDEX_FIELD_MAP.keys()) != set(cls.INDEX_FIELDS):
        raise ValueError("Index fields must match")
    return cls.from_source(source, fill_missing_state=source.FILL_MISSING_STATE_LEVEL_DATA)

def test_summarize_timeseries_fields_with_some_real_data():
    data_source = CovidCountyDataDataSource.local()
    ts = TimeseriesDataset.from_source(data_source)
    summary = summarize_timeseries_fields(
        ts.data.loc[lambda df: df[CommonFields.FIPS].str.startswith("06")]
    )
    assert not summary.empty

    cases_summary = summary.loc[("06025", "cases"), :]
    assert cases_summary["max_value"] > 7000
    assert cases_summary["max_date"] > pd.to_datetime("2020-08-01")
    assert cases_summary["largest_delta_date"] > pd.to_datetime("2020-04-01")
    assert cases_summary["has_value"] == True
    assert cases_summary["num_observations"] > 100

def get_hospitalization_data():
    """Returns a timeseries of only the rows that have hospitalization data.

    Since we're using this data for hospitalization metrics only, rows without
    hospitalization data are dropped. As the use cases of this data source expand,
    we may not want to drop them. For context, as of 4/8, 607/1821 rows contained
    hospitalization data.

    Returns
    -------
    TimeseriesDataset
    """
    data = combined_datasets.load_us_timeseries_dataset().data
    has_current_hospital = data[CommonFields.CURRENT_HOSPITALIZED].notnull()
    has_cumulative_hospital = data[CommonFields.CUMULATIVE_HOSPITALIZED].notnull()
    return TimeseriesDataset(data[has_current_hospital | has_cumulative_hospital])

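# Illustrative sanity check for get_hospitalization_data (a sketch, not an original
# test): every returned row has at least one of the two hospitalization columns set,
# which is exactly the filter applied above.
def _check_hospitalization_rows():
    ts = get_hospitalization_data()
    assert (
        ts.data[CommonFields.CURRENT_HOSPITALIZED].notnull()
        | ts.data[CommonFields.CUMULATIVE_HOSPITALIZED].notnull()
    ).all()
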
def test_load_from_local_public_data():
    agg = statistical_areas.CountyToCBSAAggregator.from_local_public_data()

    assert agg.cbsa_title_map["43580"] == "Sioux City, IA-NE-SD"
    assert agg.county_map["48187"] == "41700"

    ts = TimeseriesDataset.load_csv(
        io.StringIO(
            "fips,state,aggregate_level,county,m1,date,foo\n"
            "48059,ZZ,county,North County,3,2020-05-03,ef\n"
            "48253,ZZ,county,South County,4,2020-05-03,ef\n"
        )
    )
    ts_in = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, ts.latest_values_object()
    )
    ts_out = agg.aggregate(ts_in)

    ts_cbsa = ts_out.get_one_region(Region.from_cbsa_code("10180"))
    assert ts_cbsa.date_indexed["m1"].to_dict() == {
        pd.to_datetime("2020-05-03"): 7,
    }

def _write_pipeline_output(
    pipelines: List[Union[SubStatePipeline, StatePipeline]],
    output_dir: str,
    output_interval_days: int = 4,
    write_webui_output: bool = False,
):
    infection_rate_metric_df = pd.concat((p.infer_df for p in pipelines), ignore_index=True)
    # TODO: Use constructors in MultiRegionTimeseriesDataset
    timeseries_dataset = TimeseriesDataset(infection_rate_metric_df)
    latest = timeseries_dataset.latest_values_object()
    multiregion_rt = MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        timeseries_dataset, latest
    )
    output_path = pathlib.Path(output_dir) / pyseir.utils.SummaryArtifact.RT_METRIC_COMBINED.value
    multiregion_rt.to_csv(output_path)
    root.info(f"Saving Rt results to {output_path}")

    icu_df = pd.concat((p.icu_data.data for p in pipelines if p.icu_data), ignore_index=True)
    timeseries_dataset = TimeseriesDataset(icu_df)
    latest = timeseries_dataset.latest_values_object().data.set_index(CommonFields.LOCATION_ID)
    multiregion_icu = MultiRegionTimeseriesDataset(icu_df, latest)
    output_path = pathlib.Path(output_dir) / pyseir.utils.SummaryArtifact.ICU_METRIC_COMBINED.value
    multiregion_icu.to_csv(output_path)
    root.info(f"Saving ICU results to {output_path}")

    if write_webui_output:
        # This does not parallelize well because the web UI mapper doesn't serialize
        # efficiently.
        # TODO: Remove intermediate artifacts and parallelize artifact creation better.
        # Approximately 40% of the processing time is taken on this step.
        web_ui_mapper = WebUIDataAdaptorV1(
            output_interval_days=output_interval_days,
            output_dir=output_dir,
        )
        webui_inputs = [
            webui_data_adaptor_v1.RegionalInput.from_results(p.fitter, p.ensemble, p.infer_df)
            for p in pipelines
            if p.fitter
        ]
        with Pool(maxtasksperchild=1) as p:
            p.map(web_ui_mapper.write_region_safely, webui_inputs)

def test_expected_field_in_sources(data_source_cls):
    data_source = data_source_cls.local()
    ts = TimeseriesDataset.from_source(data_source)

    # Extract the USA data from the raw DF. Replace this with cleaner access when the
    # DataSource makes it easy.
    rename_columns = {source: common for common, source in data_source.all_fields_map().items()}
    renamed_data = data_source.data.rename(columns=rename_columns)
    usa_data = renamed_data.loc[renamed_data["country"] == "USA"]

    assert not usa_data.empty

    states = set(usa_data["state"])
    if data_source.SOURCE_NAME == "NHA":
        assert states == {"NV"}
    else:
        good_states = set()
        for state in states:
            if re.fullmatch(r"[A-Z]{2}", state):
                good_states.add(state)
            else:
                logging.info(f"Ignoring {state} in {data_source.SOURCE_NAME}")
        assert len(good_states) >= 48

def timeseries(self) -> TimeseriesDataset:
    """Builds generic beds dataset"""
    return TimeseriesDataset.build_from_data_source(self)

def timeseries(self) -> "TimeseriesDataset":
    """Builds generic beds dataset"""
    return TimeseriesDataset.from_source(self)