def test_multiregion_provenance():
    """Provenance survives conversion to a multi-region dataset and county filtering."""
    metrics_df = read_csv_and_index_fips_date(
        "fips,county,aggregate_level,date,m1,m2\n"
        "97111,Bar County,county,2020-04-01,1,\n"
        "97111,Bar County,county,2020-04-02,2,\n"
        "97222,Foo County,county,2020-04-01,,10\n"
        "97222,Foo County,county,2020-04-03,3,30\n"
        "03,,state,2020-04-03,4,40\n"
    )
    source_series = combined_datasets.provenance_wide_metrics_to_series(
        read_csv_and_index_fips_date(
            "fips,date,m1,m2\n"
            "97111,2020-04-01,src11,\n"
            "97111,2020-04-02,src11,\n"
            "97222,2020-04-01,,src22\n"
            "97222,2020-04-03,src21,src22\n"
            "03,2020-04-03,src31,src32\n"
        ),
        structlog.get_logger(),
    )
    dataset = timeseries.TimeseriesDataset(metrics_df.reset_index(), provenance=source_series)
    multiregion = timeseries.MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        dataset, dataset.latest_values_object()
    )
    # Use loc[...].at[...] as work-around for https://github.com/pandas-dev/pandas/issues/26989
    assert multiregion.provenance.loc["iso1:us#fips:97111"].at["m1"] == "src11"
    assert multiregion.provenance.loc["iso1:us#fips:97222"].at["m2"] == "src22"
    assert multiregion.provenance.loc["iso1:us#fips:03"].at["m2"] == "src32"

    counties = multiregion.get_counties(after=pd.to_datetime("2020-04-01"))
    # The state-level region (fips 03) is dropped when restricting to counties.
    assert "iso1:us#fips:03" not in counties.provenance.index
    assert counties.provenance.loc["iso1:us#fips:97222"].at["m1"] == "src21"
def test_one_region_dataset():
    """OneRegionTimeseriesDataset accepts exactly one region and warns when empty.

    Fixes the `== True` comparison anti-idiom (use truthiness directly) and the
    single-letter loop variable `l`, which is easily confused with `1`.
    """
    ts = timeseries.OneRegionTimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,aggregate_level,date,m1,m2\n"
            "97111,Bar County,county,2020-04-02,2,\n"
        ).reset_index(),
        {},
    )
    assert ts.has_one_region()

    # A dataset containing two distinct FIPS is rejected.
    with pytest.raises(ValueError):
        timeseries.OneRegionTimeseriesDataset(
            read_csv_and_index_fips_date(
                "fips,county,aggregate_level,date,m1,m2\n"
                "97111,Bar County,county,2020-04-02,2,\n"
                "97222,Foo County,county,2020-04-01,,10\n"
            ).reset_index(),
            {},
        )

    # Zero regions is allowed, but a warning event is logged.
    with structlog.testing.capture_logs() as logs:
        ts = timeseries.OneRegionTimeseriesDataset(
            read_csv_and_index_fips_date("fips,county,aggregate_level,date,m1,m2\n").reset_index(),
            {},
        )
    assert [log["event"] for log in logs] == [
        "Creating OneRegionTimeseriesDataset with zero regions"
    ]
    assert ts.empty
def test_build_timeseries():
    """The last source listed for a field wins when combining datasets."""
    source_a = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,1\n"
    )
    source_b = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,2\n"
    )
    datasets = {"source_a": source_a, "source_b": source_b}

    # source_b is listed last, so its value (2) overrides source_a's.
    combined = _build_dataframe({"cases": ["source_a", "source_b"]}, datasets)
    assert combined.at[("97123", "2020-04-01"), "cases"] == 2

    # Reversing the priority order picks source_a's value instead.
    combined = _build_dataframe({"cases": ["source_b", "source_a"]}, datasets)
    assert combined.at[("97123", "2020-04-01"), "cases"] == 1
def test_multi_region_to_from_timeseries_and_latest_values(tmp_path: pathlib.Path):
    """A multi-region dataset round-trips through CSV keeping timeseries and latest values."""
    dataset = timeseries.TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,aggregate_level,date,m1,m2\n"
            "97111,Bar County,county,2020-04-02,2,\n"
            "97222,Foo County,county,2020-04-01,,10\n"
            "01,,state,2020-04-01,,20\n"
        ).reset_index()
    )
    latest = timeseries.LatestValuesDataset(
        read_csv_and_index_fips(
            "fips,county,aggregate_level,c1,c2\n"
            "97111,Bar County,county,3,\n"
            "97222,Foo County,county,4,10.5\n"
            "01,,state,,123.4\n"
        ).reset_index()
    )
    combined = timeseries.MultiRegionTimeseriesDataset.from_timeseries_and_latest(dataset, latest)

    def check(multiregion):
        # Both the timeseries values and the latest values must be present.
        bar_county = multiregion.get_one_region(Region.from_fips("97111"))
        assert bar_county.date_indexed.at["2020-04-02", "m1"] == 2
        assert bar_county.latest["c1"] == 3
        assert multiregion.get_one_region(Region.from_fips("01")).latest["c2"] == 123.4

    check(combined)

    # Writing to CSV and reading back must preserve the same values.
    csv_path = tmp_path / "multiregion.csv"
    combined.to_csv(csv_path)
    check(timeseries.MultiRegionTimeseriesDataset.from_csv(csv_path))
def test_fix_missing_pos():
    """synthesize_test_metrics fills a missing positive_tests value and records provenance."""
    input_df = read_csv_and_index_fips_date(
        "fips,date,negative_tests,positive_tests,total_tests,cases\n"
        "97123,2020-04-01,9,1,10,1\n"
        "97123,2020-04-02,17,,20,2\n"
        "97123,2020-04-03,26,4,30,4\n"
    ).reset_index()

    result_df, provenance = CovidCountyDataDataSource.synthesize_test_metrics(input_df)

    expected = {
        ("97123", pd.to_datetime("2020-04-01")): {
            CommonFields.NEGATIVE_TESTS: 9,
            CommonFields.POSITIVE_TESTS: 1,
            CommonFields.TOTAL_TESTS: 10,
            CommonFields.CASES: 1,
        },
        ("97123", pd.to_datetime("2020-04-02")): {
            CommonFields.NEGATIVE_TESTS: 17,
            # Missing positive value is synthesized: 20 total - 17 negative = 3.
            CommonFields.POSITIVE_TESTS: 3,
            CommonFields.TOTAL_TESTS: 20,
            CommonFields.CASES: 2,
        },
        ("97123", pd.to_datetime("2020-04-03")): {
            CommonFields.NEGATIVE_TESTS: 26,
            CommonFields.POSITIVE_TESTS: 4,
            CommonFields.TOTAL_TESTS: 30,
            CommonFields.CASES: 4,
        },
    }
    assert to_dict([CommonFields.FIPS, CommonFields.DATE], result_df) == expected
    # The synthesized column is tagged with the "missing_pos" provenance marker.
    assert provenance.to_dict() == {
        ("97123", CommonFields.NEGATIVE_TESTS): "none",
        ("97123", CommonFields.POSITIVE_TESTS): "none;missing_pos",
    }
def test_wide_dates():
    """get_date_columns pivots values to one column per date and carries provenance along."""
    values_df = read_csv_and_index_fips_date(
        "fips,county,aggregate_level,date,m1,m2\n"
        "97111,Bar County,county,2020-04-01,1,\n"
        "97111,Bar County,county,2020-04-02,2,\n"
        "97222,Foo County,county,2020-04-01,,10\n"
        "97222,Foo County,county,2020-04-03,3,30\n"
    )
    sources = provenance_wide_metrics_to_series(
        read_csv_and_index_fips_date(
            "fips,date,m1,m2\n"
            "97111,2020-04-01,src11,\n"
            "97111,2020-04-02,src11,\n"
            "97222,2020-04-01,,src22\n"
            "97222,2020-04-03,src21,src22\n"
        ),
        structlog.get_logger(),
    )
    dataset = TimeseriesDataset(values_df.reset_index(), provenance=sources)

    wide = dataset.get_date_columns()
    # "value" holds the metric observations keyed by date.
    assert to_dict(["fips", "variable"], wide["value"]) == {
        ("97111", "m1"): {pd.to_datetime("2020-04-01"): 1.0, pd.to_datetime("2020-04-02"): 2.0},
        ("97222", "m1"): {pd.to_datetime("2020-04-03"): 3.0},
        ("97222", "m2"): {pd.to_datetime("2020-04-01"): 10.0, pd.to_datetime("2020-04-03"): 30.0},
    }
    # "provenance" holds one source string per (fips, variable) timeseries.
    assert to_dict(["fips", "variable"], wide["provenance"]) == {
        ("97111", "m1"): {"value": "src11"},
        ("97222", "m1"): {"value": "src21"},
        ("97222", "m2"): {"value": "src22"},
    }
def test_build_timeseries_override():
    """Each Override mode picks values from prioritized sources in a different way."""
    source_a = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97123,2020-04-01,1,\n"
        "97123,2020-04-02,,\n"
        "97123,2020-04-03,3,3"
    )
    source_b = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n" "97123,2020-04-01,,\n" "97123,2020-04-02,2,\n"
    )
    datasets = {"source_a": source_a, "source_b": source_b}

    def combined_m1(priority, override):
        # Build the combined frame and return the m1 values with NaN mapped to None.
        df = _build_dataframe({"m1": priority}, datasets, override=override)
        return df.loc["97123", "m1"].replace({np.nan: None}).tolist()

    # The combined m1 timeseries is copied wholesale from source_b; source_a is not used for m1.
    assert combined_m1(["source_a", "source_b"], Override.BY_TIMESERIES) == [None, 2, None]
    # The combined m1 timeseries is the highest priority real value for each date,
    # a blend of source_a and source_b.
    assert combined_m1(["source_a", "source_b"], Override.BY_TIMESERIES_POINT) == [1, 2, 3]
    # The combined m1 timeseries is the highest priority value for each date;
    # source_b is higher priority for both 2020-04-01 and 2020-04-02.
    assert combined_m1(["source_a", "source_b"], Override.BY_ROW) == [None, 2, 3]

    # Same three modes with the priority order reversed.
    assert combined_m1(["source_b", "source_a"], Override.BY_TIMESERIES) == [1, None, 3]
    assert combined_m1(["source_b", "source_a"], Override.BY_TIMESERIES_POINT) == [1, 2, 3]
    assert combined_m1(["source_b", "source_a"], Override.BY_ROW) == [1, None, 3]
def test_build_data_and_provenance_missing_fips():
    """A FIPS present in only one source still gets values and provenance from that source.

    Renamed from ``test_build_and_and_provenance_missing_fips`` to fix the
    duplicated "and" typo in the test name; behavior is unchanged.
    """
    data_a = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97111,2020-04-01,1,\n"
        "97111,2020-04-02,,\n"
        "97111,2020-04-03,3,3\n"
    )
    data_b = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97111,2020-04-01,,\n"
        "97111,2020-04-02,2,\n"
        "97444,2020-04-04,4,\n"
    )
    datasets = {"source_a": data_a, "source_b": data_b}

    combined, provenance = _build_data_and_provenance(
        {"m1": ["source_a", "source_b"], "m2": ["source_a", "source_b"]}, datasets
    )
    # 97444 only appears in source_b; its m1 value and provenance come from there.
    assert combined.loc["97444", "m1"].dropna().tolist() == [4]
    assert provenance.loc["97444", "m1"].dropna().tolist() == ["source_b"]
    # 97444 has no m2 values in any source, so both value and provenance are empty.
    assert combined.loc["97444", "m2"].dropna().tolist() == []
    assert provenance.loc["97444", "m2"].dropna().tolist() == []
def test_build_data_and_provenance_override():
    """_build_data_and_provenance records which source supplied each combined value.

    Renamed from ``test_build_timeseries_override``: that name collided with an
    earlier test function of the same name in this module, so this duplicate
    definition shadowed the first one and pytest silently never ran it.
    """
    data_a = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97123,2020-04-01,1,\n"
        "97123,2020-04-02,,\n"
        "97123,2020-04-03,3,3"
    )
    data_b = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n" "97123,2020-04-01,,\n" "97123,2020-04-02,2,\n"
    )
    datasets = {"source_a": data_a, "source_b": data_b}

    # The combined m1 timeseries is copied from the timeseries in source_b;
    # source_a is not used for m1.
    combined, provenance = _build_data_and_provenance({"m1": ["source_a", "source_b"]}, datasets)
    assert combined.loc["97123", "m1"].replace({np.nan: None}).tolist() == [None, 2, None]
    assert provenance.loc["97123", "m1"].replace({np.nan: None}).tolist() == [
        None,
        "source_b",
        None,
    ]

    # With priorities reversed, source_a's whole timeseries wins for m1.
    combined, provenance = _build_data_and_provenance({"m1": ["source_b", "source_a"]}, datasets)
    assert combined.loc["97123", "m1"].replace({np.nan: None}).tolist() == [1, None, 3]
    assert provenance.loc["97123", "m1"].replace({np.nan: None}).tolist() == [
        "source_a",
        None,
        "source_a",
    ]
def test_make_latest_from_timeseries_simple():
    """latest_values takes the most recent non-null value of each metric per region."""
    dataset = TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,state,country,date,aggregate_level,m1,m2\n"
            "97123,Smith County,ZZ,USA,2020-04-01,county,1,\n"
            "97123,Smith County,ZZ,USA,2020-04-02,county,,2\n"
        ).reset_index()
    )
    # m1 comes from 04-01 and m2 from 04-02: latest non-null wins per column.
    assert to_dict(["fips"], dataset.latest_values()[["fips", "m1", "m2"]]) == {
        "97123": {"m1": 1, "m2": 2}
    }
def test_melt_provenance_multiple_sources():
    """Differing sources within one timeseries are joined with ';' and a warning is logged."""
    wide_df = read_csv_and_index_fips_date(
        "fips,date,cases,recovered\n"
        "97111,2020-04-01,source_a,source_b\n"
        "97111,2020-04-02,source_x,\n"
        "97222,2020-04-01,source_c,\n"
    )
    with structlog.testing.capture_logs() as logs:
        melted = provenance_wide_metrics_to_series(wide_df, structlog.get_logger())

    # The 97111 cases series has two distinct sources, which triggers a log event.
    assert [log["event"] for log in logs] == ["Multiple rows for a timeseries"]
    assert melted.to_dict() == {
        ("97111", "cases"): "source_a;source_x",
        ("97111", "recovered"): "source_b",
        ("97222", "cases"): "source_c",
    }
def test_melt_provenance():
    """A single consistent source per timeseries melts cleanly with no log output."""
    wide_df = read_csv_and_index_fips_date(
        "fips,date,cases,recovered\n"
        "97111,2020-04-01,source_a,source_b\n"
        "97111,2020-04-02,source_a,\n"
        "97222,2020-04-01,source_c,\n"
    )
    with structlog.testing.capture_logs() as logs:
        melted = provenance_wide_metrics_to_series(wide_df, structlog.get_logger())

    # Every timeseries has one source, so nothing is logged.
    assert logs == []
    assert melted.to_dict() == {
        ("97111", "cases"): "source_a",
        ("97111", "recovered"): "source_b",
        ("97222", "cases"): "source_c",
    }
def test_multi_region_to_from_timeseries():
    """Converting to a multi-region dataset and back preserves the data frame."""
    dataset = timeseries.TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,aggregate_level,date,m1,m2\n"
            "97111,Bar County,county,2020-04-02,2,\n"
            "97222,Foo County,county,2020-04-01,,10\n"
            "01,,state,2020-04-01,,20\n"
        ).reset_index()
    )
    multiregion = timeseries.MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        dataset, dataset.latest_values_object()
    )
    # The multi-region frame only adds a LOCATION_ID column on top of the original data.
    pd.testing.assert_frame_equal(
        dataset.data, multiregion.data.drop(columns=[CommonFields.LOCATION_ID])
    )
    round_tripped = multiregion.to_timeseries()
    pd.testing.assert_frame_equal(
        dataset.data, round_tripped.data.drop(columns=[CommonFields.LOCATION_ID])
    )
def test_make_latest_from_timeseries_dont_touch_county():
    """latest_values keeps county names verbatim, even near-duplicates across states."""
    dataset = TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,state,country,date,aggregate_level,m1,m2\n"
            "95123,Smith Countyy,YY,USA,2020-04-01,county,1,\n"
            "97123,Smith Countzz,ZZ,USA,2020-04-01,county,2,\n"
            "97,,ZZ,USA,2020-04-01,state,3,\n"
        ).reset_index()
    )
    # The deliberately misspelled county names must pass through untouched.
    assert to_dict(["fips"], dataset.latest_values()[["fips", "county", "m1", "m2"]]) == {
        "95123": {"m1": 1, "county": "Smith Countyy"},
        "97123": {"m1": 2, "county": "Smith Countzz"},
        "97": {"m1": 3},
    }