Example #1
0
def test_multiregion_provenance():
    """Provenance strings survive conversion into a MultiRegionTimeseriesDataset."""
    # Timeseries covering two counties and one state.
    metrics_df = read_csv_and_index_fips_date(
        "fips,county,aggregate_level,date,m1,m2\n"
        "97111,Bar County,county,2020-04-01,1,\n"
        "97111,Bar County,county,2020-04-02,2,\n"
        "97222,Foo County,county,2020-04-01,,10\n"
        "97222,Foo County,county,2020-04-03,3,30\n"
        "03,,state,2020-04-03,4,40\n"
    )
    # Per-(fips, metric) source names, melted into a Series keyed like the data.
    provenance_series = combined_datasets.provenance_wide_metrics_to_series(
        read_csv_and_index_fips_date(
            "fips,date,m1,m2\n"
            "97111,2020-04-01,src11,\n"
            "97111,2020-04-02,src11,\n"
            "97222,2020-04-01,,src22\n"
            "97222,2020-04-03,src21,src22\n"
            "03,2020-04-03,src31,src32\n"
        ),
        structlog.get_logger(),
    )
    dataset = timeseries.TimeseriesDataset(
        metrics_df.reset_index(), provenance=provenance_series
    )
    out = timeseries.MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        dataset, dataset.latest_values_object()
    )
    # Use loc[...].at[...] as work-around for https://github.com/pandas-dev/pandas/issues/26989
    assert out.provenance.loc["iso1:us#fips:97111"].at["m1"] == "src11"
    assert out.provenance.loc["iso1:us#fips:97222"].at["m2"] == "src22"
    assert out.provenance.loc["iso1:us#fips:03"].at["m2"] == "src32"

    counties = out.get_counties(after=pd.to_datetime("2020-04-01"))
    # The state row is dropped while county provenance is kept.
    assert "iso1:us#fips:03" not in counties.provenance.index
    assert counties.provenance.loc["iso1:us#fips:97222"].at["m1"] == "src21"
Example #2
0
def test_one_region_dataset():
    """OneRegionTimeseriesDataset accepts exactly one region and logs when empty."""
    ts = timeseries.OneRegionTimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,aggregate_level,date,m1,m2\n" "97111,Bar County,county,2020-04-02,2,\n"
        ).reset_index(),
        {},
    )
    # Assert truthiness directly instead of comparing to the True singleton (E712).
    assert ts.has_one_region()

    # Two distinct FIPS codes in one dataset must be rejected.
    with pytest.raises(ValueError):
        timeseries.OneRegionTimeseriesDataset(
            read_csv_and_index_fips_date(
                "fips,county,aggregate_level,date,m1,m2\n"
                "97111,Bar County,county,2020-04-02,2,\n"
                "97222,Foo County,county,2020-04-01,,10\n"
            ).reset_index(),
            {},
        )

    # Zero regions is allowed but produces a warning log entry.
    with structlog.testing.capture_logs() as logs:
        ts = timeseries.OneRegionTimeseriesDataset(
            read_csv_and_index_fips_date("fips,county,aggregate_level,date,m1,m2\n").reset_index(),
            {},
        )
    assert [log["event"] for log in logs] == [
        "Creating OneRegionTimeseriesDataset with zero regions"
    ]
    assert ts.empty
Example #3
0
def test_build_timeseries():
    """Later sources in the priority list win when combining a single field."""
    jones_from_a = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,1\n"
    )
    jones_from_b = read_csv_and_index_fips_date(
        "county,state,fips,country,aggregate_level,date,cases\n"
        "Jones County,ZZ,97123,USA,county,2020-04-01,2\n"
    )
    datasets = {"source_a": jones_from_a, "source_b": jones_from_b}
    key = ("97123", "2020-04-01")

    # source_b is listed last so its value (2) overrides source_a's.
    combined = _build_dataframe({"cases": ["source_a", "source_b"]}, datasets)
    assert combined.at[key, "cases"] == 2

    # Reversing the priority order makes source_a's value (1) win.
    combined = _build_dataframe({"cases": ["source_b", "source_a"]}, datasets)
    assert combined.at[key, "cases"] == 1
Example #4
0
def test_multi_region_to_from_timeseries_and_latest_values(tmp_path: pathlib.Path):
    """A MultiRegionTimeseriesDataset round-trips through CSV without losing values."""
    ts = timeseries.TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,aggregate_level,date,m1,m2\n"
            "97111,Bar County,county,2020-04-02,2,\n"
            "97222,Foo County,county,2020-04-01,,10\n"
            "01,,state,2020-04-01,,20\n"
        ).reset_index()
    )
    latest_values = timeseries.LatestValuesDataset(
        read_csv_and_index_fips(
            "fips,county,aggregate_level,c1,c2\n"
            "97111,Bar County,county,3,\n"
            "97222,Foo County,county,4,10.5\n"
            "01,,state,,123.4\n"
        ).reset_index()
    )

    def check_expected_values(dataset):
        # Shared assertions applied to both the in-memory and the reloaded dataset.
        bar_county = dataset.get_one_region(Region.from_fips("97111"))
        assert bar_county.date_indexed.at["2020-04-02", "m1"] == 2
        assert bar_county.latest["c1"] == 3
        assert dataset.get_one_region(Region.from_fips("01")).latest["c2"] == 123.4

    multiregion = timeseries.MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, latest_values
    )
    check_expected_values(multiregion)

    csv_path = tmp_path / "multiregion.csv"
    multiregion.to_csv(csv_path)
    check_expected_values(timeseries.MultiRegionTimeseriesDataset.from_csv(csv_path))
Example #5
0
def test_fix_missing_pos():
    """synthesize_test_metrics fills a missing positive_tests value and records provenance."""
    input_df = read_csv_and_index_fips_date(
        "fips,date,negative_tests,positive_tests,total_tests,cases\n"
        "97123,2020-04-01,9,1,10,1\n"
        "97123,2020-04-02,17,,20,2\n"
        "97123,2020-04-03,26,4,30,4\n"
    ).reset_index()

    result_df, provenance = CovidCountyDataDataSource.synthesize_test_metrics(input_df)

    def metrics_row(negative, positive, total, cases):
        # Build one expected row keyed by the common field names.
        return {
            CommonFields.NEGATIVE_TESTS: negative,
            CommonFields.POSITIVE_TESTS: positive,
            CommonFields.TOTAL_TESTS: total,
            CommonFields.CASES: cases,
        }

    assert to_dict([CommonFields.FIPS, CommonFields.DATE], result_df) == {
        ("97123", pd.to_datetime("2020-04-01")): metrics_row(9, 1, 10, 1),
        # 2020-04-02 had no positive_tests in the input; it is synthesized as 3.
        ("97123", pd.to_datetime("2020-04-02")): metrics_row(17, 3, 20, 2),
        ("97123", pd.to_datetime("2020-04-03")): metrics_row(26, 4, 30, 4),
    }
    assert provenance.to_dict() == {
        ("97123", CommonFields.NEGATIVE_TESTS): "none",
        ("97123", CommonFields.POSITIVE_TESTS): "none;missing_pos",
    }
Example #6
0
def test_wide_dates():
    """get_date_columns pivots values and provenance into date-wide columns."""
    input_df = read_csv_and_index_fips_date(
        "fips,county,aggregate_level,date,m1,m2\n"
        "97111,Bar County,county,2020-04-01,1,\n"
        "97111,Bar County,county,2020-04-02,2,\n"
        "97222,Foo County,county,2020-04-01,,10\n"
        "97222,Foo County,county,2020-04-03,3,30\n"
    )
    provenance = provenance_wide_metrics_to_series(
        read_csv_and_index_fips_date(
            "fips,date,m1,m2\n"
            "97111,2020-04-01,src11,\n"
            "97111,2020-04-02,src11,\n"
            "97222,2020-04-01,,src22\n"
            "97222,2020-04-03,src21,src22\n"
        ),
        structlog.get_logger(),
    )

    dataset = TimeseriesDataset(input_df.reset_index(), provenance=provenance)
    date_columns = dataset.get_date_columns()

    # Only dates with a real observation appear under "value", keyed by (fips, metric).
    assert to_dict(["fips", "variable"], date_columns["value"]) == {
        ("97111", "m1"): {pd.to_datetime("2020-04-01"): 1.0, pd.to_datetime("2020-04-02"): 2.0},
        ("97222", "m1"): {pd.to_datetime("2020-04-03"): 3.0},
        ("97222", "m2"): {pd.to_datetime("2020-04-01"): 10.0, pd.to_datetime("2020-04-03"): 30.0},
    }
    # Each timeseries carries a single provenance string.
    assert to_dict(["fips", "variable"], date_columns["provenance"]) == {
        ("97111", "m1"): {"value": "src11"},
        ("97222", "m1"): {"value": "src21"},
        ("97222", "m2"): {"value": "src22"},
    }
Example #7
0
def test_build_timeseries_override():
    """Each Override mode blends source_a and source_b differently for field m1."""
    data_a = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n" "97123,2020-04-01,1,\n" "97123,2020-04-02,,\n" "97123,2020-04-03,3,3"
    )
    data_b = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n" "97123,2020-04-01,,\n" "97123,2020-04-02,2,\n"
    )
    datasets = {"source_a": data_a, "source_b": data_b}

    def combined_m1(field_sources, override):
        # Combine and return m1 for 97123 with NaN mapped to None for easy comparison.
        combined = _build_dataframe({"m1": field_sources}, datasets, override=override)
        return combined.loc["97123", "m1"].replace({np.nan: None}).tolist()

    a_then_b = ["source_a", "source_b"]
    b_then_a = ["source_b", "source_a"]

    # BY_TIMESERIES copies the whole highest-priority timeseries; source_a is not used for m1.
    assert combined_m1(a_then_b, Override.BY_TIMESERIES) == [None, 2, None]
    # BY_TIMESERIES_POINT takes the highest priority real value for each date, a blend of both.
    assert combined_m1(a_then_b, Override.BY_TIMESERIES_POINT) == [1, 2, 3]
    # BY_ROW takes the highest priority row for each date; source_b is higher priority for
    # both 2020-04-01 and 2020-04-02.
    assert combined_m1(a_then_b, Override.BY_ROW) == [None, 2, 3]

    assert combined_m1(b_then_a, Override.BY_TIMESERIES) == [1, None, 3]
    assert combined_m1(b_then_a, Override.BY_TIMESERIES_POINT) == [1, 2, 3]
    assert combined_m1(b_then_a, Override.BY_ROW) == [1, None, 3]
def test_build_and_and_provenance_missing_fips():
    """A FIPS present in only one source still gets data and provenance from that source."""
    data_a = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97111,2020-04-01,1,\n"
        "97111,2020-04-02,,\n"
        "97111,2020-04-03,3,3\n"
    )
    data_b = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97111,2020-04-01,,\n"
        "97111,2020-04-02,2,\n"
        "97444,2020-04-04,4,\n"
    )
    datasets = {"source_a": data_a, "source_b": data_b}

    combined, provenance = _build_data_and_provenance(
        {"m1": ["source_a", "source_b"], "m2": ["source_a", "source_b"]}, datasets
    )
    # 97444 only exists in source_b, which therefore supplies its m1 value.
    assert combined.loc["97444", "m1"].dropna().tolist() == [4]
    assert provenance.loc["97444", "m1"].dropna().tolist() == ["source_b"]
    # 97444 has no m2 data anywhere, so both data and provenance are empty.
    assert combined.loc["97444", "m2"].dropna().tolist() == []
    assert provenance.loc["97444", "m2"].dropna().tolist() == []
def test_build_data_and_provenance_override():
    """Check which source _build_data_and_provenance picks for overlapping timeseries.

    Renamed from ``test_build_timeseries_override``: that name duplicated an earlier
    test in this module, so pytest only collected one of the two functions.
    """
    data_a = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n"
        "97123,2020-04-01,1,\n"
        "97123,2020-04-02,,\n"
        "97123,2020-04-03,3,3"
    )
    data_b = read_csv_and_index_fips_date(
        "fips,date,m1,m2\n" "97123,2020-04-01,,\n" "97123,2020-04-02,2,\n"
    )
    datasets = {"source_a": data_a, "source_b": data_b}

    # The combined m1 timeseries is copied from the timeseries in source_b; source_a is not used for m1
    combined, provenance = _build_data_and_provenance(
        {"m1": ["source_a", "source_b"]},
        datasets,
    )
    assert combined.loc["97123", "m1"].replace({np.nan: None}).tolist() == [None, 2, None]
    assert provenance.loc["97123", "m1"].replace({np.nan: None}).tolist() == [
        None,
        "source_b",
        None,
    ]

    # The combined m1 timeseries is the highest priority value for each date; source_b is higher priority for
    # both 2020-04-01 and 2020-04-02.
    combined, provenance = _build_data_and_provenance({"m1": ["source_b", "source_a"]}, datasets)
    assert combined.loc["97123", "m1"].replace({np.nan: None}).tolist() == [1, None, 3]
    assert provenance.loc["97123", "m1"].replace({np.nan: None}).tolist() == [
        "source_a",
        None,
        "source_a",
    ]
def test_make_latest_from_timeseries_simple():
    """latest_values picks the most recent non-null value for each metric."""
    dataset = TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,state,country,date,aggregate_level,m1,m2\n"
            "97123,Smith County,ZZ,USA,2020-04-01,county,1,\n"
            "97123,Smith County,ZZ,USA,2020-04-02,county,,2\n"
        ).reset_index()
    )
    # m1 comes from 04-01 (its only observation); m2 from 04-02.
    latest = dataset.latest_values()[["fips", "m1", "m2"]]
    assert to_dict(["fips"], latest) == {"97123": {"m1": 1, "m2": 2}}
def test_melt_provenance_multiple_sources():
    """Distinct sources within one timeseries are joined with ';' and logged."""
    wide = read_csv_and_index_fips_date(
        "fips,date,cases,recovered\n"
        "97111,2020-04-01,source_a,source_b\n"
        "97111,2020-04-02,source_x,\n"
        "97222,2020-04-01,source_c,\n"
    )
    with structlog.testing.capture_logs() as logs:
        long = provenance_wide_metrics_to_series(wide, structlog.get_logger())

    # The 97111 cases series mixes source_a and source_x, triggering a warning.
    assert [log["event"] for log in logs] == ["Multiple rows for a timeseries"]

    assert long.to_dict() == {
        ("97111", "cases"): "source_a;source_x",
        ("97111", "recovered"): "source_b",
        ("97222", "cases"): "source_c",
    }
def test_melt_provenance():
    """A wide provenance frame with one source per timeseries melts without warnings."""
    wide = read_csv_and_index_fips_date(
        "fips,date,cases,recovered\n"
        "97111,2020-04-01,source_a,source_b\n"
        "97111,2020-04-02,source_a,\n"
        "97222,2020-04-01,source_c,\n"
    )
    with structlog.testing.capture_logs() as logs:
        long = provenance_wide_metrics_to_series(wide, structlog.get_logger())

    # Every timeseries has a single consistent source, so nothing is logged.
    assert logs == []

    assert long.to_dict() == {
        ("97111", "cases"): "source_a",
        ("97111", "recovered"): "source_b",
        ("97222", "cases"): "source_c",
    }
Example #13
0
def test_multi_region_to_from_timeseries():
    """TimeseriesDataset -> MultiRegion -> TimeseriesDataset preserves the data frame."""
    ts = timeseries.TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,aggregate_level,date,m1,m2\n"
            "97111,Bar County,county,2020-04-02,2,\n"
            "97222,Foo County,county,2020-04-01,,10\n"
            "01,,state,2020-04-01,,20\n"
        ).reset_index()
    )
    multiregion = timeseries.MultiRegionTimeseriesDataset.from_timeseries_and_latest(
        ts, ts.latest_values_object()
    )
    # Apart from the added location_id column, the data is unchanged.
    without_location = multiregion.data.drop(columns=[CommonFields.LOCATION_ID])
    pd.testing.assert_frame_equal(ts.data, without_location)

    round_tripped = multiregion.to_timeseries()
    pd.testing.assert_frame_equal(
        ts.data, round_tripped.data.drop(columns=[CommonFields.LOCATION_ID])
    )
def test_make_latest_from_timeseries_dont_touch_county():
    """latest_values keeps each row's own county name; state rows have none."""
    dataset = TimeseriesDataset(
        read_csv_and_index_fips_date(
            "fips,county,state,country,date,aggregate_level,m1,m2\n"
            "95123,Smith Countyy,YY,USA,2020-04-01,county,1,\n"
            "97123,Smith Countzz,ZZ,USA,2020-04-01,county,2,\n"
            "97,,ZZ,USA,2020-04-01,state,3,\n"
        ).reset_index()
    )
    latest = dataset.latest_values()[["fips", "county", "m1", "m2"]]
    # The deliberately misspelled county names must pass through untouched.
    assert to_dict(["fips"], latest) == {
        "95123": {"m1": 1, "county": "Smith Countyy"},
        "97123": {"m1": 2, "county": "Smith Countzz"},
        "97": {"m1": 3},
    }