def test_remove_county_backfill():

    backfill = [("48113", "2020-08-17", 500)]
    data_buf = io.StringIO("fips,state,date,aggregate_level,cases\n"
                           f"48112,TX,2020-08-17,county,1700\n"
                           f"48112,TX,2020-08-18,county,1700\n"
                           f"48113,TX,2020-08-16,county,1000\n"
                           f"48113,TX,2020-08-17,county,1600\n"
                           f"48113,TX,2020-08-18,county,1700\n"
                           f"48,TX,2020-08-16,state,2600\n"
                           f"48,TX,2020-08-17,state,3600\n"
                           f"48,TX,2020-08-18,state,4700\n")
    data = common_df.read_csv(data_buf, set_index=False)
    results = update_nytimes_data.remove_county_backfilled_cases(
        data, backfill)

    # Days before 8/17 should be the same.
    # Days on/after 8/17 should have 500 fewer cases.
    data_buf = io.StringIO("fips,state,date,aggregate_level,cases\n"
                           f"48112,TX,2020-08-17,county,1700\n"
                           f"48112,TX,2020-08-18,county,1700\n"
                           f"48113,TX,2020-08-16,county,1000\n"
                           f"48113,TX,2020-08-17,county,1100\n"
                           f"48113,TX,2020-08-18,county,1200\n"
                           f"48,TX,2020-08-16,state,2600\n"
                           f"48,TX,2020-08-17,state,3100\n"
                           f"48,TX,2020-08-18,state,4200\n")
    expected = common_df.read_csv(data_buf, set_index=False)

    pd.testing.assert_frame_equal(results, expected)
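A minimal pandas sketch of the adjustment this test expects, for orientation only: it is not the project's remove_county_backfilled_cases implementation, and the helper name is made up.

import pandas as pd

def _subtract_backfill_sketch(df: pd.DataFrame, backfill) -> pd.DataFrame:
    # For each (county_fips, date, count), remove the backfilled count from the
    # county and from its containing state on and after that date.
    df = df.copy()
    for county_fips, date, count in backfill:
        on_or_after = df["date"] >= pd.Timestamp(date)
        county_rows = (df["fips"] == county_fips) & on_or_after
        state_rows = (
            (df["fips"] == county_fips[:2])
            & (df["aggregate_level"] == "state")
            & on_or_after
        )
        df.loc[county_rows | state_rows, "cases"] -= count
    return df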
Example #2
def test_query_source_url():
    variable = ccd_helpers.ScraperVariable(
        variable_name="total_vaccine_completed",
        measurement="cumulative",
        unit="people",
        provider="cdc",
        common_field=CommonFields.VACCINATIONS_COMPLETED,
    )

    input_data = build_can_scraper_dataframe({variable: [10, 20, 30]},
                                             source_url="http://foo.com")
    data = ccd_helpers.CanScraperLoader(input_data)
    results, tags = data.query_multiple_variables([variable])

    expected_data_buf = io.StringIO(
        "fips,      date,aggregate_level,vaccinations_completed\n"
        "  36,2021-01-01,          state,                    10\n"
        "  36,2021-01-02,          state,                    20\n"
        "  36,2021-01-03,          state,                    30\n".replace(
            " ", ""))
    expected = common_df.read_csv(expected_data_buf, set_index=False)
    pd.testing.assert_frame_equal(expected, results)

    expected_tag_buf = io.StringIO(
        "fips,      date,              variable,       content\n"
        "  36,2021-01-01,vaccinations_completed,http://foo.com\n"
        "  36,2021-01-02,vaccinations_completed,http://foo.com\n"
        "  36,2021-01-03,vaccinations_completed,http://foo.com\n".replace(
            " ", ""))
    expected = common_df.read_csv(expected_tag_buf, set_index=False)
    pd.testing.assert_frame_equal(expected, tags, check_like=True)
def test_remove_ma_county_cases():
    data_buf = io.StringIO("fips,state,date,aggregate_level,cases\n"
                           "25025,MA,2020-08-10,county,1000\n"
                           "25025,MA,2020-08-11,county,1000\n"
                           "25025,MA,2020-08-12,county,1000\n"
                           "25025,MA,2020-08-13,county,1000\n"
                           "25025,MA,2020-08-14,county,1025\n"
                           "25025,MA,2020-08-19,county,1030\n"
                           "25025,MA,2020-08-20,county,1030\n"
                           "25,MA,2020-08-11,state,1000\n"
                           "25,MA,2020-08-12,state,1000\n"
                           "25,MA,2020-08-13,state,1000\n")
    data = common_df.read_csv(data_buf, set_index=False)

    results = update_nytimes_data._remove_ma_county_zeroes_data(data)
    results = results.sort_values(["fips", "date"]).reset_index(drop=True)
    # State data should be untouched
    # MA County data on/before 8/11 should be untouched
    # MA County data that changes after 8/11 should be picked up.
    data_buf = io.StringIO("fips,state,date,aggregate_level,cases\n"
                           "25,MA,2020-08-11,state,1000\n"
                           "25,MA,2020-08-12,state,1000\n"
                           "25,MA,2020-08-13,state,1000\n"
                           "25025,MA,2020-08-10,county,1000\n"
                           "25025,MA,2020-08-11,county,1000\n"
                           "25025,MA,2020-08-14,county,1025\n"
                           "25025,MA,2020-08-19,county,1030\n"
                           "25025,MA,2020-08-20,county,1030\n")
    expected = common_df.read_csv(data_buf, set_index=False)
    pd.testing.assert_frame_equal(results, expected)
def test_transform_icu_greater_than_hospitalized():
    in_df = common_df.read_csv(
        StringIO(
            "date,state,positive,negative,fips,pending,inIcuCurrently,hospitalizedCurrently\n"
            "20200401,TX,10,1000,48,,10,100\n"
            "20200402,TX,11,1100,48,,1500,150\n"),
        set_index=False,
    )
    with structlog.testing.capture_logs() as logs:
        out_df = update_covid_tracking_data.transform(in_df)

    expected_df = common_df.read_csv(
        StringIO(
            "date,state,country,aggregate_level,positive_tests,negative_tests,fips,current_icu,current_hospitalized\n"
            "2020-04-01,TX,USA,state,10,1000,48,10,100\n"
            "2020-04-02,TX,USA,state,11,1100,48,,150\n"),
        set_index=False,
    )

    assert to_dict(["fips", "date"], out_df) == to_dict(["fips", "date"],
                                                        expected_df)

    assert [l["event"] for l in logs] == [
        ICU_HOSPITALIZED_MISMATCH_WARNING_MESSAGE,
        helpers.MISSING_COLUMNS_MESSAGE,
    ]
Example #5
def csv_diff(csv_path_left, csv_path_right):
    """Compare 2 CSV files."""
    df_l = common_df.read_csv(csv_path_left)
    df_r = common_df.read_csv(csv_path_right)

    differ_l = DatasetDiff.make(df_l)
    differ_r = DatasetDiff.make(df_r)
    differ_l.compare(differ_r)

    print(f"File: {csv_path_left}")
    print(differ_l)
    print(f"File: {csv_path_right}")
    print(differ_r)
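Typical usage is just pointing the command at two exported CSV files; the paths below are placeholders.

csv_diff("data/timeseries-old.csv", "data/timeseries-new.csv")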
    def load_state_and_county_data(self) -> pd.DataFrame:
        """Loads state and county data in one dataset, renaming fields to common field names. """
        _logger.info("Updating NYTimes dataset.")
        # Able to use common_df here because the NYTimes raw files include fips and date.
        county_data = common_df.read_csv(self.county_path).reset_index()
        county_data = helpers.rename_fields(county_data, Fields, set(),
                                            _logger)
        county_data[CommonFields.AGGREGATE_LEVEL] = "county"

        # Able to use common_df here because the NYTimes raw files include fips and date.
        state_data = common_df.read_csv(self.state_path).reset_index()
        state_data = helpers.rename_fields(state_data, Fields, set(), _logger)
        state_data[CommonFields.AGGREGATE_LEVEL] = "state"

        return pd.concat([county_data, state_data])
 def local(cls):
     data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
     input_path = data_root / cls.DATA_PATH
     data = common_df.read_csv(input_path).reset_index()
     data, provenance = cls.synthesize_test_metrics(data)
     # Column names are already CommonFields so don't need to rename
     return cls(data, provenance=provenance)
def test_icu_utilization_metric():

    data = io.StringIO(
        "date,fips,current_icu,current_icu_total,icu_beds\n"
        "2020-08-11,36,20,40,40\n"
        "2020-08-12,36,15,30,40\n"
        "2020-08-13,36,20,,40\n"
    )
    data = common_df.read_csv(data, set_index=False).set_index(CommonFields.DATE)
    estimated_icu = pd.Series([30, 30, np.nan], index=data.index)

    icu_data = ICUMetricData(data, estimated_icu, {}, 0.0, require_recent_data=False)

    metrics, details = icu_headroom.calculate_icu_utilization_metric(icu_data)

    expected_metric = pd.Series([1.0, 0.6, np.nan], index=data.index)

    expected_details = can_api_v2_definition.ICUHeadroomMetricDetails(
        currentIcuCovidMethod=icu_headroom.CovidPatientsMethod.ACTUAL,
        currentIcuCovid=15,
        currentIcuNonCovidMethod=icu_headroom.NonCovidPatientsMethod.ACTUAL,
        currentIcuNonCovid=15,
    )

    pd.testing.assert_series_equal(metrics, expected_metric)
    assert details == expected_details
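As a sanity check, the expected series can be reproduced by hand from the inputs. The formula below is inferred from this test's numbers (with the decomp factor of 0.0 passed above), not quoted from the library.

non_covid = data.current_icu_total - data.current_icu     # 40-20=20, 30-15=15, NaN
metric = data.current_icu / (data.icu_beds - non_covid)   # 20/20=1.0, 15/25=0.6, NaN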
Example #9
def test_icu_metric_data_with_estimated_from_decomp_and_latest_total_beds():
    latest = {
        CommonFields.ICU_TYPICAL_OCCUPANCY_RATE: 0.5,
        CommonFields.ICU_BEDS: 50
    }
    data = io.StringIO("date,fips,current_icu,current_icu_total,icu_beds\n"
                       "2020-08-10,36,,,\n"
                       "2020-08-11,35,,,\n")
    data = common_df.read_csv(data,
                              set_index=False).set_index(CommonFields.DATE)
    estimated_icu = pd.Series([20, 30], index=data.index)

    icu_data = ICUMetricData(data,
                             estimated_icu,
                             latest,
                             0.0,
                             require_recent_data=False)
    assert not icu_data.actual_current_icu_covid

    non_covid, source = icu_data.current_icu_non_covid_with_source
    pd.testing.assert_series_equal(non_covid,
                                   pd.Series([25.0, 25.0], index=data.index))
    assert source is icu_headroom_metric.NonCovidPatientsMethod.ESTIMATED_FROM_TYPICAL_UTILIZATION

    covid, source = icu_data.current_icu_covid_with_source
    pd.testing.assert_series_equal(covid, estimated_icu)
    assert source is icu_headroom_metric.CovidPatientsMethod.ESTIMATED
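With no actuals at all, the non-COVID estimate here appears to come from the latest values alone; the arithmetic below is inferred from the asserted 25.0, not quoted from the library.

estimated_non_covid = (latest[CommonFields.ICU_BEDS]
                       * latest[CommonFields.ICU_TYPICAL_OCCUPANCY_RATE])  # 50 * 0.5 = 25.0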
Example #10
def test_icu_metric_data_with_all_timeseries_actuals():

    data = io.StringIO("date,fips,current_icu,current_icu_total,icu_beds\n"
                       "2020-08-10,36,10,25,50\n"
                       "2020-08-11,35,20,40,50\n")
    data = common_df.read_csv(data,
                              set_index=False).set_index(CommonFields.DATE)
    estimated_icu = pd.Series([20, 30], index=data.index)

    icu_data = ICUMetricData(data,
                             estimated_icu, {},
                             0.0,
                             require_recent_data=False)
    pd.testing.assert_series_equal(icu_data.actual_current_icu_covid,
                                   data.current_icu)
    pd.testing.assert_series_equal(icu_data.estimated_current_icu_covid,
                                   estimated_icu)
    pd.testing.assert_series_equal(icu_data.actual_current_icu_total,
                                   data.current_icu_total)
    pd.testing.assert_series_equal(icu_data.total_icu_beds, data.icu_beds)

    non_covid, source = icu_data.current_icu_non_covid_with_source
    pd.testing.assert_series_equal(non_covid,
                                   pd.Series([15, 20], index=data.index))
    assert source is icu_headroom_metric.NonCovidPatientsMethod.ACTUAL

    covid, source = icu_data.current_icu_covid_with_source
    pd.testing.assert_series_equal(covid, data.current_icu)
    assert source is icu_headroom_metric.CovidPatientsMethod.ACTUAL
Example #11
 def load_csv(cls, path_or_buf: Union[pathlib.Path, TextIO]):
     df = common_df.read_csv(path_or_buf)
     # TODO: common_df.read_csv sets the index of the dataframe to be fips, date, however
     # most of the calling code expects fips and date to not be in an index.
     # In the future, it would be good to standardize around index fields.
     df = df.reset_index()
     return cls(df)
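The TODO above explains why so many snippets in these examples immediately call .reset_index() or pass set_index=False; a quick illustration (some_csv_path is a placeholder):

df = common_df.read_csv(some_csv_path)                            # indexed by (fips, date)
flat = df.reset_index()                                           # fips and date back as columns
flat_direct = common_df.read_csv(some_csv_path, set_index=False)  # never indexed at all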
Example #12
def test_get_timeseries():
    df = common_df.read_csv(
        StringIO(
            "fips,date,cases,total_tests\n" "06045,2020-04-01,234,\n" "45123,2020-04-02,456,\n"
        )
    )

    EMPTY_TS = pd.Series([], dtype="float64")

    # Check getting by CommonFields.CASES and literal str
    expected_index = pd.MultiIndex.from_tuples(
        [("06045", date(2020, 4, 1)), ("45123", date(2020, 4, 2))], names=["fips", "date"]
    )
    expected_ts = pd.Series([234, 456], name="cases", index=expected_index)
    pd.testing.assert_series_equal(
        common_df.get_timeseries(df, CommonFields.CASES, EMPTY_TS), expected_ts
    )
    pd.testing.assert_series_equal(common_df.get_timeseries(df, "cases", EMPTY_TS), expected_ts)

    # Check that getting a metric that doesn't have any real values does not return EMPTY_TS.
    expected_ts = pd.Series([np.nan, np.nan], name=CommonFields.TOTAL_TESTS, index=expected_index)
    pd.testing.assert_series_equal(
        common_df.get_timeseries(df, CommonFields.TOTAL_TESTS, EMPTY_TS), expected_ts
    )

    # Check that getting a metric that isn't found returns EMPTY_TS
    assert common_df.get_timeseries(df, CommonFields.DEATHS, EMPTY_TS) is EMPTY_TS
    assert common_df.get_timeseries(df, "deaths", EMPTY_TS) is EMPTY_TS
 def local(cls) -> "CDSDataset":
     data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
     df = common_df.read_csv(data_root / cls.DATA_PATH).reset_index()
     df[CommonFields.POSITIVE_TESTS] = df[CommonFields.CASES]
     # Column names are already CommonFields so don't need to rename, but do need to drop extra
     # columns that will fail NYC aggregation.
     return cls(cls._drop_unlisted_fields(df))
Example #14
def test_query_multiple_variables():
    variable = ccd_helpers.ScraperVariable(
        variable_name="total_vaccine_completed",
        measurement="cumulative",
        unit="people",
        provider="cdc",
        common_field=CommonFields.VACCINATIONS_COMPLETED,
    )
    not_included_variable = ccd_helpers.ScraperVariable(
        variable_name="total_vaccine_completed",
        measurement="cumulative",
        unit="people",
        # Different provider, so query shouldn't return it
        provider="hhs",
        common_field=CommonFields.VACCINATIONS_COMPLETED,
    )

    input_data = build_can_scraper_dataframe({
        variable: [10, 20, 30],
        not_included_variable: [10, 20, 40]
    })
    data = ccd_helpers.CanScraperLoader(input_data)
    results, _ = data.query_multiple_variables([variable])

    expected_buf = io.StringIO(
        "fips,date,aggregate_level,vaccinations_completed\n"
        f"36,2021-01-01,state,10\n"
        f"36,2021-01-02,state,20\n"
        f"36,2021-01-03,state,30\n")
    expected = common_df.read_csv(expected_buf, set_index=False)
    pd.testing.assert_frame_equal(expected, results)
Example #15
 def make_dataset(cls) -> timeseries.MultiRegionDataset:
     """Default implementation of make_dataset that loads timeseries data from a CSV."""
     assert cls.COMMON_DF_CSV_PATH, f"No path in {cls}"
     data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
     input_path = data_root / cls.COMMON_DF_CSV_PATH
     data = common_df.read_csv(input_path, set_index=False)
     data = cls._check_data(data)
     return MultiRegionDataset.from_fips_timeseries_df(
         data).add_provenance_all(cls.SOURCE_NAME)
def test_update_nytimes_virgin_islands():

    updater = NYTimesUpdater.make_with_data_root(DATA_ROOT)
    data = common_df.read_csv(
        io.StringIO(
            "county,state_full_name,aggregate_level,fips,date,cases,deaths\n"
            ",Virgin Islands,state,78,2020-07-31,10,1\n")).reset_index()
    results = updater.transform(data)

    expected = common_df.read_csv(
        io.StringIO(
            "country,county,state_full_name,state,aggregate_level,fips,date,cases,deaths\n"
            "USA,,U.S. Virgin Islands,VI,state,78,2020-07-31,10,1\n")
    ).reset_index()
    results_dict = common_test_helpers.to_dict(["state", "state_full_name"],
                                               results)
    expected_dict = common_test_helpers.to_dict(["state", "state_full_name"],
                                                expected)
    assert results_dict == expected_dict
 def local(cls):
     """
     This currently returns an empty DataFrame because _rename_to_common_fields restricts
     output to only columns found in INDEX_FIELD_MAP and COMMON_FIELD_MAP. This is not yet
     implemented because it is not required for this dataset to be merged via the combined
     dataset pathway. Specifically, which quantiles to persist has not been finalized and as such
     they are not included in CommonFields and would be filtered out regardless.
     """
     data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
     input_path = data_root / cls.DATA_PATH
     data = common_df.read_csv(input_path, set_index=False)
     return cls(cls._rename_to_common_fields(data))
Example #18
def test_read_csv_no_index():
    input_csv = """fips,date,cases
06045,2020-04-01,234
45123,2020-04-02,456
    """

    with temppathlib.NamedTemporaryFile("w+") as tmp:
        tmp.path.write_text(input_csv)
        df = common_df.read_csv(tmp.path, set_index=False)

    expected_first_row = ["06045", pd.Timestamp("2020-04-01 00:00:00"), 234]
    assert list(df.iloc[0]) == expected_first_row
def test_remove_trailing_zeros():

    data_buf = io.StringIO("fips,date,test_positivity_7d\n"
                           f"48112,2020-08-17,0.5\n"
                           f"48112,2020-08-18,0.6\n"
                           f"48112,2020-08-19,0.0\n"
                           f"48113,2020-08-16,0.0\n"
                           f"48113,2020-08-17,0.0\n"
                           f"48113,2020-08-18,0.0\n")
    data = common_df.read_csv(data_buf, set_index=False)
    results = update_cdc_test_data.remove_trailing_zeros(data)

    expected_buf = io.StringIO("fips,date,test_positivity_7d\n"
                               f"48112,2020-08-17,0.5\n"
                               f"48112,2020-08-18,0.6\n"
                               f"48112,2020-08-19,\n"
                               f"48113,2020-08-16,\n"
                               f"48113,2020-08-17,\n"
                               f"48113,2020-08-18,\n")
    expected = common_df.read_csv(expected_buf, set_index=False)
    pd.testing.assert_frame_equal(expected.sort_index(axis=1),
                                  results.sort_index(axis=1))
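A rough sketch of the per-fips trailing-zero blanking that the expected frame implies; this is illustrative only, not the actual update_cdc_test_data.remove_trailing_zeros, and the helper name is made up.

import numpy as np
import pandas as pd

def _blank_trailing_zeros(df: pd.DataFrame, column: str = "test_positivity_7d") -> pd.DataFrame:
    df = df.sort_values(["fips", "date"]).copy()

    def per_fips(values: pd.Series) -> pd.Series:
        nonzero = values.to_numpy() != 0
        # Position of the last non-zero value, or -1 if the series is all zeros.
        last_nonzero = int(np.nonzero(nonzero)[0].max()) if nonzero.any() else -1
        out = values.astype(float).copy()
        out.iloc[last_nonzero + 1:] = np.nan
        return out

    df[column] = df.groupby("fips", group_keys=False)[column].apply(per_fips)
    return df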
Example #20
def csv_diff(csv_path_or_rev_left, csv_path_right):
    """Compare 2 CSV files."""
    left_path = pathlib.Path(csv_path_or_rev_left)
    right_path = pathlib.Path(csv_path_right)

    if left_path.exists():
        left_data = left_path.read_bytes()
    else:
        repo = git.Repo(dataset_utils.REPO_ROOT)
        left_data = read_data_for_commit(repo, right_path,
                                         repo.commit(csv_path_or_rev_left))

    df_l = common_df.read_csv(BytesIO(left_data))
    df_r = common_df.read_csv(csv_path_right)

    differ_l = DatasetDiff.make(df_l)
    differ_r = DatasetDiff.make(df_r)
    differ_l.compare(differ_r)

    print(f"File: {csv_path_or_rev_left}")
    print(differ_l)
    print(f"File: {csv_path_right}")
    print(differ_r)
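Compared with the earlier csv_diff, the left argument here can also be a git revision, in which case the right-hand path is read at that commit. Both calls below use placeholder arguments.

csv_diff("data/timeseries-old.csv", "data/timeseries.csv")  # two files on disk
csv_diff("main", "data/timeseries.csv")                     # that file at rev main vs. the working copy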
def test_icu_metric_data_with_estimated_from_total_icu_actuals():
    latest = {}
    data = io.StringIO(
        "date,fips,current_icu,current_icu_total,icu_beds\n"
        "2020-08-10,36,,25,50\n"
        "2020-08-11,35,,40,50\n"
    )
    data = common_df.read_csv(data, set_index=False).set_index(CommonFields.DATE)
    estimated_icu = pd.Series([20, 30], index=data.index)

    icu_data = ICUMetricData(data, estimated_icu, latest, 0.0, require_recent_data=False)
    assert not icu_data.actual_current_icu_covid

    non_covid, source = icu_data.current_icu_non_covid_with_source
    pd.testing.assert_series_equal(non_covid, pd.Series([5, 10], index=data.index))
    assert source is icu_headroom.NonCovidPatientsMethod.ESTIMATED_FROM_TOTAL_ICU_ACTUAL

    covid, source = icu_data.current_icu_covid_with_source
    pd.testing.assert_series_equal(covid, estimated_icu)
    assert source is icu_headroom.CovidPatientsMethod.ESTIMATED
def test_remove_ct_cases(is_ct_county):
    backfill_records = [("09", "2020-07-24", 188)]
    if is_ct_county:
        fips = "09001"
    else:
        fips = "36061"

    data_buf = io.StringIO("fips,state,date,aggregate_level,cases\n"
                           f"{fips},CT,2020-07-23,county,1000\n"
                           f"{fips},CT,2020-07-24,county,1288\n"
                           f"{fips},CT,2020-07-25,county,1388\n")

    data = common_df.read_csv(data_buf)
    data = data.reset_index()

    results = update_nytimes_data.remove_state_backfilled_cases(
        data, backfill_records)

    if is_ct_county:
        expected_cases = pd.Series([1000, 1100, 1200], name="cases")
    else:
        expected_cases = pd.Series([1000, 1288, 1388], name="cases")

    pd.testing.assert_series_equal(expected_cases, results.cases)
Example #23
def test_all_columns_na():
    # MultiRegionDataset.from_csv drops columns with no real values so make a DataFrame
    # to pass to from_geodata_timeseries_df.
    ts_df = common_df.read_csv(
        io.StringIO(
            "location_id,date,positive_tests,total_tests\n"
            "iso1:us#iso2:tx,2020-04-01,,\n"
            "iso1:us#iso2:tx,2020-04-02,,\n"
            "iso1:us#iso2:tx,2020-04-04,,\n"
        ),
        set_index=False,
    )
    ts_df[CommonFields.POSITIVE_TESTS] = pd.NA
    ts = timeseries.MultiRegionDataset.from_geodata_timeseries_df(ts_df)
    methods = [
        DivisionMethod(
            DatasetName("method2"),
            CommonFields.POSITIVE_TESTS,
            CommonFields.TOTAL_TESTS,
            recent_days=1,
        ),
    ]
    with pytest.raises(test_positivity.NoRealTimeseriesValuesException):
        AllMethods.run(ts, methods, diff_days=1)
 def from_csv(
     path_or_buf: Union[pathlib.Path, TextIO]
 ) -> "MultiRegionTimeseriesDataset":
     return MultiRegionTimeseriesDataset.from_combined_dataframe(
         common_df.read_csv(path_or_buf, set_index=False))
Example #25
 def local(cls):
     data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
     input_path = data_root / cls.DATA_PATH
     data = common_df.read_csv(input_path).reset_index()
     return cls(cls._rename_to_common_fields(data))
Example #26
 def local(cls):
     data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
     input_path = data_root / cls.DATA_PATH
     data = common_df.read_csv(input_path).reset_index()
     # Column names are already CommonFields so don't need to rename
     return cls(data)
Example #27
 def _load_data(cls) -> pd.DataFrame:
     """Loads the CSV, override to inject data in a test."""
     assert cls.COMMON_DF_CSV_PATH, f"No path in {cls}"
     data_root = dataset_utils.LOCAL_PUBLIC_DATA_PATH
     input_path = data_root / cls.COMMON_DF_CSV_PATH
     return common_df.read_csv(input_path, set_index=False)
Example #28
def _build_metrics_df(content: str) -> pd.DataFrame:
    header = (
        "date,fips,caseDensity,testPositivityRatio,contactTracerCapacityRatio,"
        "infectionRate,infectionRateCI90,icuHeadroomRatio\n")
    data = io.StringIO(f"{header}\n{content}")
    return common_df.read_csv(data, set_index=False)
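An example call with made-up values, one per column in the header:

metrics_df = _build_metrics_df("2020-08-01,36,10.0,0.05,0.9,1.1,0.2,0.4\n")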
 def local(cls) -> "CovidTrackingDataSource":
     data = common_df.read_csv(cls.INPUT_PATH).reset_index()
     # Column names are already CommonFields so don't need to rename
     return cls(data, provenance=None)