def test_multi_region_get_one_region():
    """get_one_region() returns only that region's dated rows plus its latest values."""
    dataset = timeseries.MultiRegionTimeseriesDataset.from_csv(
        io.StringIO(
            "location_id,county,aggregate_level,date,m1,m2\n"
            "iso1:us#fips:97111,Bar County,county,2020-04-02,2,\n"
            "iso1:us#fips:97222,Foo County,county,2020-04-01,,10\n"
            "iso1:us#fips:97111,Bar County,county,,3,\n"
            "iso1:us#fips:97222,Foo County,county,,,11\n"
        )
    )

    # Bar County: one dated row (the undated row feeds `latest`, not `data`).
    bar_county = dataset.get_one_region(Region.from_fips("97111"))
    assert to_dict(["date"], bar_county.data[["date", "m1", "m2"]]) == {
        pd.to_datetime("2020-04-02"): {"m1": 2}
    }
    assert bar_county.latest["m1"] == 3

    # Foo County: full row dict includes the static geo columns.
    foo_county = dataset.get_one_region(Region.from_fips("97222"))
    assert to_dict(["date"], foo_county.data) == {
        pd.to_datetime("2020-04-01"): {
            "m2": 10,
            "county": "Foo County",
            "fips": "97222",
            "location_id": "iso1:us#fips:97222",
            "aggregate_level": "county",
        }
    }
    assert foo_county.latest["m2"] == 11
def test_transform():
    """CovidDataScraperTransformer keeps recognized county/state rows and logs a
    drop event for each group of rows with an unexpected locationID."""
    with structlog.testing.capture_logs() as logs:
        xform = update_covid_data_scraper.CovidDataScraperTransformer.make_with_data_root(
            update_covid_data_scraper.DATA_ROOT,
            structlog.get_logger(),
        )
        # Feed the transformer in-memory CSV instead of the file on disk.
        xform.timeseries_csv_local_path = StringIO(
            "locationID,county,country,state,level,cases,deaths,tested,date\n"
            "iso1:us#iso2:us-ak#fips:02013,Aleutians East Borough,United States,Alaska,county,10,1,100,2020-06-01\n"
            "iso1:us#iso2:us-ak#fips:02013,Aleutians East Borough,United States,Alaska,county,11,1,110,2020-06-02\n"
            "iso1:us#iso2:us-ak,,United States,Alaska,state,20,2,200,2020-06-01\n"
            "iso1:us#iso2:us-ak#(unassigned),,United States,Alaska,state,2000,200,20000,2020-06-01\n"
            "iso1:us#iso2:us-ak#(unassigned),,United States,Alaska,county,2000,200,20000,2020-06-01\n"
        )
        result = xform.transform()

    expected = pd.read_csv(
        StringIO(
            "country,county,state,fips,aggregate_level,date,cases,deaths,negative_tests\n"
            "USA,Aleutians East Borough,AK,02013,county,2020-06-01,10,1,90\n"
            "USA,Aleutians East Borough,AK,02013,county,2020-06-02,11,1,99\n"
            "USA,,AK,02,state,2020-06-01,20,2,180"
        ),
        dtype={CommonFields.FIPS: str},
        low_memory=False,
        parse_dates=[CommonFields.DATE],
    )
    assert to_dict(["fips", "date"], result) == to_dict(["fips", "date"], expected)
    assert [entry["event"] for entry in logs] == [
        "Dropping county rows with unexpected locationID",
        "Dropping state rows with unexpected locationID",
        UNEXPECTED_COLUMNS_MESSAGE,
    ]
def test_transform_icu_greater_than_hospitalized():
    """An ICU count larger than hospitalized is dropped and a mismatch warning logged."""
    source = common_df.read_csv(
        StringIO(
            "date,state,positive,negative,fips,pending,inIcuCurrently,hospitalizedCurrently\n"
            "20200401,TX,10,1000,48,,10,100\n"
            "20200402,TX,11,1100,48,,1500,150\n"),
        set_index=False,
    )
    with structlog.testing.capture_logs() as logs:
        transformed = update_covid_tracking_data.transform(source)

    # 2020-04-02 has icu 1500 > hospitalized 150, so current_icu is blanked.
    expected = common_df.read_csv(
        StringIO(
            "date,state,country,aggregate_level,positive_tests,negative_tests,fips,current_icu,current_hospitalized\n"
            "2020-04-01,TX,USA,state,10,1000,48,10,100\n"
            "2020-04-02,TX,USA,state,11,1100,48,,150\n"),
        set_index=False,
    )
    assert to_dict(["fips", "date"], transformed) == to_dict(["fips", "date"], expected)
    assert [entry["event"] for entry in logs] == [
        ICU_HOSPITALIZED_MISMATCH_WARNING_MESSAGE,
        helpers.MISSING_COLUMNS_MESSAGE,
    ]
def test_strip_whitespace():
    """strip_whitespace trims leading/trailing spaces from string cells,
    leaving empty and numeric values untouched."""
    raw = pd.read_csv(
        StringIO("""col_a,col_b,col_c,col_num
,b1,c1,1
a2, b2,c2,2
,b3," c3 ",3
"""))
    stripped = strip_whitespace(raw)
    expected = pd.read_csv(
        StringIO("""col_a,col_b,col_c,col_num
,b1,c1,1
a2,b2,c2,2
,b3,c3,3
"""))
    assert to_dict(["col_c"], stripped) == to_dict(["col_c"], expected)
def test_inference_ok_with_5_days_cases_changed():
    """Whitelist requires 6 days of cases (5 day-over-day deltas) for inference_ok.

    5 days with cases data isn't enough to make inference_ok; 6 days are
    needed so that there are 5 days with a *delta* relative to a previous day.
    """
    csv_buf = io.StringIO(
        "location_id,country,state,county,aggregate_level,date,cases,deaths\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-01,100,1\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-02,200,2\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-03,300,3\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-04,400,4\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-05,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-01,100,1\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-02,200,2\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-03,300,3\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-04,400,4\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-05,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,2020-04-06,600,6\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,,500,5\n"
        "iso1:us#fips:97222,US,ZZ,Foo County,county,,100,1\n")
    dataset = MultiRegionTimeseriesDataset.from_csv(csv_buf)
    whitelist = WhitelistGenerator().generate_whitelist(dataset)

    # Bar County has only 5 dated rows -> not enough deltas; Foo County has 6.
    assert to_dict(["fips"], whitelist) == {
        "97111": {
            "state": "ZZ",
            "county": "Bar County",
            "inference_ok": False
        },
        "97222": {
            "state": "ZZ",
            "county": "Foo County",
            "inference_ok": True
        },
    }
def test_fix_missing_pos():
    """synthesize_test_metrics fills a missing positive_tests value from
    total - negative and records the synthesis in the provenance series."""
    source = read_csv_and_index_fips_date(
        "fips,date,negative_tests,positive_tests,total_tests,cases\n"
        "97123,2020-04-01,9,1,10,1\n"
        "97123,2020-04-02,17,,20,2\n"
        "97123,2020-04-03,26,4,30,4\n").reset_index()

    out_df, provenance = CovidCountyDataDataSource.synthesize_test_metrics(
        source)

    # 2020-04-02 positive_tests is synthesized: 20 total - 17 negative = 3.
    assert to_dict([CommonFields.FIPS, CommonFields.DATE], out_df) == {
        ("97123", pd.to_datetime("2020-04-01")): {
            CommonFields.NEGATIVE_TESTS: 9,
            CommonFields.POSITIVE_TESTS: 1,
            CommonFields.TOTAL_TESTS: 10,
            CommonFields.CASES: 1,
        },
        ("97123", pd.to_datetime("2020-04-02")): {
            CommonFields.NEGATIVE_TESTS: 17,
            CommonFields.POSITIVE_TESTS: 3,
            CommonFields.TOTAL_TESTS: 20,
            CommonFields.CASES: 2,
        },
        ("97123", pd.to_datetime("2020-04-03")): {
            CommonFields.NEGATIVE_TESTS: 26,
            CommonFields.POSITIVE_TESTS: 4,
            CommonFields.TOTAL_TESTS: 30,
            CommonFields.CASES: 4,
        },
    }
    assert provenance.to_dict() == {
        ("97123", CommonFields.NEGATIVE_TESTS): "none",
        ("97123", CommonFields.POSITIVE_TESTS): "none;missing_pos",
    }
def test_update_nytimes_virgin_islands():
    """NYTimesUpdater.transform adds country/state codes and renames
    "Virgin Islands" to the canonical "U.S. Virgin Islands"."""
    updater = NYTimesUpdater.make_with_data_root(DATA_ROOT)
    data = common_df.read_csv(
        io.StringIO(
            "county,state_full_name,aggregate_level,fips,date,cases,deaths\n"
            ",Virgin Islands,state,78,2020-07-31,10,1\n")).reset_index()
    results = updater.transform(data)
    expected = common_df.read_csv(
        io.StringIO(
            "country,county,state_full_name,state,aggregate_level,fips,date,cases,deaths\n"
            "USA,,U.S. Virgin Islands,VI,state,78,2020-07-31,10,1\n")
    ).reset_index()
    # Bug fix: this previously passed the input `data` to to_dict, so the
    # transform output was computed but never actually checked.
    results_dict = common_test_helpers.to_dict(["state", "state_full_name"], results)
    expected_dict = common_test_helpers.to_dict(["state", "state_full_name"], expected)
    assert results_dict == expected_dict
def test_remove_duplicate_city_data():
    """remove_duplicate_city_data keeps one row per city (copying city into
    county, patching New York City) and leaves plain county rows alone."""
    raw = pd.read_csv(
        StringIO("city,county,state,fips,date,metric_a\n"
                 "Smithville,,ZZ,97123,2020-03-23,march23-removed\n"
                 "Smithville,,ZZ,97123,2020-03-22,march22-kept\n"
                 "New York City,,ZZ,97324,2020-03-22,march22-ny-patched\n"
                 ",North County,ZZ,97001,2020-03-22,county-not-touched\n"
                 ",North County,ZZ,97001,2020-03-23,county-not-touched\n"))
    deduped = update_covid_data_scraper.remove_duplicate_city_data(raw)
    expected = pd.read_csv(
        StringIO(
            "city,county,state,fips,date,metric_a\n"
            "Smithville,Smithville,ZZ,97123,2020-03-22,march22-kept\n"
            "New York City,New York,ZZ,97324,2020-03-22,march22-ny-patched\n"
            ",North County,ZZ,97001,2020-03-22,county-not-touched\n"
            ",North County,ZZ,97001,2020-03-23,county-not-touched\n"))
    assert to_dict(["fips", "date"], deduped) == to_dict(["fips", "date"], expected)
def test_bad_county():
    """A county name that doesn't match FIPS data yields no rows and a log event."""
    with structlog.testing.capture_logs() as logs, requests_mock.Mocker() as mocker:
        mocker.get(
            SOURCE_URL,
            text="""foo,bar
date,county_name,vents
04/01,Not A County,100
""",
        )
        copier = CsvCopy.make_with_data_root(DATA_ROOT)
        result = copier.transform()

    # The unmatched row is dropped entirely.
    assert to_dict([CommonFields.FIPS, CommonFields.DATE], result) == {}
    assert [entry["event"] for entry in logs] == [
        "Fetching URL",
        "Imported county name not found in FIPS data",
    ]
def test_bad_float():
    """A non-numeric cell (spreadsheet #REF! error) is dropped with a log event."""
    with structlog.testing.capture_logs() as logs, requests_mock.Mocker() as mocker:
        mocker.get(
            SOURCE_URL,
            text="""foo,bar
date,county_name,vents
04/01,Carson City,#REF!
""",
        )
        copier = CsvCopy.make_with_data_root(DATA_ROOT)
        result = copier.transform()

    # The row survives but the unparseable vents value is gone.
    assert to_dict([CommonFields.FIPS, CommonFields.DATE], result) == {
        ("32510", "2020-04-01"): {
            "county": "Carson City"
        },
    }
    assert [entry["event"] for entry in logs] == ["Fetching URL", "Dropping value not a float"]
def test_multi_region_get_counties():
    """get_counties keeps county rows on/after the cutoff, dropping state rows
    and undated (latest) rows."""
    dataset = timeseries.MultiRegionTimeseriesDataset.from_csv(
        io.StringIO(
            "location_id,county,aggregate_level,date,m1,m2\n"
            "iso1:us#fips:97111,Bar County,county,2020-04-02,2,\n"
            "iso1:us#fips:97111,Bar County,county,2020-04-03,3,\n"
            "iso1:us#fips:97222,Foo County,county,2020-04-01,,10\n"
            "iso1:us#fips:97,Great State,state,2020-04-01,1,2\n"
            "iso1:us#fips:97111,Bar County,county,,3,\n"
            "iso1:us#fips:97222,Foo County,county,,,10\n"
            "iso1:us#fips:97,Great State,state,,1,2\n"
        )
    )
    counties = dataset.get_counties(after=pd.to_datetime("2020-04-01"))

    # Foo County's only dated row is on 2020-04-01, so it is excluded too.
    assert to_dict(["fips", "date"], counties.data[["fips", "date", "m1"]]) == {
        ("97111", pd.to_datetime("2020-04-02")): {"m1": 2},
        ("97111", pd.to_datetime("2020-04-03")): {"m1": 3},
    }
def test_skip_gaps_in_cases_and_deaths_metrics():
    """Gaps (NaNs) in the cases/deaths series prevent inference_ok."""
    csv_buf = io.StringIO(
        "location_id,country,state,county,aggregate_level,date,cases,deaths\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-01,10,1\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-02,,2\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-03,30,\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,2020-04-04,40,4\n"
        "iso1:us#fips:97111,US,ZZ,Bar County,county,,40,4\n")
    dataset = MultiRegionTimeseriesDataset.from_csv(csv_buf)
    whitelist = WhitelistGenerator().generate_whitelist(dataset)

    assert to_dict(["fips"], whitelist) == {
        "97111": {
            "state": "ZZ",
            "county": "Bar County",
            "inference_ok": False
        },
    }
def test_nha_basic():
    """Happy path: county names map to FIPS codes and values pass through."""
    with structlog.testing.capture_logs() as logs, requests_mock.Mocker() as mocker:
        mocker.get(
            SOURCE_URL,
            text="""foo,bar
date,county_name,vents
04/01,Carson City,300
04/02,Clark,200
""",
        )
        copier = CsvCopy.make_with_data_root(DATA_ROOT)
        result = copier.transform()

    # "Clark" is normalized to "Clark County"; "Carson City" is kept as-is.
    assert to_dict([CommonFields.FIPS, CommonFields.DATE], result) == {
        ("32510", "2020-04-01"): {
            "county": "Carson City",
            "vents": "300"
        },
        ("32003", "2020-04-02"): {
            "county": "Clark County",
            "vents": "200"
        },
    }
    assert [entry["event"] for entry in logs] == ["Fetching URL"]