def test_provenance():
    """Provenance from the winning method's input series is carried to the output."""
    as_region = Region.from_state("AS")
    tx_region = Region.from_state("TX")
    # AS has no viral-positive series, so only method2 can produce a result there.
    as_metrics = {
        CommonFields.POSITIVE_TESTS: TimeseriesLiteral([0, 2, 4, 6], provenance="pt_src1"),
        CommonFields.TOTAL_TESTS: [100, 200, 300, 400],
    }
    # TX has both positive-test series; the expected output carries the viral provenance.
    tx_metrics = {
        CommonFields.POSITIVE_TESTS: TimeseriesLiteral([1, 2, 3, 4], provenance="pt_src2"),
        CommonFields.POSITIVE_TESTS_VIRAL: TimeseriesLiteral(
            [10, 20, 30, 40], provenance="pos_viral"
        ),
        CommonFields.TOTAL_TESTS: [100, 200, 300, 400],
    }
    input_dataset = test_helpers.build_dataset({as_region: as_metrics, tx_region: tx_metrics})
    division_methods = [
        DivisionMethod(
            DatasetName("method1"), CommonFields.POSITIVE_TESTS_VIRAL, CommonFields.TOTAL_TESTS
        ),
        DivisionMethod(
            DatasetName("method2"), CommonFields.POSITIVE_TESTS, CommonFields.TOTAL_TESTS
        ),
    ]
    result = AllMethods.run(input_dataset, division_methods, diff_days=3)
    as_expected = {
        CommonFields.TEST_POSITIVITY: TimeseriesLiteral([0.02], provenance=["pt_src1"])
    }
    tx_expected = {CommonFields.TEST_POSITIVITY: TimeseriesLiteral([0.1], provenance="pos_viral")}
    expected_dataset = test_helpers.build_dataset(
        {as_region: as_expected, tx_region: tx_expected}, start_date="2020-04-04"
    )
    test_helpers.assert_dataset_like(result.test_positivity, expected_dataset)
def test_tail_filter_stalled_timeseries():
    """TailFilter truncates a stalled tail of a cumulative series and tags the removal.

    Also checks that a series one day shorter is too short for the filter and is
    returned unmodified.
    """
    # Make a timeseries that has 24 days increasing by 1000/day.
    values_increasing = list(range(100_000, 124_000, 1_000))
    # Add 4 days that copy the 24th day. The filter is meant to remove these.
    values_stalled = values_increasing + [values_increasing[-1]] * 4
    assert len(values_stalled) == 28
    ds_in = test_helpers.build_default_region_dataset({CommonFields.NEW_CASES: values_stalled})
    tail_filter, ds_out = TailFilter.run(ds_in, [CommonFields.NEW_CASES])
    _assert_tail_filter_counts(tail_filter, truncated=1)
    # The expected output keeps only the increasing values and carries an annotation
    # recording the date and value of the first removed observation.
    # NOTE(review): the original code also built an unused `tag_content` message string
    # ("Removed 4 observations ..."); it was never asserted against, so it was removed.
    truncated_timeseries = test_helpers.TimeseriesLiteral(
        values_increasing,
        annotation=[
            test_helpers.make_tag(
                TagType.CUMULATIVE_TAIL_TRUNCATED, date="2020-04-24", original_observation=123_000.0
            )
        ],
    )
    ds_expected = test_helpers.build_default_region_dataset(
        {CommonFields.NEW_CASES: truncated_timeseries}
    )
    test_helpers.assert_dataset_like(ds_out, ds_expected)

    # Try again with one day less, not enough for the filter so it returns the data unmodified.
    ds_in = test_helpers.build_default_region_dataset({CommonFields.NEW_CASES: values_stalled[:-1]})
    tail_filter, ds_out = TailFilter.run(ds_in, [CommonFields.NEW_CASES])
    _assert_tail_filter_counts(tail_filter, skipped_too_short=1)
    test_helpers.assert_dataset_like(ds_out, ds_in)
def test_tail_filter_zero_diff():
    """A perfectly constant series must not be truncated by the tail filter."""
    flat_values = [100_000] * 28
    ds_in = test_helpers.build_default_region_dataset({CommonFields.CASES: flat_values})
    tail_filter, ds_out = TailFilter.run(ds_in, [CommonFields.CASES])
    _assert_tail_filter_counts(tail_filter, all_good=1)
    test_helpers.assert_dataset_like(ds_out, ds_in, drop_na_dates=True)
def test_tail_filter_small_diff(stall_count: int):
    """A slowly increasing series that briefly stalls at the end is not truncated."""
    # 30 observations growing by 1/day, then `stall_count` repeats of the final value.
    series = list(range(1_000, 1_030)) + [1_029] * stall_count
    ds_in = test_helpers.build_default_region_dataset({CommonFields.CASES: series})
    tail_filter, ds_out = TailFilter.run(ds_in, [CommonFields.CASES])
    _assert_tail_filter_counts(tail_filter, all_good=1)
    test_helpers.assert_dataset_like(ds_out, ds_in, drop_na_dates=True)
def test_tail_filter_mean_nan():
    """A series whose mean-diff window contains NaN is skipped, not truncated."""
    # One real value, 14 days of NaN, then 14 increasing values. The leading 100_000 is
    # there so the NaN run forms a gap that isn't dropped by unrelated code.
    observations = [100_000] + [float("NaN")] * 14 + list(range(100_000, 114_000, 1_000))
    assert len(observations) == 29
    ds_in = test_helpers.build_default_region_dataset({CommonFields.NEW_CASES: observations})
    tail_filter, ds_out = TailFilter.run(ds_in, [CommonFields.NEW_CASES])
    _assert_tail_filter_counts(tail_filter, skipped_na_mean=1)
    test_helpers.assert_dataset_like(ds_out, ds_in, drop_na_dates=True)
def test_tail_filter_diff_goes_negative():
    """Truncation stops where the day-over-day diff goes negative.

    The series ends (in 1000s) with ... 127, 126, 127, 127; only the final 127 is
    expected to be truncated.
    """
    observations = list(range(100_000, 128_000, 1_000)) + [126_000, 127_000, 127_000]
    assert len(observations) == 31
    ds_in = test_helpers.build_default_region_dataset({CommonFields.CASES: observations})
    tail_filter, ds_out = TailFilter.run(ds_in, [CommonFields.CASES])
    ds_expected = test_helpers.build_default_region_dataset(
        {CommonFields.CASES: observations[:-1]}
    )
    _assert_tail_filter_counts(tail_filter, truncated=1)
    test_helpers.assert_dataset_like(ds_out, ds_expected, drop_na_dates=True, compare_tags=False)
def test_tail_filter_long_stall(stall_count: int, annotation_type: TagType):
    """A long stall is truncated, but by at most 14 observations."""
    observations = list(range(100_000, 128_000, 1_000)) + [127_000] * stall_count
    assert len(observations) == 28 + stall_count
    ds_in = test_helpers.build_default_region_dataset({CommonFields.CASES: observations})
    tail_filter, ds_out = TailFilter.run(ds_in, [CommonFields.CASES])
    # However long the stall, no more than 14 trailing observations are removed.
    ds_expected = test_helpers.build_default_region_dataset(
        {CommonFields.CASES: observations[: -min(stall_count, 14)]}
    )
    if annotation_type is TagType.CUMULATIVE_TAIL_TRUNCATED:
        _assert_tail_filter_counts(tail_filter, truncated=1)
    elif annotation_type is TagType.CUMULATIVE_LONG_TAIL_TRUNCATED:
        _assert_tail_filter_counts(tail_filter, long_truncated=1)
    test_helpers.assert_dataset_like(ds_out, ds_expected, drop_na_dates=True, compare_tags=False)
def test_recent_pos_neg_tests_has_positivity_ratio(pos_neg_tests_recent):
    """positive/negative test counts are used only while recent relative to 'today'.

    positive_tests and negative_tests appear on 8/10 and 8/11; they win while 8/11 is
    within 10 days of the (frozen) current date, after which test_positivity_7d is
    copied to the output instead.
    """
    dataset_in = test_helpers.build_default_region_dataset(
        {
            CommonFields.TEST_POSITIVITY_7D: TimeseriesLiteral(
                [0.02, 0.03, 0.04, 0.05, 0.06, 0.07], provenance="CDCTesting"
            ),
            CommonFields.POSITIVE_TESTS: TimeseriesLiteral(
                [1, 2, None, None, None, None], provenance="pos"
            ),
            CommonFields.NEGATIVE_TESTS: [10, 20, None, None, None, None],
        },
        start_date="2020-08-10",
    )
    if pos_neg_tests_recent:
        # 8/11 is still within 10 days: positive_tests and negative_tests are used.
        freeze_date = "2020-08-21"
        expected_metrics = {
            CommonFields.TEST_POSITIVITY: TimeseriesLiteral(
                [pd.NA, 0.0909, pd.NA, pd.NA, pd.NA, pd.NA], provenance="pos"
            )
        }
    else:
        # One day later the counts are stale, so test_positivity_7d is copied to output.
        freeze_date = "2020-08-22"
        expected_metrics = {
            CommonFields.TEST_POSITIVITY: TimeseriesLiteral(
                [0.02, 0.03, 0.04, 0.05, 0.06, 0.07], provenance="CDCTesting"
            )
        }
    expected = test_helpers.build_default_region_dataset(
        expected_metrics, start_date="2020-08-10"
    )
    with freeze_time(freeze_date):
        all_methods = AllMethods.run(dataset_in)
    # check_less_precise so only 3 digits need match for testPositivityRatio
    test_helpers.assert_dataset_like(all_methods.test_positivity, expected, check_less_precise=True)
def test_update_and_load(tmp_path: pathlib.Path, nyc_fips, nyc_region):
    """Persisted dataset round-trips: load -> persist -> load yields the same data."""
    # Restrict the persisted data to one county to keep the test fast.
    nyc_timeseries = combined_datasets.load_us_timeseries_dataset().get_regions_subset(
        [nyc_region]
    )
    nyc_one_region = nyc_timeseries.get_one_region(nyc_region)
    assert nyc_one_region.latest[CommonFields.POPULATION] > 1_000_000
    assert nyc_one_region.region.location_id
    combined_dataset_utils.persist_dataset(nyc_timeseries, tmp_path)
    reloaded = combined_datasets.load_us_timeseries_dataset(pointer_directory=tmp_path)
    reloaded_one_region = reloaded.get_one_region(nyc_region)
    assert nyc_one_region.latest == pytest.approx(reloaded_one_region.latest)
    test_helpers.assert_dataset_like(reloaded, nyc_timeseries, drop_na_timeseries=True)
def test_preserve_tags():
    """Tags on the winning method's input series survive into the output; tags on
    series that are not used (tag_drop) are discarded."""
    as_region = Region.from_state("AS")
    tx_region = Region.from_state("TX")
    tag1 = test_helpers.make_tag(type=TagType.CUMULATIVE_LONG_TAIL_TRUNCATED, date="2020-04-04")
    tag2 = test_helpers.make_tag(type=TagType.CUMULATIVE_TAIL_TRUNCATED, date="2020-04-04")
    tag_drop = test_helpers.make_tag(type=TagType.ZSCORE_OUTLIER, date="2020-04-01")
    tag3 = test_helpers.make_tag(type=TagType.ZSCORE_OUTLIER, date="2020-04-04")
    tag4 = test_helpers.make_tag(type=TagType.ZSCORE_OUTLIER, date="2020-04-03")
    as_metrics = {
        CommonFields.POSITIVE_TESTS: TimeseriesLiteral(
            [1, 2, 3, 4], annotation=[tag1], provenance="pos"
        ),
        CommonFields.TOTAL_TESTS: TimeseriesLiteral([100, 200, 300, 400], annotation=[tag2]),
    }
    tx_metrics = {
        CommonFields.POSITIVE_TESTS: TimeseriesLiteral([None, None, 3, 4], annotation=[tag_drop]),
        CommonFields.POSITIVE_TESTS_VIRAL: [10, 20, 30, 40],
        CommonFields.TOTAL_TESTS: TimeseriesLiteral([100, 200, 300, 400], annotation=[tag3, tag4]),
    }
    input_dataset = test_helpers.build_dataset({as_region: as_metrics, tx_region: tx_metrics})
    division_methods = [
        DivisionMethod(
            DatasetName("method1"), CommonFields.POSITIVE_TESTS, CommonFields.TOTAL_TESTS
        ),
        DivisionMethod(
            DatasetName("method2"), CommonFields.POSITIVE_TESTS_VIRAL, CommonFields.TOTAL_TESTS
        ),
    ]
    result = AllMethods.run(input_dataset, division_methods, diff_days=3)
    as_expected = {
        CommonFields.TEST_POSITIVITY: TimeseriesLiteral(
            [0.01], provenance="pos", annotation=[tag1, tag2]
        )
    }
    tx_expected = {
        CommonFields.TEST_POSITIVITY: TimeseriesLiteral([0.1], annotation=[tag3, tag4])
    }
    expected_dataset = test_helpers.build_dataset(
        {as_region: as_expected, tx_region: tx_expected}, start_date="2020-04-04"
    )
    test_helpers.assert_dataset_like(result.test_positivity, expected_dataset)
def test_basic():
    """drop_all_zero_timeseries removes all-zero (or zero/NaN) series of the given
    fields, keeps everything else, and logs the dropped identifiers."""
    tx = Region.from_state("TX")
    sf = Region.from_fips("06075")
    hi = Region.from_state("HI")
    # A timeseries with a tag, to check that tags are preserved through the filter.
    ts_with_tag = TimeseriesLiteral(
        [0, 0, 0], annotation=[test_helpers.make_tag(date="2020-04-01")]
    )
    ds_in = test_helpers.build_dataset(
        {
            tx: {CommonFields.VACCINES_DISTRIBUTED: [0, 0, 0]},
            sf: {CommonFields.VACCINES_DISTRIBUTED: [0, 0, 1]},
            hi: {
                CommonFields.VACCINES_DISTRIBUTED: [0, 0, None],
                CommonFields.CASES: ts_with_tag,
            },
        }
    )
    with structlog.testing.capture_logs() as logs:
        ds_out = zeros_filter.drop_all_zero_timeseries(ds_in, [CommonFields.VACCINES_DISTRIBUTED])
    # TX is dropped entirely; HI keeps only its (untouched) CASES series.
    ds_expected = test_helpers.build_dataset(
        {
            sf: {CommonFields.VACCINES_DISTRIBUTED: [0, 0, 1]},
            hi: {CommonFields.CASES: ts_with_tag},
        }
    )
    log = more_itertools.one(logs)
    assert log["event"] == zeros_filter.DROPPING_TIMESERIES_WITH_ONLY_ZEROS
    dropped_index = pd.MultiIndex.from_tuples(
        [
            (hi.location_id, CommonFields.VACCINES_DISTRIBUTED),
            (tx.location_id, CommonFields.VACCINES_DISTRIBUTED),
        ]
    )
    assert dropped_index.equals(log["dropped"])
    test_helpers.assert_dataset_like(ds_expected, ds_out)
def test_default_positivity_methods():
    """Running AllMethods.run without an explicit method list exercises the methods
    used in production."""
    as_region = Region.from_state("AS")
    tx_region = Region.from_state("TX")
    # AS: positivity derivable from positive and negative test counts.
    as_metrics = {
        CommonFields.POSITIVE_TESTS: TimeseriesLiteral(
            [0, 1, 2, 3, 4, 5, 6, 7], provenance="src1"
        ),
        CommonFields.NEGATIVE_TESTS: TimeseriesLiteral(
            [10, 19, 28, 37, 46, 55, 64, 73], provenance="src1"
        ),
    }
    # TX: positivity derivable from viral positives over total viral tests.
    tx_metrics = {
        CommonFields.POSITIVE_TESTS_VIRAL: TimeseriesLiteral(
            [2, 4, 6, 8, 10, 12, 14, 16], provenance="pos_tests"
        ),
        CommonFields.TOTAL_TESTS_VIRAL: [10, 20, 30, 40, 50, 60, 70, 80],
    }
    input_dataset = test_helpers.build_dataset({as_region: as_metrics, tx_region: tx_metrics})
    # TODO(tom): Once test positivity code seems stable remove call to datetime.today() in
    # has_recent_data and remove this freeze_time.
    with freeze_time("2020-04-14"):
        result = AllMethods.run(input_dataset, diff_days=1)
    as_expected = {
        CommonFields.TEST_POSITIVITY: TimeseriesLiteral(
            [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1], provenance="src1",
        )
    }
    tx_expected = {
        CommonFields.TEST_POSITIVITY: TimeseriesLiteral(
            [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2], provenance="pos_tests"
        )
    }
    expected_dataset = test_helpers.build_dataset(
        {as_region: as_expected, tx_region: tx_expected}, start_date="2020-04-02",
    )
    test_helpers.assert_dataset_like(result.test_positivity, expected_dataset)
def test_tail_filter_two_series():
    """Both requested fields are filtered.

    Currently the 'good' dates of 14-28 days ago are relative to the most recent date
    of any timeseries, but maybe it should be per-timeseries.
    """
    positives = list(range(100_000, 128_000, 1_000))
    totals = list(range(1_000_000, 1_280_000, 10_000))
    # Stall both tails; pad positive tests with two Nones so the series have equal length.
    positives_stalled = positives + [positives[-1]] * 3 + [None] * 2
    totals_stalled = totals + [totals[-1]] * 5
    ds_in = test_helpers.build_default_region_dataset(
        {
            CommonFields.POSITIVE_TESTS: positives_stalled,
            CommonFields.TOTAL_TESTS: totals_stalled,
        }
    )
    tail_filter, ds_out = TailFilter.run(
        ds_in, [CommonFields.POSITIVE_TESTS, CommonFields.TOTAL_TESTS]
    )
    ds_expected = test_helpers.build_default_region_dataset(
        {CommonFields.POSITIVE_TESTS: positives, CommonFields.TOTAL_TESTS: totals}
    )
    _assert_tail_filter_counts(tail_filter, truncated=2)
    test_helpers.assert_dataset_like(ds_out, ds_expected, drop_na_dates=True, compare_tags=False)
def test_recent_days():
    """Checks how `recent_days` changes which method wins per region.

    With recent_days=2 the viral series for AS (which ends in two Nones) is not recent
    enough, so AS falls back to "pos"; with recent_days=3 both regions report
    "pos_viral" provenance.
    """
    region_as = Region.from_state("AS")
    region_tx = Region.from_state("TX")
    # AS: the viral positives series stops updating after the second day.
    metrics_as = {
        CommonFields.POSITIVE_TESTS: TimeseriesLiteral([0, 2, 4, 6], provenance="pos"),
        CommonFields.POSITIVE_TESTS_VIRAL: TimeseriesLiteral(
            [0, 20, None, None], provenance="pos_viral"
        ),
        CommonFields.TOTAL_TESTS: TimeseriesLiteral([100, 200, 300, 400]),
    }
    # TX: both positive series are fully populated.
    metrics_tx = {
        CommonFields.POSITIVE_TESTS: TimeseriesLiteral([1, 2, 3, 4], provenance="pos"),
        CommonFields.POSITIVE_TESTS_VIRAL: TimeseriesLiteral(
            [10, 20, 30, 40], provenance="pos_viral"
        ),
        CommonFields.TOTAL_TESTS: TimeseriesLiteral([100, 200, 300, 400]),
    }
    ds = test_helpers.build_dataset({region_as: metrics_as, region_tx: metrics_tx})
    methods = [
        DivisionMethod(
            DatasetName("method1"), CommonFields.POSITIVE_TESTS_VIRAL, CommonFields.TOTAL_TESTS
        ),
        DivisionMethod(
            DatasetName("method2"), CommonFields.POSITIVE_TESTS, CommonFields.TOTAL_TESTS
        ),
    ]
    methods = _replace_methods_attribute(methods, recent_days=2)
    all_methods = AllMethods.run(ds, methods, diff_days=1)
    expected_positivity = test_helpers.build_dataset(
        {
            region_as: {
                CommonFields.TEST_POSITIVITY: TimeseriesLiteral(
                    [0.02, 0.02, 0.02], provenance="pos"
                )
            },
            region_tx: {
                CommonFields.TEST_POSITIVITY: TimeseriesLiteral(
                    [0.1, 0.1, 0.1], provenance="pos_viral"
                )
            },
        },
        start_date="2020-04-02",
    )
    test_helpers.assert_dataset_like(all_methods.test_positivity, expected_positivity)
    assert all_methods.test_positivity.get_one_region(region_as).provenance == {
        CommonFields.TEST_POSITIVITY: ["pos"]
    }
    assert all_methods.test_positivity.get_one_region(region_tx).provenance == {
        CommonFields.TEST_POSITIVITY: ["pos_viral"]
    }
    # Re-run with recent_days=3: AS's viral series now counts as recent, so both
    # regions are expected to carry the "pos_viral" provenance.
    methods = _replace_methods_attribute(methods, recent_days=3)
    all_methods = AllMethods.run(ds, methods, diff_days=1)
    positivity_provenance = all_methods.test_positivity.provenance
    assert positivity_provenance.loc["iso1:us#iso2:us-as"].to_dict() == {
        CommonFields.TEST_POSITIVITY: "pos_viral"
    }
    assert positivity_provenance.loc["iso1:us#iso2:us-tx"].to_dict() == {
        CommonFields.TEST_POSITIVITY: "pos_viral"
    }