def test_str(self):
    """Test that the string representation contains all information."""
    report = ValidationReport([("good", "2020-10-05")])
    report.increment_total_checks()
    report.increment_total_checks()
    report.increment_total_checks()
    report.add_raised_warning(ImportWarning("wrong import"))
    report.add_raised_warning(ImportWarning("right import"))
    report.add_raised_error(self.ERROR_1)
    report.add_raised_error(self.ERROR_2)

    assert str(report) == "3 checks run\n1 checks failed\n1 checks suppressed\n2 warnings\n"\
        "(('bad', datetime.date(2020, 11, 18)), 'exp 2', 'msg 2')\nwrong import\nright import\n"
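# The expected string above pins down the shape of the class fixtures used in
# these report tests. A minimal sketch, assuming ValidationError takes a
# (check name, date) identifier, an expression, and a message; these
# definitions are inferred from the assertions, not taken from the actual
# fixture code:
#
#     ERROR_1 = ValidationError(("good", date(2020, 10, 5)), "exp 1", "msg 1")
#     ERROR_2 = ValidationError(("bad", date(2020, 11, 18)), "exp 2", "msg 2")
#
# ERROR_1 matches the suppression entry ("good", "2020-10-05") passed to the
# report, which is why only ERROR_2 appears in the rendered output.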
def test_more_than_two_copies(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame([["a", "1"], ["b", "2"], ["b", "2"], ["b", "2"]])
    validator.check_duplicate_rows(df, "file", report)

    assert len(report.raised_warnings) == 1
    assert report.raised_warnings[0].expression == [2, 3]
def test_duplicate_dates(self):
    params = {"data_source": "", "span_length": 1,
              "end_date": "2020-09-02", "expected_lag": {}}
    validator = StaticValidator(params)
    report = ValidationReport([])
    # 2020-09-02 is the only expected date with no file; 2020-09-03 appears
    # twice but falls outside the span.
    filenames = [("20200901_county_signal_signal.csv", "match_obj"),
                 ("20200903_county_signal_signal.csv", "match_obj"),
                 ("20200903_usa_signal_signal.csv", "match_obj"),
                 ("20200903_usa_signal_signal.csv", "match_obj")]
    validator.check_missing_date_files(filenames, report)

    assert len(report.raised_errors) == 1
    assert "check_missing_date_files" in [
        err.check_data_id[0] for err in report.raised_errors]
    assert len([err.expression[0] for err in report.raised_errors
                if err.check_data_id[0] == "check_missing_date_files"]) == 1
    assert [err.expression[0] for err in report.raised_errors
            if err.check_data_id[0] == "check_missing_date_files"
            ][0] == datetime.strptime("20200902", "%Y%m%d").date()
def test_1000x_val(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    test_data = {"val": [1, 1, 1, 2000, 0, 1], "se": [np.nan] * 6,
                 "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6}
    ref_data = {"val": [1, 1, 1, 2, 0, 1], "se": [np.nan] * 6,
                "sample_size": [np.nan] * 6, "geo_id": ["1"] * 6}
    test_df = pd.DataFrame(test_data)
    ref_df = pd.DataFrame(ref_data)
    validator.check_avg_val_vs_reference(
        test_df, ref_df,
        datetime.combine(date.today(), datetime.min.time()),
        "geo", "signal", report)

    assert len(report.raised_errors) == 1
    assert "check_test_vs_reference_avg_changed" in [
        err.check_data_id[0] for err in report.raised_errors]
def test_empty_df(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    empty_df = pd.DataFrame(columns=["geo_id"], dtype=str)
    validator.check_bad_geo_id_format(empty_df, "name", "county", report)

    assert len(report.raised_errors) == 0
def test_gt_max_pct(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame([1e7], columns=["val"])
    validator.check_bad_val(df, "name", "pct", report)

    assert len(report.raised_errors) == 1
    assert "check_val_pct_gt_100" in report.raised_errors[0].check_data_id
def test_non_consecutive_duplicates(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame([["a", "1"], ["b", "2"], ["a", "1"]])
    validator.check_duplicate_rows(df, "file", report)

    assert len(report.raised_warnings) == 1
    assert report.raised_warnings[0].expression == [2]
    assert report.raised_warnings[0].check_data_id[1] == "file"
def test_lt_0(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame([-5], columns=["val"])
    validator.check_bad_val(df, "name", "signal", report)

    assert len(report.raised_errors) == 1
    assert "check_val_lt_0" in report.raised_errors[0].check_data_id
def test_empty_df(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    empty_df = pd.DataFrame(columns=["val"])
    validator.check_bad_val(empty_df, "", "", report)
    validator.check_bad_val(empty_df, "", "prop", report)
    validator.check_bad_val(empty_df, "", "pct", report)

    assert len(report.raised_errors) == 0
def test_same_df(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    test_df = pd.DataFrame([date.today()] * 5, columns=["time_value"])
    ref_df = pd.DataFrame([date.today()] * 5, columns=["time_value"])
    validator.check_rapid_change_num_rows(
        test_df, ref_df, date.today(), "geo", "signal", report)

    assert len(report.raised_errors) == 0
def test_empty_filelist(self):
    params = {"data_source": "", "span_length": 8,
              "end_date": "2020-09-09", "expected_lag": {}}
    validator = StaticValidator(params)
    report = ValidationReport([])
    filenames = list()
    validator.check_missing_date_files(filenames, report)

    assert len(report.raised_errors) == 1
    assert "check_missing_date_files" in [
        err.check_data_id[0] for err in report.raised_errors]
    # All 9 days of the span (2020-09-01 through 2020-09-09) are missing.
    assert len(report.raised_errors[0].expression) == 9
def test_zero_outlier(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    ref_val = [30, 30.28571429, 30.57142857, 30.85714286, 31.14285714,
               31.42857143, 31.71428571, 32, 32, 32.14285714, 32.28571429,
               32.42857143, 32.57142857, 32.71428571, 32.85714286,
               33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33,
               33.28571429, 33.57142857, 33.85714286, 34.14285714]
    test_val = [0, 0, 0]

    ref_data = {"val": ref_val, "se": [np.nan] * len(ref_val),
                "sample_size": [np.nan] * len(ref_val),
                "geo_id": ["1"] * len(ref_val),
                "time_value": pd.date_range(start="2020-09-24", end="2020-10-23")}
    test_data = {"val": test_val, "se": [np.nan] * len(test_val),
                 "sample_size": [np.nan] * len(test_val),
                 "geo_id": ["1"] * len(test_val),
                 "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}
    ref_data2 = {"val": ref_val, "se": [np.nan] * len(ref_val),
                 "sample_size": [np.nan] * len(ref_val),
                 "geo_id": ["2"] * len(ref_val),
                 "time_value": pd.date_range(start="2020-09-24", end="2020-10-23")}
    test_data2 = {"val": test_val, "se": [np.nan] * len(test_val),
                  "sample_size": [np.nan] * len(test_val),
                  "geo_id": ["2"] * len(test_val),
                  "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}

    ref_df = pd.concat([pd.DataFrame(ref_data), pd.DataFrame(ref_data2)]). \
        reset_index(drop=True)
    test_df = pd.concat([pd.DataFrame(test_data), pd.DataFrame(test_data2)]). \
        reset_index(drop=True)
    validator.check_positive_negative_spikes(
        test_df, ref_df, "state", "signal", report)

    assert len(report.raised_errors) == 1
    assert "check_positive_negative_spikes" in [
        err.check_data_id[0] for err in report.raised_errors]
def test_source_api_overlap(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    # Data from 51580 between 9/24 and 10/26 (10/25 query date)
    ref_val = [30, 30.28571429, 30.57142857, 30.85714286, 31.14285714,
               31.42857143, 31.71428571, 32, 32, 32.14285714, 32.28571429,
               32.42857143, 32.57142857, 32.71428571, 32.85714286,
               33, 33, 33, 33, 33, 33, 33, 33, 33,
               33, 33, 33, 33, 33, 33, 33, 33, 33]
    test_val = [100, 100, 100]

    ref_data = {"val": ref_val, "se": [np.nan] * len(ref_val),
                "sample_size": [np.nan] * len(ref_val),
                "geo_id": ["1"] * len(ref_val),
                "time_value": pd.date_range(start="2020-09-24", end="2020-10-26")}
    test_data = {"val": test_val, "se": [np.nan] * len(test_val),
                 "sample_size": [np.nan] * len(test_val),
                 "geo_id": ["1"] * len(test_val),
                 "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}
    ref_data2 = {"val": ref_val, "se": [np.nan] * len(ref_val),
                 "sample_size": [np.nan] * len(ref_val),
                 "geo_id": ["2"] * len(ref_val),
                 "time_value": pd.date_range(start="2020-09-24", end="2020-10-26")}
    test_data2 = {"val": test_val, "se": [np.nan] * len(test_val),
                  "sample_size": [np.nan] * len(test_val),
                  "geo_id": ["2"] * len(test_val),
                  "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}

    ref_df = pd.concat([pd.DataFrame(ref_data), pd.DataFrame(ref_data2)]). \
        reset_index(drop=True)
    test_df = pd.concat([pd.DataFrame(test_data), pd.DataFrame(test_data2)]). \
        reset_index(drop=True)
    validator.check_positive_negative_spikes(
        test_df, ref_df, "state", "signal", report)

    assert len(report.raised_errors) == 1
    assert "check_positive_negative_spikes" in [
        err.check_data_id[0] for err in report.raised_errors]
def test_uppercase_geo_id(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame(["ak", "AK"], columns=["geo_id"])
    validator.check_bad_geo_id_value(df, "name", "state", report)

    assert len(report.raised_errors) == 0
    assert len(report.raised_warnings) == 1
    assert "check_geo_id_lowercase" in report.raised_warnings[0].check_data_id
    assert "AK" in report.raised_warnings[0].expression
def test_invalid_geo_id_msa(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame(["0", "54321", "123", ".0000", "abc12"],
                      columns=["geo_id"])
    validator.check_bad_geo_id_format(df, "name", "msa", report)

    assert len(report.raised_errors) == 1
    assert "check_geo_id_format" in report.raised_errors[0].check_data_id
    assert len(report.raised_errors[0].expression) == 2
    assert "54321" not in report.raised_errors[0].expression
def test_invalid_geo_id_national(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame(["us", "zz"], columns=["geo_id"])
    validator.check_bad_geo_id_value(df, "name", "national", report)

    assert len(report.raised_errors) == 1
    assert "check_bad_geo_id_value" in report.raised_errors[0].check_data_id
    assert len(report.raised_errors[0].expression) == 1
    assert "us" not in report.raised_errors[0].expression
    assert "zz" in report.raised_errors[0].expression
def test_neg_outlier(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    ref_val = [100, 101, 100, 101, 100, 100, 100, 100, 100, 100,
               100, 102, 100, 100, 100, 100, 100, 101, 100, 100,
               100, 100, 100, 99, 100, 100, 98, 100, 100, 100]
    test_val = [10, 10, 10]

    ref_data = {"val": ref_val, "se": [np.nan] * len(ref_val),
                "sample_size": [np.nan] * len(ref_val),
                "geo_id": ["1"] * len(ref_val),
                "time_value": pd.date_range(start="2020-09-24", end="2020-10-23")}
    test_data = {"val": test_val, "se": [np.nan] * len(test_val),
                 "sample_size": [np.nan] * len(test_val),
                 "geo_id": ["1"] * len(test_val),
                 "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}
    ref_data2 = {"val": ref_val, "se": [np.nan] * len(ref_val),
                 "sample_size": [np.nan] * len(ref_val),
                 "geo_id": ["2"] * len(ref_val),
                 "time_value": pd.date_range(start="2020-09-24", end="2020-10-23")}
    test_data2 = {"val": test_val, "se": [np.nan] * len(test_val),
                  "sample_size": [np.nan] * len(test_val),
                  "geo_id": ["2"] * len(test_val),
                  "time_value": pd.date_range(start="2020-10-24", end="2020-10-26")}

    ref_df = pd.concat([pd.DataFrame(ref_data), pd.DataFrame(ref_data2)]). \
        reset_index(drop=True)
    test_df = pd.concat([pd.DataFrame(test_data), pd.DataFrame(test_data2)]). \
        reset_index(drop=True)
    validator.check_positive_negative_spikes(
        test_df, ref_df, "state", "signal", report)

    assert len(report.raised_errors) == 1
    assert "check_positive_negative_spikes" in [
        err.check_data_id[0] for err in report.raised_errors]
def test_lt_min_missing_not_allowed(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    validator.params.missing_sample_size_allowed = False
    df = pd.DataFrame([[1, 0, 10], [1, np.nan, 240], [1, np.nan, 245]],
                      columns=["val", "se", "sample_size"])
    validator.check_bad_sample_size(df, "name", report)

    assert len(report.raised_errors) == 1
    assert "check_n_gt_min" in [
        err.check_data_id[0] for err in report.raised_errors]
def test_invalid_geo_id_msa(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame(["10180", "88888", "99999"], columns=["geo_id"])
    validator.check_bad_geo_id_value(df, "name", "msa", report)

    assert len(report.raised_errors) == 1
    assert "check_bad_geo_id_value" in report.raised_errors[0].check_data_id
    assert len(report.raised_errors[0].expression) == 2
    assert "10180" not in report.raised_errors[0].expression
    assert "88888" in report.raised_errors[0].expression
    assert "99999" in report.raised_errors[0].expression
def test_invalid_geo_id_hrr(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame(["1", "12", "123", "1234", "12345", "a", ".", "ab1"],
                      columns=["geo_id"])
    validator.check_bad_geo_id_format(df, "name", "hrr", report)

    assert len(report.raised_errors) == 1
    assert "check_geo_id_format" in report.raised_errors[0].check_data_id
    assert len(report.raised_errors[0].expression) == 5
    assert "1" not in report.raised_errors[0].expression
    assert "12" not in report.raised_errors[0].expression
    assert "123" not in report.raised_errors[0].expression
def test_invalid_geo_id_state(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame(["aa", "hi", "HI", "hawaii", "Hawaii", "a", "H.I."],
                      columns=["geo_id"])
    validator.check_bad_geo_id_format(df, "name", "state", report)

    assert len(report.raised_errors) == 1
    assert "check_geo_id_format" in report.raised_errors[0].check_data_id
    assert len(report.raised_errors[0].expression) == 4
    assert "aa" not in report.raised_errors[0].expression
    assert "hi" not in report.raised_errors[0].expression
    assert "HI" not in report.raised_errors[0].expression
def test_empty_df(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    empty_df = pd.DataFrame(columns=["val", "se", "sample_size"], dtype=float)
    validator.check_bad_sample_size(empty_df, "", report)

    assert len(report.raised_errors) == 0

    validator.params.missing_sample_size_allowed = True
    validator.check_bad_sample_size(empty_df, "", report)

    assert len(report.raised_errors) == 0
def test_invalid_geo_type(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    empty_df = pd.DataFrame(columns=["geo_id"], dtype=str)
    validator.check_bad_geo_id_format(empty_df, "name", "hello", report)

    assert len(report.raised_errors) == 1
    assert "check_geo_type" in [
        err.check_data_id[0] for err in report.raised_errors]
    assert [err.expression for err in report.raised_errors
            if err.check_data_id[0] == "check_geo_type"][0] == "hello"
def test_0_vs_many(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    time_value = datetime.combine(date.today(), datetime.min.time())
    test_df = pd.DataFrame([time_value] * 5, columns=["time_value"])
    ref_df = pd.DataFrame([time_value] * 1, columns=["time_value"])
    validator.check_rapid_change_num_rows(
        test_df, ref_df, time_value, "geo", "signal", report)

    assert len(report.raised_errors) == 1
    assert "check_rapid_change_num_rows" in [
        err.check_data_id[0] for err in report.raised_errors]
def test_jeffreys(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    validator.params.missing_se_allowed = False
    df = pd.DataFrame([[0, 0, 200], [1, 0, np.nan], [1, np.nan, np.nan]],
                      columns=["val", "se", "sample_size"])
    validator.check_bad_se(df, "name", report)

    assert len(report.raised_errors) == 2
    assert "check_se_not_missing_and_in_range" in [
        err.check_data_id[0] for err in report.raised_errors]
    assert "check_se_0_when_val_0" in [
        err.check_data_id[0] for err in report.raised_errors]
def test_same_day(self):
    params = {"data_source": "", "span_length": 0,
              "end_date": "2020-09-01", "expected_lag": {}}
    validator = StaticValidator(params)
    report = ValidationReport([])
    filenames = [("20200901_county_signal_signal.csv", "match_obj")]
    validator.check_missing_date_files(filenames, report)

    assert len(report.raised_errors) == 0
    assert "check_missing_date_files" not in [
        err.check_data_id[0] for err in report.raised_errors]
def test_add_raised_suppressed_error(self):
    """Test that a suppressed error does not show up in the unsuppressed error list."""
    report = ValidationReport([("good", "2020-10-05")])
    report.add_raised_error(self.ERROR_1)

    assert len(report.unsuppressed_errors) == 0
    assert report.num_suppressed == 1
    assert len(report.errors_to_suppress) == 0

    # Each error can only be suppressed once.
    report.add_raised_error(self.ERROR_1)
    assert report.unsuppressed_errors == [self.ERROR_1]
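# A minimal sketch of the suppression behavior these assertions exercise,
# inferred from the tests rather than taken from ValidationReport itself:
# each (check name, date) pair handed to the constructor suppresses at most
# one matching error, and later matches fall through to unsuppressed_errors.
#
#     def add_raised_error(self, error):
#         self.raised_errors.append(error)
#         suppress_key = (error.check_data_id[0], str(error.check_data_id[1]))
#         if suppress_key in self.errors_to_suppress:
#             self.errors_to_suppress.remove(suppress_key)  # consumed: one use each
#             self.num_suppressed += 1
#         else:
#             self.unsuppressed_errors.append(error)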
def test_same_n(self):
    validator = DynamicValidator(self.params)
    report = ValidationReport([])
    data = {"val": [np.nan] * 6, "se": [np.nan] * 6,
            "sample_size": [1, 1, 1, 2, 0, 1], "geo_id": ["1"] * 6}
    test_df = pd.DataFrame(data)
    ref_df = pd.DataFrame(data)
    validator.check_avg_val_vs_reference(
        test_df, ref_df, date.today(), "geo", "signal", report)

    assert len(report.raised_errors) == 0
def test_add_raised_unsuppressed_error(self):
    """Test that an unsuppressed error shows up in the unsuppressed error list."""
    report = ValidationReport([("bad", "2020-10-05")])
    report.add_raised_error(self.ERROR_1)
    report.add_raised_error(self.ERROR_2)

    assert report.unsuppressed_errors == [self.ERROR_1, self.ERROR_2]
def test_single_column_duplicates_but_not_row(self):
    validator = StaticValidator(self.params)
    report = ValidationReport([])
    df = pd.DataFrame([["a", "1"], ["a", "2"], ["b", "2"]])
    validator.check_duplicate_rows(df, "file", report)

    assert len(report.raised_warnings) == 0