def test_validate_unique(self): unique = pd.DataFrame( [["abc", "def"], ["ghi", "jkl"], ["mnop", "qrs"]], index=["X", "Y", "Z"], columns=["col1", "col2"]) duped = pd.DataFrame([["abc", "def"], ["ghi", "qrs"], ["abc", "qrs"]], index=["X", "Y", "X"], columns=["col1", "col2"]) schema_def = {"unique": True} errors = validate._validate_column(unique.index, "index", "unique_df", schema_def) self.assertFalse(errors) errors = validate._validate_column(duped.index, "index", "duped_df", schema_def) self.assertEqual(len(errors), 1) self.assertIn("is not unique", errors[0]) errors = validate._validate_column(unique["col1"], "col1", "unique_df", schema_def) self.assertFalse(errors) errors = validate._validate_column(duped["col1"], "col1", "duped_df", schema_def) self.assertEqual(len(errors), 1) self.assertIn("is not unique", errors[0]) schema_def = {"unique": False} errors = validate._validate_column(duped["col1"], "col1", "duped_df", schema_def) self.assertFalse(errors)
def test_human_readable(self): hr_df = pd.DataFrame( [["for you, a human", "UBERON:12345", "UBERON:1234 (thundercat)"], ["hope you're well", "bit of lungs", "brain"]], index=["ENSG00001", "ENSG00002"], columns=["good", "curie", "suffixed_curie"]) schema_def = {"type": "human-readable string"} errors = validate._validate_column(hr_df["good"], "good", "hr", schema_def) self.assertFalse(errors) errors = validate._validate_column(hr_df["curie"], "curie", "hr", schema_def) self.assertEqual(len(errors), 1) self.assertIn("non-human-readable", errors[0]) errors = validate._validate_column(hr_df["suffixed_curie"], "suffixed_curie", "hr", schema_def) self.assertEqual(len(errors), 1) self.assertIn("non-human-readable", errors[0]) errors = validate._validate_column(hr_df.index, "ensg", "hr", schema_def) self.assertEqual(len(errors), 1) self.assertIn("non-human-readable", errors[0])
def test_validate_nullable(self): non_null = pd.DataFrame( [["abc", "def"], ["ghi", "jkl"], ["mnop", "qrs"]], index=["X", "Y", "Z"], columns=["col1", "col2"]) has_null = pd.DataFrame( [["abc", "", None], ["ghi", "jkl", 1], ["mnop", "qrs", 2]], index=["X", "Y", "Z"], columns=["col1", "col2", "col3"]) schema_def = {"nullable": False} errors = validate._validate_column(non_null["col1"], "col1", "nonnull_df", schema_def) self.assertFalse(errors) errors = validate._validate_column(has_null["col1"], "col1", "hasnull_df", schema_def) self.assertFalse(errors) errors = validate._validate_column(has_null["col2"], "col2", "hasnull_df", schema_def) self.assertEqual(len(errors), 1) self.assertIn("contains empty values", errors[0]) errors = validate._validate_column(has_null["col3"], "col3", "hasnull_df", schema_def) self.assertEqual(len(errors), 1) self.assertIn("contains empty values", errors[0]) schema_def = {"nullable": True} errors = validate._validate_column(has_null["col2"], "col2", "hasnull_df", schema_def) self.assertFalse(errors)
def test_curie(self): curie_df = pd.DataFrame( [[ "EFO:00001", "HsapDv:00001 (cell culture)", "EFO:", "MONDO:0001 cell culture" ], [ "UBERON:00002", "HsapDv:00002 (organoid)", "EFO:12345", "MONDO:0002 (baba yaga)" ], [ "EFO:0000000005", "HsapDv:000004 (humanzee)", "EFO:000002", "MONDO:0004 (TMNT)" ]], index=["X", "Y", "Z"], columns=["good", "good_suffix", "bad", "bad_suffix"]) # Good schema_def = {"type": "curie", "prefixes": ["EFO", "UBERON"]} errors = validate._validate_column(curie_df["good"], "good", "curie_df", schema_def) self.assertFalse(errors) # Good suffix schema_def = { "type": "suffixed curie", "prefixes": ["HsapDv", "WHATEVER"] } errors = validate._validate_column(curie_df["good_suffix"], "good_suffix", "curie_df", schema_def) self.assertFalse(errors) # Bad prefix schema_def = {"type": "curie", "prefixes": ["EFO"]} errors = validate._validate_column(curie_df["good"], "good", "curie_df", schema_def) self.assertEqual(len(errors), 1) self.assertIn("invalid ontology", errors[0]) self.assertIn("must be curies from one of these", errors[0]) # Bad curies schema_def = {"type": "curie", "prefixes": ["EFO"]} errors = validate._validate_column(curie_df["bad"], "bad", "curie_df", schema_def) self.assertEqual(len(errors), 1) self.assertIn("invalid ontology", errors[0]) # Bad suffixes schema_def = {"type": "suffixed curie", "prefixes": ["EFO"]} errors = validate._validate_column(curie_df["bad_suffix"], "bad_suffix", "curie_df", schema_def) self.assertEqual(len(errors), 1) self.assertIn("invalid ontology", errors[0])
def test_enum(self): enum_df = pd.DataFrame([["abc", "ghi"], ["def", "jkl"]], index=["X", "Y"], columns=["col1", "col2"]) # All match schema_def = {"type": "string", "enum": ["abc", "def", "xyz"]} errors = validate._validate_column(enum_df["col1"], "col1", "enum_df", schema_def) self.assertFalse(errors) # Missing value schema_def = {"type": "string", "enum": ["abc", "xyz"]} errors = validate._validate_column(enum_df["col1"], "col1", "enum_df", schema_def) self.assertEqual(len(errors), 1) self.assertIn("unpermitted values", errors[0])