def test_equality(self):
    """Check `==` / `!=` between pattern creators and against other types."""
    empty = PatternCreator()
    populated = PatternCreator()
    populated.add("ETYPE", "hello")

    # An instance must compare equal to itself.
    assert empty == empty
    # Comparing against a non-PatternCreator object must be unequal.
    assert empty != "wrong type"
    # different columns
    assert empty != populated
def test_errors(self):
    """Check the exceptions raised by `PatternCreator.add` on bad input."""
    creator = PatternCreator()

    # invalid type: the pattern is neither a str, a dict nor a list
    with pytest.raises(TypeError):
        creator.add("ETYPE", 234324)

    # Store a valid pattern so that the next call is a duplicate.
    creator.add("ETYPE", "hello")

    # duplicate
    with pytest.raises(ValueError):
        creator.add("ETYPE", "hello", check_exists=True)

    # wrong contents
    with pytest.raises(ValueError):
        creator.add("etype", [{"a": 1, "b": 2}])
def test_row2raw(self):
    """Check `PatternCreator.row2raw` on two invalid rows and one valid row."""
    # unsupported value_type - eval fails
    row_bad_value_type = pd.Series(
        {
            "label": "et1",
            "attribute_0": "TEXT",
            "value_0": "aaa",
            "value_type_0": "wrong_type",
            "op_0": "",
        }
    )
    with pytest.raises(NameError):
        PatternCreator.row2raw(row_bad_value_type)

    # already the first token is invalid
    row_bad_first_token = pd.Series(
        {
            "label": "et1",
            "attribute_0": np.nan,
            "value_0": "aaa",
            "value_type_0": "wrong_type",
            "op_0": "",
        }
    )
    with pytest.raises(ValueError):
        PatternCreator.row2raw(row_bad_first_token)

    # Valid first token followed by a token whose attribute is NaN: the
    # expected raw pattern below contains the first token only.
    row_valid = pd.Series(
        {
            "label": "et1",
            "attribute_0": "TEXT",
            "value_0": "aaa",
            "value_type_0": "str",
            "op_0": "",
            "attribute_1": np.nan,
            "value_1": "bbb",
            "value_type_1": "int",
            "op_1": "!",
        }
    )
    expected = {"label": "et1", "pattern": [{"TEXT": "aaa"}]}

    res = PatternCreator.row2raw(row_valid)

    assert res == expected
def test_raw2row(self):
    """Check that `PatternCreator.raw2row` rejects malformed raw patterns."""
    malformed_raws = (
        # pattern not a list
        {"label": "ET1", "pattern": {"LOWER": "TEXT"}},
        # label not a str
        {"label": 232, "pattern": [{"LOWER": "TEXT"}]},
        # element not dictionary
        {"label": "etype", "pattern": [11]},
    )

    for malformed in malformed_raws:
        with pytest.raises(TypeError):
            PatternCreator.raw2row(malformed)
def test_to_df(self):
    """Check that editing a frame from `to_df` does not affect later calls."""
    creator = PatternCreator()
    for word in ("hello", "there"):
        creator.add("ET1", word)

    untouched = creator.to_df()
    modified = creator.to_df()
    # Edit only one of the returned frames.
    modified.loc[0, "label"] = "REPLACED_LABEL"
    fresh = creator.to_df()

    # The edit is visible in the edited frame only.
    assert not untouched.equals(modified)
    # A frame requested after the edit still matches the untouched one.
    assert untouched.equals(fresh)
def test_call(self):
    """Check that calling the creator on text reflects the stored patterns."""
    sentence = "I saw a tall building."
    creator = PatternCreator()
    creator.add("new_entity_type", "tall")

    # With the pattern stored, exactly one entity with its label is found.
    doc_with_pattern = creator(sentence)
    assert len(doc_with_pattern.ents) == 1
    (only_entity,) = doc_with_pattern.ents
    assert only_entity.label_ == "new_entity_type"

    # After dropping the only pattern, nothing is found any more.
    creator.drop(0)
    doc_without_pattern = creator(sentence)
    assert len(doc_without_pattern.ents) == 0
def test_overall(self, tmpdir):
    """Walk through adding patterns, inspecting columns and a save/load cycle."""
    jsonl_path = pathlib.Path(str(tmpdir)) / "patterns.json"

    # Columns expected once the longest stored pattern has one / two tokens.
    one_token_columns = {
        "label",
        "attribute_0",
        "value_0",
        "value_type_0",
        "op_0",
    }
    two_token_columns = one_token_columns | {
        "attribute_1",
        "value_1",
        "value_type_1",
        "op_1",
    }

    creator = PatternCreator()
    assert len(creator.to_df()) == 0

    # Pattern given as a plain string.
    creator.add("NEW_ENTITY_TYPE", "cake")
    assert len(creator.to_df()) == 1
    assert set(creator.to_df().columns) == one_token_columns

    # Pattern given as a single dictionary.
    creator.add("COOL_ENTITY_TYPE", {"LEMMA": "pancake", "OP": "*"})
    assert len(creator.to_df()) == 2

    # Pattern given as a list of two dictionaries.
    creator.add("SOME_ENTITY_TYPE", [{"TEXT": "good"}, {"TEXT": "pizza"}])
    assert len(creator.to_df()) == 3
    assert set(creator.to_df().columns) == two_token_columns

    # Saving and reloading, or rebuilding from the frame, must give an
    # instance equal to the original.
    creator.to_jsonl(jsonl_path)
    reloaded = PatternCreator.from_jsonl(jsonl_path)
    rebuilt = PatternCreator(storage=creator.to_df())

    assert creator == reloaded
    assert reloaded == rebuilt
def test_to_list(self):
    """Check that `to_list` returns one item per added pattern."""
    labelled_patterns = [
        ("ET1", "hello"),
        ("ET2", {"TEXT": "there"}),
        ("ET3", [{"TEXT": {"IN": ["world", "cake"]}}]),
        ("ET4", [{"TEXT": {"IN": ["aa", "bbb"]}}, {"TEXT": {"REGEX": "^s"}}]),
    ]

    creator = PatternCreator()
    for label, pattern in labelled_patterns:
        creator.add(label, pattern)

    exported = creator.to_list()

    assert len(exported) == 4
def test_drop(self):
    """Check the frame index before and after dropping two rows."""
    creator = PatternCreator()
    for label, word in (
        ("ET1", "hello"),
        ("ET1", "there"),
        ("ET2", "world"),
        ("ET4", "dog"),
    ):
        creator.add(label, word)

    index_before = creator.to_df().index.to_list()
    assert index_before == list(range(4))

    creator.drop([1, 2])

    # Two rows remain and the index runs 0..1 again, with no gaps.
    index_after = creator.to_df().index.to_list()
    assert index_after == list(range(2))
def test_raw2row2raw(self, raw):
    """Check that `raw2row` followed by `row2raw` gives back the input."""
    as_row = PatternCreator.raw2row(raw)
    round_tripped = PatternCreator.row2raw(as_row)

    assert raw == round_tripped