Esempio n. 1
0
    def test_equality(self):
        """Compare a creator with itself, a foreign type and another creator."""
        empty = PatternCreator()
        populated = PatternCreator()
        populated.add("ETYPE", "hello")

        # A creator is equal to itself.
        assert empty == empty
        # An operand of an unrelated type compares unequal.
        assert empty != "wrong type"
        # Only one of the two creators holds a pattern (different columns).
        assert empty != populated
Esempio n. 2
0
    def test_errors(self):
        """Exercise the inputs that ``PatternCreator.add`` is expected to reject."""
        creator = PatternCreator()

        # An integer pattern is an invalid type.
        with pytest.raises(TypeError):
            creator.add("ETYPE", 234324)

        # Register a pattern once so the next call is a duplicate.
        creator.add("ETYPE", "hello")

        # Same label and pattern again, with the existence check enabled.
        with pytest.raises(ValueError):
            creator.add("ETYPE", "hello", check_exists=True)

        # A list pattern whose element has the wrong contents.
        with pytest.raises(ValueError):
            creator.add("etype", [{"a": 1, "b": 2}])
Esempio n. 3
0
    def test_row2raw(self):
        """Check ``row2raw`` on two invalid rows and one valid two-token row."""
        # Single-token row whose value type is not a usable type name.
        base_row = {
            "label": "et1",
            "attribute_0": "TEXT",
            "value_0": "aaa",
            "value_type_0": "wrong_type",
            "op_0": "",
        }

        # unsupported value_type - eval fails
        with pytest.raises(NameError):
            PatternCreator.row2raw(pd.Series(base_row))

        # already the first token is invalid
        missing_first_attribute = {**base_row, "attribute_0": np.nan}
        with pytest.raises(ValueError):
            PatternCreator.row2raw(pd.Series(missing_first_attribute))

        # Valid first token followed by a second token with a NaN attribute.
        two_token_row = {
            **base_row,
            "value_type_0": "str",
            "attribute_1": np.nan,
            "value_1": "bbb",
            "value_type_1": "int",
            "op_1": "!",
        }
        raw = PatternCreator.row2raw(pd.Series(two_token_row))

        # Only the first token is expected in the resulting pattern.
        assert raw == {"label": "et1", "pattern": [{"TEXT": "aaa"}]}
Esempio n. 4
0
    def test_raw2row(self):
        """Check that ``raw2row`` raises ``TypeError`` for malformed raw patterns."""
        malformed_inputs = (
            # pattern not a list
            {"label": "ET1", "pattern": {"LOWER": "TEXT"}},
            # label not a str
            {"label": 232, "pattern": [{"LOWER": "TEXT"}]},
            # element not dictionary
            {"label": "etype", "pattern": [11]},
        )

        for raw in malformed_inputs:
            with pytest.raises(TypeError):
                PatternCreator.raw2row(raw)
Esempio n. 5
0
    def test_to_df(self):
        """Check that editing a frame from ``to_df`` does not affect later calls."""
        creator = PatternCreator()
        for word in ("hello", "there"):
            creator.add("ET1", word)

        untouched = creator.to_df()
        tampered = creator.to_df()

        # Modify only one of the two returned frames.
        tampered.loc[0, "label"] = "REPLACED_LABEL"

        # Export again after the modification.
        exported_after = creator.to_df()

        assert not untouched.equals(tampered)
        assert untouched.equals(exported_after)
Esempio n. 6
0
    def test_call(self):
        """Call the creator on text before and after dropping its only pattern."""
        creator = PatternCreator()
        creator.add("new_entity_type", "tall")

        sentence = "I saw a tall building."

        # With the pattern registered, exactly one entity is expected.
        annotated = creator(sentence)
        assert len(annotated.ents) == 1
        first_entity = list(annotated.ents)[0]
        assert first_entity.label_ == "new_entity_type"

        # After dropping row 0, the same text should yield no entities.
        creator.drop(0)
        annotated_after_drop = creator(sentence)

        assert len(annotated_after_drop.ents) == 0
Esempio n. 7
0
    def test_overall(self, tmpdir):
        """Add str, dict and list patterns, then round-trip through JSONL and a frame."""
        jsonl_path = pathlib.Path(str(tmpdir)) / "patterns.json"

        # Columns expected for the first and the second token of a pattern.
        first_token_columns = {
            "label",
            "attribute_0",
            "value_0",
            "value_type_0",
            "op_0",
        }
        second_token_columns = {
            "attribute_1",
            "value_1",
            "value_type_1",
            "op_1",
        }

        creator = PatternCreator()

        # A new creator exports an empty frame.
        assert len(creator.to_df()) == 0

        # Pattern given as a plain string.
        creator.add("NEW_ENTITY_TYPE", "cake")

        assert len(creator.to_df()) == 1
        assert set(creator.to_df().columns) == first_token_columns

        # Pattern given as a single dictionary.
        creator.add("COOL_ENTITY_TYPE", {"LEMMA": "pancake", "OP": "*"})

        assert len(creator.to_df()) == 2

        # Pattern given as a list of two dictionaries.
        creator.add("SOME_ENTITY_TYPE", [{"TEXT": "good"}, {"TEXT": "pizza"}])

        assert len(creator.to_df()) == 3
        assert set(creator.to_df().columns) == first_token_columns | second_token_columns

        # Rebuild the creator from a JSONL file and from its exported frame.
        creator.to_jsonl(jsonl_path)
        from_file = PatternCreator.from_jsonl(jsonl_path)
        from_frame = PatternCreator(storage=creator.to_df())

        assert creator == from_file
        assert from_file == from_frame
Esempio n. 8
0
    def test_to_list(self):
        """Check that ``to_list`` returns one item per added pattern."""
        patterns_to_add = [
            ("ET1", "hello"),
            ("ET2", {"TEXT": "there"}),
            ("ET3", [{"TEXT": {"IN": ["world", "cake"]}}]),
            ("ET4", [{"TEXT": {"IN": ["aa", "bbb"]}}, {"TEXT": {"REGEX": "^s"}}]),
        ]

        creator = PatternCreator()
        for label, pattern in patterns_to_add:
            creator.add(label, pattern)

        exported = creator.to_list()

        assert len(exported) == 4
Esempio n. 9
0
    def test_drop(self):
        """Drop two of four patterns and check the index of the exported frame."""
        creator = PatternCreator()
        for label, word in (
            ("ET1", "hello"),
            ("ET1", "there"),
            ("ET2", "world"),
            ("ET4", "dog"),
        ):
            creator.add(label, word)

        assert creator.to_df().index.to_list() == [0, 1, 2, 3]

        creator.drop([1, 2])

        # The two remaining rows are expected to be indexed from zero again.
        assert creator.to_df().index.to_list() == [0, 1]
Esempio n. 10
0
 def test_raw2row2raw(self, raw):
     """Check that ``raw2row`` followed by ``row2raw`` reproduces ``raw``."""
     row = PatternCreator.raw2row(raw)
     round_tripped = PatternCreator.row2raw(row)

     assert raw == round_tripped