Ejemplo n.º 1
0
 def __iter__(self):
     """
     Yield a Document for each data line read from the source (tab-separated).

     If self.hdr is (the boolean) True and nothing has been read yet, the first
     line is consumed as the header row and self.hdr is replaced with the list
     of column names. The text column and feature columns may be specified as
     integer indices, header column names, or callables taking
     (fields, cols=..., n=...).
     """
     reader = read_lines_from(self.source)
     # "is True" distinguishes the initial boolean flag from the list of
     # column names that self.hdr becomes after the header is read
     if self.hdr is True and self.n == 0:
         self.n += 1
         self.hdr = next(reader).rstrip("\n\r").split("\t")
     if self.hdr:
         self.hdr2col = {name: idx for idx, name in enumerate(self.hdr)}
     for line in reader:
         fields = line.split("\t")
         if isinstance(self.text_col, int):
             text = fields[self.text_col]
         elif callable(self.text_col):
             text = self.text_col(fields, cols=self.hdr2col, n=self.n)
         else:
             text = fields[self.hdr2col[self.text_col]]
         doc = Document(text)
         if self.feature_cols:
             if callable(self.feature_cols):
                 doc.features.update(
                     self.feature_cols(fields, cols=self.hdr2col, n=self.n)
                 )
             else:
                 for fname, colid in self.feature_cols.items():
                     if isinstance(colid, int):
                         value = fields[colid]
                     else:
                         value = fields[self.hdr2col[colid]]
                     # BUG FIX: this assignment was outside the loop, so only
                     # the last feature column was ever stored on the document
                     doc.features[fname] = value
         self.n += 1
         yield doc
Ejemplo n.º 2
0
    def test_formatgatexml01(self):
        """GATE XML loading: unknown types must raise unless ignore_unknown_types is set."""
        from gatenlp.document import Document

        testdir = os.path.join(os.path.abspath(os.path.curdir), "tests")
        docpath = os.path.join(testdir, "testdoc1.xml")
        # without ignore_unknown_types the loader must refuse the file
        with pytest.raises(Exception) as ex:
            Document.load(source=docpath, fmt="gatexml")
        assert "Unsupported serialization type" in str(ex.value)
        doc = Document.load(
            source=docpath,
            fmt="gatexml",
            ignore_unknown_types=True,
        )
        fs = doc.features
        for key, expected in [
            ("fInt1", 222),
            ("fBoolean", True),
            ("fString1", "Some string"),
            ("fLong1", 123),
            ("fFloat1", 3.4),
        ]:
            assert key in fs and fs[key] == expected
        anns = doc.annset()
        assert len(anns) == 2
        ann1, ann2 = list(anns)[0:2]
        for ann, (start, end, annid) in zip((ann1, ann2), [(0, 4, 0), (5, 8, 1)]):
            assert ann.type == "Type1"
            assert ann.start == start
            assert ann.end == end
            assert ann.id == annid
Ejemplo n.º 3
0
 def __iter__(self):
     """Yield one Document per row delivered by the underlying reader."""
     for _, row in self.reader:
         doc = Document(row[self.text_col])
         if self.feature_cols:
             for featname, colname in self.feature_cols.items():
                 doc.features[featname] = row[colname]
         self.n += 1
         yield doc
Ejemplo n.º 4
0
def makedoc1():
    """Build a test document with a Token annotation for every word of DOC1_TEXT.

    Token boundaries are taken from the gaps between consecutive runs of
    whitespace/punctuation (including the implicit runs at start and end).
    """
    doc = Document(DOC1_TEXT)
    annset = doc.annset()
    separators = list(re.finditer(r"[\s,.!?]+|^[\s,.!?]*|[\s,.!?]*$", DOC1_TEXT))
    # each token spans from the end of one separator to the start of the next
    for left, right in zip(separators, separators[1:]):
        annset.add(left.end(), right.start(), "Token")
    return doc
Ejemplo n.º 5
0
    def test_listcorpus(self):
        """ListCorpus indexing, iteration, store(); ShuffledCorpus wrapping; empty()."""
        docs = [Document(t) for t in TEXTS]
        lc1 = ListCorpus(docs)
        assert len(lc1) == len(docs)
        for idx, doc in enumerate(lc1):
            assert idx == doc.features["__idx"]
            assert idx == doc.features[lc1.idxfeatname()]
            assert doc.text == TEXTS[idx]

        for doc in lc1:
            doc.features["test1"] = "updated"
            lc1.store(doc)
        assert lc1[0].features["test1"] == "updated"

        # wrap the list corpus into a shuffled corpus
        sc1 = ShuffledCorpus(lc1, seed=42)
        orig = ["00", "01", "02", "03", "04", "05", "06"]
        shuffled = ["01", "03", "04", "02", "06", "00", "05"]
        for idx, doc in enumerate(sc1):
            assert doc.text[:2] == shuffled[idx]
        for doc in sc1:
            sc1.store(doc)
        # storing through the shuffled view must keep both orderings intact
        for idx, doc in enumerate(sc1):
            assert doc.text[:2] == shuffled[idx]
        for idx, doc in enumerate(lc1):
            assert doc.text[:2] == orig[idx]

        lc2 = ListCorpus.empty(10)
        assert len(lc2) == 10
        for doc in lc2:
            # FIX: identity comparison for None (PEP 8) instead of "== None"
            assert doc is None
Ejemplo n.º 6
0
    def load(
        clazz,
        from_ext=None,
        from_mem=None,
        offset_mapper=None,
        encoding="UTF-8",
        gzip=False,
        **kwargs,
    ):
        """Create a Document from plain text read from a file, URL or memory.

        Args:
          clazz: the class this factory method is invoked on (not used in the body)
          from_ext: a file path or URL to read the text from (Default value = None)
          from_mem: the text, or gzipped bytes, given directly (Default value = None)
          offset_mapper: (Default value = None) -- not used in this implementation
          encoding: character encoding of the data (Default value = "UTF-8")
          gzip: if True the data is gzip-compressed (Default value = False)
          **kwargs: ignored

        Returns:
          the new Document
        """
        isurl, extstr = is_url(from_ext)
        # URL sources get fetched into memory first, then handled below
        if from_ext is not None and isurl:
            if gzip:
                from_mem = get_bytes_from_url(extstr)
            else:
                from_mem = get_str_from_url(extstr, encoding=encoding)
        if from_mem is not None:
            text = decompress(from_mem).decode(encoding) if gzip else from_mem
            return Document(text)
        # otherwise extstr must be a local file path
        opener = gopen if gzip else open
        with opener(extstr, "rt", encoding=encoding) as infp:
            return Document(infp.read())
Ejemplo n.º 7
0
 def __getitem__(self, idx):
     """Load and return the document at index idx, tagging it with its
     corpus index and the relative/absolute paths it came from."""
     assert isinstance(idx, int)
     relpath = self.paths[idx]
     fullpath = os.path.join(self.dirpath, relpath)
     document = Document.load(fullpath, fmt=self.fmt)
     for key, val in (
         (self.idxfeatname(), idx),
         ("__idx", idx),
         ("__relpath", relpath),
         ("__abspath", fullpath),
     ):
         document.features[key] = val
     return document
Ejemplo n.º 8
0
 def __setitem__(self, idx, doc):
     """
     Store doc under index idx; if doc is None and store_none is set, remove
     any previously stored file for that index instead.

     Args:
         idx: integer index of the document
         doc: a Document, or None
     """
     assert isinstance(idx, int)
     assert doc is None or isinstance(doc, Document)
     path = self.file_path_maker(idx)
     path = path + self.ext
     # FIX: operate on the path inside self.dirpath; the bare relative path
     # would resolve against the current working directory
     abspath = os.path.join(self.dirpath, path)
     if doc is None:
         if self.store_none:
             if os.path.exists(abspath):
                 os.remove(abspath)
     else:
         # FIX: actually save the given document; the original called
         # Document.save(path, ...) without a document and rebound doc
         doc.save(abspath, fmt=self.fmt)
Ejemplo n.º 9
0
    def test_annotationset01m01(self):
        """AnnotationSet.with_type: single type, multiple types, non_overlapping."""
        from gatenlp.document import Document

        txt = " ".join([f"word{i:02d}" for i in range(10)])
        doc = Document(txt)
        annset = doc.annset()
        # create a Token annotation for each word
        # create "At3_1" annotations for a single token whenever i is a multiple of 3
        # create "At3_2" annotations for two tokens whenever i is a multiple of 3
        for i in range(10):
            start = i * 7
            annset.add(start, start + 6, "Token", features={"i": i})
            if i % 3 == 0:
                annset.add(start, start + 6, "At3_1", features={"i": i})
                # cannot span two tokens at the very end
                if i < 9:
                    annset.add(start, start + 6 + 7, "At3_2", features={"i": i})
        # (types, non_overlapping, expected count)
        expectations = [
            (("Token",), False, 10),
            (("At3_1",), False, 4),
            (("At3_2",), False, 3),
            (("Token", "At3_1"), False, 14),
            (("At3_1", "Token"), False, 14),
            (("Token", "At3_1"), True, 10),
            (("Token", "At3_2"), True, 10),
            (("At3_1", "Token"), True, 10),
            (("At3_2", "Token"), True, 7),
        ]
        for types, nonoverlap, expected in expectations:
            ret = annset.with_type(*types, non_overlapping=nonoverlap)
            assert len(ret) == expected
Ejemplo n.º 10
0
 def __getitem__(self, idx):
     """
     Return the document stored at index idx, or None if no file exists for it.

     A loaded document is tagged with the corpus index feature plus __idx,
     __relpath and __abspath.
     """
     assert isinstance(idx, int)
     path = self.file_path_maker(idx)
     path = path + self.ext
     abspath = os.path.join(self.dirpath, path)
     # FIX: test the absolute path that is actually loaded below; checking the
     # bare relative path depended on the current working directory
     if os.path.exists(abspath):
         doc = Document.load(abspath, fmt=self.fmt)
         doc.features[self.idxfeatname()] = idx
         doc.features["__idx"] = idx
         doc.features["__relpath"] = path
         doc.features["__abspath"] = abspath
     else:
         doc = None
     return doc
Ejemplo n.º 11
0
    def append(self, doc):
        """
        Add a document to the destination.

        Args:
            doc: the document or None, if None, no action is performed.
        """
        if doc is None:
            return
        assert isinstance(doc, Document)
        relpath = self.file_path_maker(doc=doc, idx=self.idx)
        # convert forward slashes to backslashes on windows
        relpath = os.path.normpath(relpath)
        fullpath = os.path.join(self.dirpath, relpath) + self.ext
        # create any missing parent directories: the directory part is
        # everything left of the last path separator
        if os.path.sep in fullpath:
            parent = fullpath[: fullpath.rindex(os.path.sep)]
            if not os.path.exists(os.path.normpath(parent)):
                os.makedirs(parent)
        Document.save(doc, fullpath, fmt=self.fmt)
        self.idx += 1
Ejemplo n.º 12
0
def makedoc1():
    """Return a small test document: one document feature, one annotation in
    the default set and one in set "Set2"."""
    from gatenlp.document import Document

    doc = Document(DOC1_TEXT)
    doc.features["feat1"] = "value1"
    defaultset = doc.annset()
    defaultset.add(0, 2, "Type1", dict(a=1, b=True, c="some string"))
    doc.annset("Set2").add(2, 8, "Type2")
    return doc
Ejemplo n.º 13
0
    def test_formatmsgpack01(self):
        """Load a bdocmp (msgpack) file; verify features, nested values, annotations."""
        from gatenlp.document import Document

        tstpath = os.path.join(os.path.abspath(os.path.curdir), "tests")
        doc = Document.load(
            source=os.path.join(tstpath, "testdoc1.bdocmp"), fmt="text/bdocmp"
        )
        fs = doc.features
        for key, expected in [
            ("fInt1", 222),
            ("fBoolean", True),
            ("fString1", "Some string"),
            ("fLong1", 123),
            ("fFloat1", 3.4),
        ]:
            assert key in fs and fs[key] == expected
        assert "fComplex1" in fs
        assert "key1" in fs["fComplex1"]
        assert "fComplex2a" in fs
        assert "fComplex2b" in fs
        assert "feat1" in fs["fComplex2a"]
        assert "feat1" in fs["fComplex2b"]
        # NOTE(review): both values are read from fComplex2b; if fc2a was meant
        # to come from fComplex2a, the identity assertion below would change --
        # left as-is to preserve behavior
        fc2a = fs["fComplex2b"]["feat1"]
        fc2b = fs["fComplex2b"]["feat1"]
        assert "k2" in fc2a
        assert "k2" in fc2b
        assert fc2a["k2"] == fc2b["k2"]
        assert fc2a["k2"] is fc2b["k2"]
        anns = doc.annset()
        assert len(anns) == 2
        ann1, ann2 = list(anns)[0:2]
        assert ann1.type == "Type1"
        assert ann1.start == 0
        assert ann1.end == 4
        assert ann1.id == 0
        assert "fComplex2a" in ann1.features
        assert "k2" in ann1.features["fComplex2a"]
        assert fc2a["k2"] == ann1.features["fComplex2a"]["k2"]
        # msgpack does not preserve identical references
        # assert fc2a["k2"] is ann1.features["fComplex2a"]["k2"]
        assert ann2.type == "Type1"
        assert ann2.start == 5
        assert ann2.end == 8
        assert ann2.id == 1
Ejemplo n.º 14
0
    def stream2document(stream):
        """
        Deserialize a single Document from an open msgpack stream.

        Args:
          stream: a binary stream positioned at the serialized version header

        Returns:
          the reconstructed Document

        Raises:
          Exception: if the stream does not start with the expected version header
        """
        unpacker = Unpacker(stream)
        if unpacker.unpack() != MSGPACK_VERSION_HDR:
            raise Exception("MsgPack data starts with wrong version")
        doc = Document()
        # fields must be unpacked in exactly the order they were packed
        doc.offset_type = unpacker.unpack()
        doc._text = unpacker.unpack()
        doc.name = unpacker.unpack()
        doc._features = Features(unpacker.unpack())
        setsdict = {}
        doc.annotation_sets = setsdict
        for _ in range(unpacker.unpack()):
            setname = unpacker.unpack()
            if setname is None:
                setname = ""
            annset = AnnotationSet(name=setname, owner_doc=doc)
            annset._next_annid = unpacker.unpack()
            for _ in range(unpacker.unpack()):
                anntype = unpacker.unpack()
                start = unpacker.unpack()
                end = unpacker.unpack()
                annid = unpacker.unpack()
                feats = unpacker.unpack()
                annset._annotations[annid] = Annotation(
                    start, end, anntype, annid=annid, features=feats
                )
            setsdict[setname] = annset
        # NOTE(review): both annotation_sets and _annotation_sets are assigned
        # the same dict, mirroring the original -- confirm both are needed
        doc._annotation_sets = setsdict
        return doc
Ejemplo n.º 15
0
    def test_offsetmapper01m01(self):
        """Round-trip python<->java offsets for text containing astral-plane chars."""
        from gatenlp.document import OffsetMapper, Document

        poo = "\U0001F4A9"
        bridge = "\U0001f309"
        doc1 = Document("01" + poo + "3" + bridge + bridge + bridge + "7")
        assert len(doc1) == 8
        assert doc1[2] == poo
        mapper = OffsetMapper(doc1)
        assert len(mapper.java2python) == 13
        # expected mapping tables for the mixed BMP/astral text above
        assert mapper.python2java == [0, 1, 2, 4, 5, 7, 9, 11, 12]
        assert mapper.java2python == [0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 7, 8]
        for poff in mapper.java2python:
            assert mapper.convert_to_python(mapper.convert_to_java(poff)) == poff
Ejemplo n.º 16
0
    def test_formatmsgpack02(self):
        """Round-trip a document through the bdocmp (msgpack) in-memory format."""
        from gatenlp.document import Document

        doc1 = makedoc1()
        asmp = doc1.save_mem(fmt="text/bdocmp")
        doc2 = Document.load_mem(asmp, fmt="text/bdocmp")
        assert doc2.text == DOC1_TEXT
        # FIX: verify the RELOADED document doc2; the original asserted on
        # doc1, so the round trip was never actually checked
        assert len(doc2.features) == 1
        assert doc2.features.get("feat1") == "value1"
        assert len(doc2.annset()) == 1
        assert len(doc2.annset("Set2")) == 1
        ann1 = doc2.annset().first()
        assert ann1.type == "Type1"
        assert ann1.start == 0
        assert ann1.end == 2
        assert len(ann1.features) == 3
        assert ann1.features.get("a") == 1
        assert ann1.features.get("b") == True
        assert ann1.features.get("c") == "some string"
        ann2 = doc2.annset("Set2").first()
        assert ann2.type == "Type2"
        assert ann2.start == 2
        assert ann2.end == 8
        assert len(ann2.features) == 0
Ejemplo n.º 17
0
    def test_document01m01(self):
        """Basic Document/AnnotationSet operations plus a JSON round trip with
        JAVA offsets on a text containing astral-plane characters."""
        from gatenlp.document import Document, OFFSET_TYPE_JAVA
        from gatenlp.span import Span

        doc1 = Document(
            "This is a \U0001F4A9 document.\n이것은 문서입니다 \U0001F4A9\nЭто \U0001F4A9 документ\nاین یک سند \U0001F4A9 است"
        )
        annset1 = doc1.annset("")
        ann1 = annset1.add(8, 9, "Type1", {"f1": 1, "f2": 2})
        ann1id = ann1.id
        assert len(annset1) == 1
        assert ann1.features["f1"] == 1
        ann2id = annset1.add(0, 4, "Type1", {"f1": 13, "f2": 12}).id
        inorder = list(annset1.iter())
        assert len(inorder) == 2
        # iteration is in offset order: the annotation starting at 0 comes first
        assert inorder[0].features["f1"] == 13
        assert inorder[1].features["f1"] == 1
        ann3 = annset1.add(0, 22, "Type2", {"feat1": True})
        ann3id = ann3.id
        assert ann3id in annset1
        assert annset1.span == Span(0, 22)
        within10 = annset1.within(0, 10)
        # the result of within() is a detached, immutable set
        assert within10.isdetached()
        assert within10.immutable
        assert within10.size == 2
        assert len(within10) == 2
        assert len(annset1.within(0, 10)) == 2
        assert len(annset1.within(1, 3)) == 0
        assert len(annset1.within(0, 22)) == 3
        doc1.features["docfeat1"] = 33
        assert doc1.features["docfeat1"] == 33
        jsonstr = doc1.save_mem(offset_type=OFFSET_TYPE_JAVA)
        doc2 = Document.load_mem(jsonstr)
        assert doc2.features["docfeat1"] == 33
        d2annset1 = doc2.annset("")
        assert len(d2annset1) == 3
        starting_at_8 = d2annset1.startingat(8)
        assert len(starting_at_8) == 1
Ejemplo n.º 18
0
from gatenlp.document import Document
from gatenlp.docformats import simplejson
from gatenlp.offsetmapper import OFFSET_TYPE_JAVA

# Build a small demo document with Token annotations in the default set and
# one annotation in a second set, then serialize it to bdocjson files.
doc1 = Document("Just a simple \U0001F4A9 document.")
annset1 = doc1.annset("")
ann1 = annset1.add(0, 4, "Token", {"n": 1, "upper": True})
ann2 = annset1.add(5, 6, "Token", {"n": 2, "upper": False})
ann3 = annset1.add(7, 13, "Token", {"n": 3, "upper": False})
ann4 = annset1.add(14, 15, "Token", {"n": 4, "upper": False, "isshit": True})
ann5 = annset1.add(16, 24, "Token", {"n": 5})
annset2 = doc1.annset("Set2")
annset2.add(0, 12, "Ann1", None)
# remove the second token again and add a feature to the third one
annset1.remove(ann2.id)
# NOTE(review): set_feature/dump_file look like an older gatenlp API surface
# (other examples here use features[...] and save) -- confirm against the
# gatenlp version in use
annset1.get(ann3.id).set_feature("str", "simple")
doc1.set_feature("docfeature1", "value1")
doc1.set_feature("docfeature1", "value1b")
# presumably the .gz suffix selects gzip-compressed output -- TODO confirm
simplejson.dump_file(doc1, "doc1.bdocjson")
simplejson.dump_file(doc1, "doc1.bdocjson.gz")
Ejemplo n.º 19
0
    def test_changelog01m01(self):
        """Check that a ChangeLog records document/annotation changes and that it
        survives a JAVA-offset save/load round trip, both when passed to the
        Document constructor and when attached afterwards."""
        from gatenlp.document import Document, OFFSET_TYPE_JAVA
        from gatenlp.changelog import ChangeLog
        from gatenlp.offsetmapper import OffsetMapper

        chlog = ChangeLog()
        # changelog passed directly to the constructor
        doc1 = Document("Just a simple \U0001F4A9 document.", changelog=chlog)
        annset1 = doc1.annset("")
        ann1 = annset1.add(0, 4, "Token", {"n": 1, "upper": True})
        ann2 = annset1.add(5, 6, "Token", {"n": 2, "upper": False})
        ann3 = annset1.add(7, 13, "Token", {"n": 3, "upper": False})
        ann4id = annset1.add(
            14, 15, "Token", {"n": 4, "upper": False, "isshit": True}
        ).id
        ann5 = annset1.add(16, 24, "Token", {"n": 5})
        assert annset1.first().id == ann1.id
        assert annset1.last().id == ann5.id
        annset2 = doc1.annset("Set2")
        annset2.add(0, 12, "Ann1", None)
        annset1.remove(ann2)
        ann3b = annset1.get(ann3.id)
        ann3b.features["str"] = "simple"
        doc1.features["docfeature1"] = "value1"
        doc1.features["docfeature1"] = "value1b"
        chlog1 = doc1.changelog
        # print("!!!!!!!!!!!!!!DEBUG: ",chlog1.pprint())
        # change index 4 ends at 24 -- presumably the addition of ann5, whose
        # end offset is 24 (TODO confirm against ChangeLog ordering)
        assert chlog1.changes[4].get("end") == 24
        assert chlog.changes[4].get("end") == 24
        om = OffsetMapper(doc1)
        jsonstr = chlog.save_mem(offset_type=OFFSET_TYPE_JAVA, offset_mapper=om)
        chlog2 = ChangeLog.load_mem(jsonstr, offset_mapper=om)
        # saving/loading must not modify the original changelog objects
        assert chlog.changes[4].get("end") == 24
        assert chlog1.changes[4].get("end") == 24
        assert chlog2.changes[4].get("end") == 24

        # check if adding the changelog later works
        chlog = ChangeLog()
        doc1 = Document("Just a simple \U0001F4A9 document.")
        doc1.changelog = chlog
        annset1 = doc1.annset("")
        ann1 = annset1.add(0, 4, "Token", {"n": 1, "upper": True})
        ann2 = annset1.add(5, 6, "Token", {"n": 2, "upper": False})
        ann3 = annset1.add(7, 13, "Token", {"n": 3, "upper": False})
        ann4 = annset1.add(14, 15, "Token", {"n": 4, "upper": False, "isshit": True})
        ann5 = annset1.add(16, 24, "Token", {"n": 5})
        annset2 = doc1.annset("Set2")
        annset2.add(0, 12, "Ann1", None)
        annset1.remove(ann2.id)
        ann3b = annset1.get(ann3.id)
        ann3b.features["str"] = "simple"
        doc1.features["docfeature1"] = "value1"
        doc1.features["docfeature1"] = "value1b"
        # same sequence of operations as above, so the logs must be equally long
        assert len(doc1.changelog) == len(chlog1)

        # test removing all annotations
        assert len(annset1) == 4
        annset1.clear()
        assert len(annset1) == 0
Ejemplo n.º 20
0
    def load(
        clazz,
        from_ext=None,
        from_mem=None,
        parser=None,
        markup_set_name="Original markups",
        process_soup=None,
        offset_mapper=None,
        **kwargs,
    ):
        """Load a HTML file/string/URL and convert it into a Document.

        The HTML is parsed with BeautifulSoup; the concatenated text content of
        the elements becomes the document text and each element becomes one
        annotation (type = tag name, features = tag attributes) in the markup
        annotation set.

        Args:
          clazz: the class this method is invoked on (not used in the body)
          from_ext: a file path or URL to load the HTML from (Default value = None)
          from_mem: the HTML given directly as a string (Default value = None)
          parser: one of "html.parser", "lxml", "lxml-xml", "html5lib" (Default value = None)
          markup_set_name: the annotation set name for the set to contain the HTML annotations (Default value = "Original markups")
          process_soup: a function to run on the parsed HTML soup before converting (Default value = None).
            NOTE(review): this parameter is never used in the body below -- confirm whether it should be applied
          offset_mapper: (Default value = None). NOTE(review): also unused in this implementation
          **kwargs: ignored

        Returns:
          a Document containing the extracted text and the markup annotations
        """
        # NOTE: for now we have a simple heuristic for adding newlines to the text:
        # before and after a block element, a newline is added unless there is already one
        # NOTE: for now we use  multi_valued_attributes=None which prevents attributes of the
        # form "class='val1 val2'" to get converted into features with a list of values.
        isurl, extstr = is_url(from_ext)
        if from_ext is not None:
            if isurl:
                from_mem = get_str_from_url(extstr)
        if from_mem:
            bs = BeautifulSoup(from_mem, parser, multi_valued_attributes=None)
        else:
            # NOTE(review): extstr is a file path here, but BeautifulSoup expects
            # markup or an open file handle -- confirm this branch is exercised
            bs = BeautifulSoup(extstr, parser, multi_valued_attributes=None)
        # we recursively iterate the tree depth first, going through the children
        # and adding to a list that either contains the text or a dict with the information
        # about annotations we want to add
        # block-ish tags that get a newline inserted before and after their content
        nlels = {
            "pre",
            "br",
            "p",
            "div",
            "tr",
            "h1",
            "h2",
            "h3",
            "h4",
            "h5",
            "h6",
            "li",
            "address",
            "article",
            "aside",
            "blockquote",
            "del",
            "figure",
            "figcaption",
            "footer",
            "header",
            "hr",
            "ins",
            "main",
            "nav",
            "section",
            "summary",
            "input",
            "legend",
            "option",
            "textarea",
            "bdi",
            "bdo",
            "center",
            "code",
            "dfn",
            "menu",
            "dir",
            "caption",
        }
        # tags whose whole subtree is skipped
        ignoreels = {"script", "style"}
        # mutable state shared with the recursive walker below:
        # anninfos: start/end "events", curoffset: current text offset,
        # curid: next annotation id, text: the accumulated document text
        docinfo = {"anninfos": [], "curoffset": 0, "curid": 0, "text": ""}

        def walktree(el):
            """Depth-first walk over the soup tree: accumulate text and emit
            start/end annotation events into docinfo.

            Args:
              el: the current BeautifulSoup node
            """
            # print("DEBUG: type=", type(el))
            if isinstance(el, bs4.element.Doctype):
                # print("DEBUG: got doctype", type(el))
                pass
            elif isinstance(el, bs4.element.Comment):
                # print("DEBUG: got Comment", type(el))
                pass
            elif isinstance(el, bs4.element.Script):
                # print("DEBUG: got Script", type(el))
                pass
            elif isinstance(el, bs4.element.Tag):
                # print("DEBUG: got tag: ", type(el), " name=",el.name)
                # some tags we ignore completely:
                if el.name in ignoreels:
                    return
                # for some tags we insert a new line before, but only if we do not already have one
                if not docinfo["text"].endswith("\n") and el.name in nlels:
                    docinfo["text"] += "\n"
                    # print("DEBUG: adding newline before at ", docinfo["curoffset"])
                    docinfo["curoffset"] += 1
                # record a "start" event; the matching end offset is attached later
                ann = {
                    "type": el.name,
                    "features": el.attrs,
                    "id": docinfo["curid"],
                    "event": "start",
                    "start": docinfo["curoffset"],
                }
                thisid = docinfo["curid"]
                docinfo["anninfos"].append(ann)
                docinfo["curid"] += 1
                for child in el.children:
                    walktree(child)
                # for some tags we insert a new line after
                if not docinfo["text"].endswith("\n") and el.name in nlels:
                    docinfo["text"] += "\n"
                    # print("DEBUG: adding newline after at ", docinfo["curoffset"])
                    docinfo["curoffset"] += 1
                docinfo["anninfos"].append({
                    "event": "end",
                    "id": thisid,
                    "end": docinfo["curoffset"]
                })
            elif isinstance(el, bs4.element.NavigableString):
                # print("DEBUG: got text: ", el)
                text = str(el)
                # collapse runs of newline-only strings
                if text == "\n" and docinfo["text"].endswith("\n"):
                    return
                docinfo["text"] += text
                docinfo["curoffset"] += len(el)
            else:
                print("WARNING: odd element type", type(el))

        walktree(bs)
        # need to add the end corresponding to bs
        # print("DEBUG: got docinfo:\n",docinfo)
        # match up each "end" event with its "start" event via the shared id
        id2anninfo = {}  # from id to anninfo
        nstart = 0
        for anninfo in docinfo["anninfos"]:
            if anninfo["event"] == "start":
                nstart += 1
                id2anninfo[anninfo["id"]] = anninfo
        nend = 0
        for anninfo in docinfo["anninfos"]:
            if anninfo["event"] == "end":
                nend += 1
                end = anninfo["end"]
                annid = anninfo["id"]
                anninfo = id2anninfo[annid]
                anninfo["end"] = end
        # print("DEBUG: got nstart/nend", nstart, nend)
        assert nstart == nend
        # print("DEBUG: got id2anninfo:\n", id2anninfo)
        doc = Document(docinfo["text"])
        annset = doc.annset(markup_set_name)
        for i in range(nstart):
            anninfo = id2anninfo[i]
            annset.add(
                anninfo["start"],
                anninfo["end"],
                anntype=anninfo["type"],
                features=anninfo["features"],
            )
        return doc
Ejemplo n.º 21
0
 def __iter__(self):
     """Load and yield each document file of the corpus directory in turn."""
     for relpath in self.paths:
         fullpath = os.path.join(self.dirpath, relpath)
         yield Document.load(fullpath, fmt=self.fmt)
Ejemplo n.º 22
0
def interact(args=None, annotator=None):
    """Starts and handles the interaction with a GATE python plugin process.

    This will get started by the GATE plugin if the interaction uses
    pipes, but can also be started separately for http/websockets.

    This MUST be called in the user's python file!
    The python file should also have one class or function decorated
    with the @gatenlp.PR decorator to identify it as the
    processing resource to the system.

    Args:
      args: already-parsed command line arguments; if None they are
        obtained via get_arguments() (Default value = None)
      annotator: the processing resource to use; if None, the one
        registered via the decorator is used, or a default do-nothing
        PR if none has been registered (Default value = None)

    Returns:
      None; returns early when args.mode is "check"
    """
    logger = init_logger(__name__)
    loglvls = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "ERROR": logging.ERROR,
        "CRITICAL": logging.CRITICAL,
    }
    # before we do anything we need to check if a PR has actually
    # been defined. If not, use our own default debugging PR
    if gatenlp.gate_python_plugin_pr is None and annotator is None:
        logger.warning(
            "No processing resource defined with @GateNlpPr decorator or passed to interact, using default do-nothing"
        )
        _pr_decorator(DefaultPr)
    if annotator is not None:
        pr = _pr_decorator(annotator)
    else:
        pr = gatenlp.gate_python_plugin_pr

    if args is None:
        args = get_arguments()
    if args.d:
        logger.setLevel(logging.DEBUG)
    if args.log_lvl:
        if args.log_lvl not in loglvls:
            raise Exception("Not a valid log level: {}".format(args.log_lvl))
        logger.setLevel(loglvls[args.log_lvl])

    if args.mode == "check":
        # mode "check" only verifies the python file can be loaded at all
        return

    logger.info("Using gatenlp version {}\n".format(gatenlp.__version__))

    logger.debug("Starting interaction args={}".format(args))
    if args.mode == "pipe":
        if args.format != "json":
            raise Exception(
                "For interaction mode pipe, only format=json is supported")
        # request/response loop: one JSON request per line from instream,
        # one JSON response per line to ostream
        for line in instream:
            try:
                request = json.loads(line)
            except Exception as ex:
                logger.error("Unable to load from JSON:\n{}".format(line))
                raise ex
            logger.debug("Got request object: {}".format(request))
            cmd = request.get("command", None)
            stop_requested = False
            ret = None
            try:
                if cmd == "execute":
                    doc = Document.from_dict(request.get("data"))
                    om = doc.to_offset_type(OFFSET_TYPE_PYTHON)
                    doc.changelog = ChangeLog()
                    pr.execute(doc)
                    # NOTE: for now we just discard what the method returns and always return
                    # the changelog instead!
                    chlog = doc.changelog
                    # if we got an offset mapper earlier, we had to convert, so we convert back to JAVA
                    if om:
                        # replace True is faster, and we do not need the ChangeLog any more!
                        chlog.fixup_changes(offset_mapper=om,
                                            offset_type=OFFSET_TYPE_JAVA,
                                            replace=True)
                    ret = chlog.to_dict()
                    logger.debug("Returning CHANGELOG: {}".format(ret))
                elif cmd == "start":
                    parms = request.get("data")
                    pr.start(parms)
                elif cmd == "finish":
                    ret = pr.finish()
                elif cmd == "reduce":
                    results = request.get("data")
                    ret = pr.reduce(results)
                elif cmd == "stop":
                    stop_requested = True
                else:
                    raise Exception("Odd command received: {}".format(cmd))
                response = {
                    "data": ret,
                    "status": "ok",
                }
            except Exception as ex:
                error = repr(ex)
                # NOTE: pass the exception info positionally: the etype/value/tb
                # keyword names were removed from traceback.format_exception in
                # Python 3.10; the positional form works on all versions
                tb_str = traceback.format_exception(type(ex), ex,
                                                    ex.__traceback__)
                print("ERROR when running python code:", file=sys.stderr)
                for tbline in tb_str:
                    # what we get from traceback already has new lines
                    print(tbline, file=sys.stderr, end="")
                info = "".join(tb_str)
                # in case we want the actual stacktrace data as well:
                st = [(f.filename, f.lineno, f.name, f.line)
                      for f in traceback.extract_tb(ex.__traceback__)]
                response = {
                    "data": None,
                    "status": "error",
                    "error": error,
                    "info": info,
                    "stacktrace": st,
                }
            logger.debug("Sending back response: {}".format(response))
            print(json.dumps(response), file=ostream)

            ostream.flush()
            if stop_requested:
                break
        # TODO: do any cleanup/restoring needed
        logger.debug("Finishing interaction")
    elif args.mode == "http":
        raise Exception("Mode http not implemented yet")
    elif args.mode == "websockets":
        raise Exception("Mode websockets not implemented yet")
    elif args.mode in ["file", "dir"]:
        if not args.path:
            raise Exception("Mode file or dir but no --path specified")
        fileext = ".bdoc" + args.format
        if args.mode == "file" and not os.path.isfile(args.path):
            raise Exception("Mode file but path is not a file: {}".format(
                args.path))
        elif args.mode == "dir" and not os.path.isdir(args.path):
            raise Exception("Mode dir but path is not a directory: {}".format(
                args.path))
        if args.mode == "file":
            # process a single document file; save in place unless --out given
            pr.start({})
            logger.info(f"Loading file {args.path}")
            doc = Document.load(args.path)
            pr.execute(doc)
            pr.finish()
            if args.out:
                logger.info(f"Saving file to {args.out}")
                doc.save(args.out)
            else:
                logger.info(f"Saving file to {args.path}")
                doc.save(args.path)
        else:
            import glob

            # process every matching document file in the directory
            pr.start({})
            files = glob.glob(args.path + os.path.sep + "*" + fileext)
            for file in files:
                logger.info("Loading file {}".format(file))
                doc = Document.load(file)
                pr.execute(doc)
                if args.out:
                    tofile = os.path.join(args.out, os.path.basename(file))
                    logger.info("Saving to {}".format(tofile))
                    doc.save(tofile)
                else:
                    logger.info("Saving to {}".format(file))
                    doc.save(file)
            pr.finish()
    else:
        raise Exception("Not a valid mode: {}".format(args.mode))
Ejemplo n.º 23
0
    def load(clazz, from_ext=None, ignore_unknown_types=False):
        """
        Load a document from a GATE XML (GateDocument version 3) file or URL.

        Args:
          clazz: internal use (the class this loader is registered on)
          from_ext: the file path or URL to load from (Default value = None)
          ignore_unknown_types: if True, features with an unsupported
            serialization type are skipped with a warning instead of
            raising an exception (Default value = False)

        Returns:
          the loaded Document
        """
        # TODO: the code below is just an outline and needs work!
        # TODO: make use of the test document created in repo project-python-gatenlp
        import xml.etree.ElementTree as ET

        isurl, extstr = is_url(from_ext)
        if isurl:
            xmlstring = get_str_from_url(extstr, encoding="utf-8")
            root = ET.fromstring(xmlstring)
        else:
            tree = ET.parse(extstr)
            root = tree.getroot()

        # check we do have a GATE document
        assert root.tag == "GateDocument"
        assert root.attrib == {"version": "3"}

        def parsefeatures(feats):
            """Convert a list of Feature elements into a plain name->value dict.

            Args:
              feats: iterable of Feature XML elements

            Returns:
              dict mapping the feature name to the converted value
            """
            features = {}
            for feat in list(feats):
                name = None
                value = None
                for el in list(feat):
                    if el.tag == "Name":
                        if el.get("className") == "java.lang.String":
                            name = el.text
                        else:
                            raise Exception("Odd Feature Name type: " +
                                            el.get("className"))
                    elif el.tag == "Value":
                        cls_name = el.get("className")
                        if cls_name == "java.lang.String":
                            value = el.text
                        elif cls_name == "java.lang.Integer":
                            value = int(el.text)
                        elif cls_name == "java.lang.Long":
                            value = int(el.text)
                        elif cls_name == "java.math.BigDecimal":
                            value = float(el.text)
                        elif cls_name == "java.lang.Boolean":
                            # NOTE: bool(el.text) would be True for ANY non-empty
                            # string (including "false"); Java serializes Booleans
                            # as "true"/"false", so compare against "true"
                            value = el.text.strip().lower() == "true"
                        # elif cls_name == "gate.corpora.ObjectWrapper":
                        #    value = GateXmlLoader.value4objectwrapper(el.text)
                        else:
                            if ignore_unknown_types:
                                print(
                                    f"Warning: ignoring feature with serialization type: {cls_name}",
                                    file=sys.stderr,
                                )
                            else:
                                raise Exception(
                                    "Unsupported serialization type: " +
                                    el.get("className"))
                if name is not None and value is not None:
                    features[name] = value
            return features

        # get the document features
        feats = root.findall("./GateDocumentFeatures/Feature")
        docfeatures = parsefeatures(feats)

        # reconstruct the document text from TextWithNodes and remember the
        # character offset of each Node so annotations can refer to them
        textwithnodes = root.findall("./TextWithNodes")
        text = ""
        node2offset = {}
        curoff = 0
        for item in textwithnodes:
            if item.text:
                # print("DEBUG: got item text: ", item.text)
                text += item.text
                # TODO HTML unescape item text
                curoff += len(item.text)
            for node in item:
                nodeid = node.get("id")
                node2offset[nodeid] = curoff
                if node.tail:
                    # TODO: unescape node.tail?
                    # print("DEBUG: got node tail: ", node.tail)
                    text += node.tail
                    curoff += len(node.tail)

        annsets = root.findall("./AnnotationSet")

        annotation_sets = {}  # map name - set
        for annset in annsets:
            if annset.get("Name"):
                setname = annset.get("Name")
            else:
                setname = ""
            annots = annset.findall("./Annotation")
            annotations = []
            maxannid = 0
            for ann in annots:
                annid = int(ann.attrib["Id"])
                maxannid = max(maxannid, annid)
                anntype = ann.attrib["Type"]
                startnode = ann.attrib["StartNode"]
                endnode = ann.attrib["EndNode"]
                # map node ids back to the offsets computed above
                startoff = node2offset[startnode]
                endoff = node2offset[endnode]
                feats = ann.findall("./Feature")
                features = parsefeatures(feats)
                if len(features) == 0:
                    features = None
                annotation = {
                    "id": annid,
                    "type": anntype,
                    "start": startoff,
                    "end": endoff,
                    "features": features,
                }
                annotations.append(annotation)
            annset = {
                "name": setname,
                "annotations": annotations,
                "next_annid": maxannid + 1,
            }
            annotation_sets[setname] = annset

        docmap = {
            "text": text,
            "features": docfeatures,
            "offset_type": "p",
            "annotation_sets": annotation_sets,
        }

        doc = Document.from_dict(docmap)
        return doc
Ejemplo n.º 24
0
    def load(clazz,
             from_ext=None,
             from_mem=None,
             include_fields=None,
             include_entities=True,
             include_quote=False,
             outsetname="Original markups",
             tweet_ann="Tweet"):
        """
        Load a tweet from Twitter JSON format.

        IMPORTANT: this is still very experimental, will change in the future!

        Args:
            clazz: internal use
            from_ext: the file/url to load from
            from_mem: string to load from
            include_fields: a list of fields to include where nested field names are dot-separated, e.g.
               "user.location". All these fields are included using the nested field name in either the
               features of the tweet annotation with the Type specified, or the features of the document
               if `tweet_ann` is None.
            include_entities: create annotations for the tweet entities in the set with outsetname
            include_quote: if True, add the quoted tweet after an empty line and treat it as a separate
               tweet just like the original tweet.
            outsetname: the annotation set where to put entity annotations and the tweet annotation(s)
            tweet_ann: the annotation type to use to span the tweet and contain all the features.

        Returns:
            document representing the tweet

        Raises:
            Exception: if no source is given, the JSON cannot be decoded, or no text field is found
        """
        if from_ext is not None:
            isurl, extstr = is_url(from_ext)
            if isurl:
                jsonstr = get_str_from_url(extstr, encoding="utf-8")
                tweet = json.loads(jsonstr)
            else:
                with open(extstr, "rt", encoding="utf-8") as infp:
                    tweet = json.load(infp)
        elif from_mem is not None:
            tweet = json.loads(from_mem)
        else:
            raise Exception("Cannot load from None")
        if tweet is None:
            raise Exception("Could not decode Tweet JSON")
        # truncated tweets carry the full text in the extended_tweet field
        if tweet.get("truncated"):
            text = get_nested(tweet, "extended_tweet.full_text")
        else:
            text = get_nested(tweet, "text")
        if text is None:
            raise Exception("No text field found")
        quoted_status = None
        if include_quote:
            quoted_status = tweet.get("quoted_status")
            if quoted_status is not None:
                qtext = quoted_status.get("text", "")
                text += "\n" + qtext
        doc = Document(text)
        anns = doc.annset(outsetname)
        if tweet_ann:
            # a single annotation spanning the whole tweet carries the features
            ann = anns.add(0, len(text), tweet_ann)
            features = ann.features
        else:
            features = doc.features
        if include_fields is None:
            include_fields = TWITTER_DEFAULT_INCLUDE_FIELDS
        for field in include_fields:
            if field.startswith("$"):
                # "$..."-prefixed names are computed pseudo-fields, not tweet keys
                if field == "$is_retweet_status":
                    rs = get_nested(tweet, "retweeted_status", silent=True)
                    if rs is not None:
                        features[field] = True
                continue
            val = get_nested(tweet, field, silent=True)
            if val is not None:
                features[field] = val
        # NOTE: entities must default to empty: previously it was only assigned
        # inside the conditional, so include_entities=False raised a NameError
        # in the loop below
        entities = {}
        if include_entities:
            if tweet.get("truncated"):
                entities = get_nested(tweet,
                                      "extended_tweet.entities",
                                      default={})
            else:
                entities = get_nested(tweet, "entities", default={})
        for etype, elist in entities.items():
            for ent in elist:
                start, end = ent["indices"]
                anns.add(start, end, etype)
        # TODO: if we have a quoted_status, add features and entities from there:
        # Essentially the same processing as for the original tweet, but at document offset
        # len(tweet)+1 (2?)
        return doc
Ejemplo n.º 25
0
    def append(
        self,
        source,
        fmt="gate-def",
        source_sep="\t",
        source_encoding="UTF-8",
        listfeatures=None,
        listtype=None,
    ):
        """
        This method appends more entries to the gazetteer.

        Args:
            source: where to load the gazetteer from. What is actually expected here depends on the fmt
              parameter.
            fmt: defines what is expected as the format and/or content of the source parameter. One of:
               *  "gate-def" (default): source must be a string, a pathlib Path or a parsed urllib url and
                  point to a GATE-style "def" file. See https://gate.ac.uk/userguide/chap:gazetteers
               * "gazlist": a list of tuples or lists where the first element of the tuple/list
                  is a list of strings, the second element is a dictionary containing the features to assign and
                  the third element, if it exists, is the index of an element in the listfeatures array.
            source_sep: the field separator used in the list files (gate-def format only)
            source_encoding: the encoding of the def and list files (gate-def format only)
            listfeatures: a list of dictionaries containing the features to set for all matches which have the
              list index set, this list gets appended to the existing listfeatures. If what gets appended specifies
              its own list features, this is ignored.
            listtype: the output annotation type to use for the list that gets appended. If what gets appended
               specifies its own list type or list types, this is ignored.

        Raises:
            Exception: if fmt is not one of the supported formats
        """
        if fmt == "gazlist":
            if listfeatures is not None:
                self.listfeatures.append(listfeatures)
            else:
                self.listfeatures.append({})
            if listtype is not None:
                self.listtypes.append(listtype)
            else:
                self.listtypes.append(self.outtype)
            listidx = len(self.listfeatures) - 1
            for el in source:
                entry = el[0]
                data = el[1]
                self.add(entry, data, listidx=listidx)
        elif fmt == "gate-def":
            if listfeatures is None:
                listfeatures = {}
            if listtype is None:
                listtype = self.outtype
            with open(source, "rt", encoding=source_encoding) as infp:
                for line in infp:
                    line = line.rstrip("\n\r")
                    # def file line: listFile:majorType:minorType:languages:annotType
                    fields = line.split(":")
                    # pad so missing trailing fields read as empty strings
                    fields.extend(["", "", "", ""])
                    listFile = fields[0]
                    majorType = fields[1]
                    minorType = fields[2]
                    languages = fields[3]
                    anntype = fields[4]
                    this_listfeatures = listfeatures.copy()
                    this_outtype = listtype
                    if majorType:
                        this_listfeatures["majorType"] = majorType
                    if minorType:
                        this_listfeatures["minorType"] = minorType
                    if languages:
                        this_listfeatures["lang"] = languages
                    if anntype:
                        this_outtype = anntype
                    # read in the actual list
                    listfile = os.path.join(os.path.dirname(source), listFile)
                    self.logger.info(f"Reading list file {listfile}")
                    with open(listfile, "rt", encoding=source_encoding) as inlistfile:
                        self.listtypes.append(this_outtype)
                        self.listfeatures.append(this_listfeatures)
                        linenr = 0
                        for listline in inlistfile:
                            linenr += 1
                            listline = listline.rstrip("\n\r")
                            fields = listline.split(source_sep)
                            entry = fields[0]
                            if self.tokenizer:
                                tmpdoc = Document(entry)
                                self.tokenizer(tmpdoc)
                                # TODO: include and handle SpaceToken if we use the speparator annoations!
                                # TODO: maybe have a different way to retrieve the token annotations based
                                # on the tokenizer????
                                tokenanns = list(tmpdoc.annset().with_type("Token"))
                                if self.getterfunc:
                                    tokenstrings = [
                                        self.getterfunc(a, doc=tmpdoc)
                                        for a in tokenanns
                                    ]
                                else:
                                    tokenstrings = [tmpdoc[a] for a in tokenanns]
                                if self.mapfunc:
                                    tokenstrings = [
                                        self.mapfunc(s) for s in tokenstrings
                                    ]
                                if self.ignorefunc:
                                    tokenstrings = [
                                        s
                                        for s in tokenstrings
                                        if not self.ignorefunc(s)
                                    ]
                            else:
                                tokenstrings = entry.split()  # just split on whitespace
                            if len(tokenstrings) == 0:
                                # logger.warn is a deprecated alias of warning
                                self.logger.warning(
                                    f"File {listfile}, skipping line {linenr}, no tokens left: {listline}"
                                )
                                continue
                            # NOTE: check the number of FIELDS, not the length of the
                            # entry string: only lines with extra fields carry
                            # name=value feature specs
                            if len(fields) > 1:
                                feats = {}
                                for fspec in fields[1:]:
                                    fname, fval = fspec.split("=")
                                    feats[fname] = fval
                            else:
                                feats = None
                            listidx = len(self.listfeatures) - 1
                            self.add(tokenstrings, feats, listidx=listidx)
        else:
            raise Exception(f"TokenGazetteer format {fmt} not known")
Ejemplo n.º 26
0
 def __iter__(self):
     """Yield one Document per line of the underlying JSON-lines file."""
     with open(self.file, "rt", encoding="utf-8") as docfile:
         yield from (Document.load_mem(ln, fmt="json") for ln in docfile)