Example #1
    def test_changelog01m01(self):
        from gatenlp.document import Document, OFFSET_TYPE_JAVA
        from gatenlp.changelog import ChangeLog
        from gatenlp.offsetmapper import OffsetMapper

        chlog = ChangeLog()
        doc1 = Document("Just a simple \U0001F4A9 document.", changelog=chlog)
        annset1 = doc1.annset("")
        ann1 = annset1.add(0, 4, "Token", {"n": 1, "upper": True})
        ann2 = annset1.add(5, 6, "Token", {"n": 2, "upper": False})
        ann3 = annset1.add(7, 13, "Token", {"n": 3, "upper": False})
        ann4id = annset1.add(
            14, 15, "Token", {"n": 4, "upper": False, "isshit": True}
        ).id
        ann5 = annset1.add(16, 24, "Token", {"n": 5})
        assert annset1.first().id == ann1.id
        assert annset1.last().id == ann5.id
        annset2 = doc1.annset("Set2")
        annset2.add(0, 12, "Ann1", None)
        annset1.remove(ann2)
        ann3b = annset1.get(ann3.id)
        ann3b.features["str"] = "simple"
        doc1.features["docfeature1"] = "value1"
        doc1.features["docfeature1"] = "value1b"
        chlog1 = doc1.changelog
        # print("!!!!!!!!!!!!!!DEBUG: ",chlog1.pprint())
        assert chlog1.changes[4].get("end") == 24
        assert chlog.changes[4].get("end") == 24
        om = OffsetMapper(doc1)
        jsonstr = chlog.save_mem(offset_type=OFFSET_TYPE_JAVA, offset_mapper=om)
        chlog2 = ChangeLog.load_mem(jsonstr, offset_mapper=om)
        assert chlog.changes[4].get("end") == 24
        assert chlog1.changes[4].get("end") == 24
        assert chlog2.changes[4].get("end") == 24

        # check if adding the changelog later works
        chlog = ChangeLog()
        doc1 = Document("Just a simple \U0001F4A9 document.")
        doc1.changelog = chlog
        annset1 = doc1.annset("")
        ann1 = annset1.add(0, 4, "Token", {"n": 1, "upper": True})
        ann2 = annset1.add(5, 6, "Token", {"n": 2, "upper": False})
        ann3 = annset1.add(7, 13, "Token", {"n": 3, "upper": False})
        ann4 = annset1.add(14, 15, "Token", {"n": 4, "upper": False, "isshit": True})
        ann5 = annset1.add(16, 24, "Token", {"n": 5})
        annset2 = doc1.annset("Set2")
        annset2.add(0, 12, "Ann1", None)
        annset1.remove(ann2.id)
        ann3b = annset1.get(ann3.id)
        ann3b.features["str"] = "simple"
        doc1.features["docfeature1"] = "value1"
        doc1.features["docfeature1"] = "value1b"
        assert len(doc1.changelog) == len(chlog1)

        # test removing all annotations
        assert len(annset1) == 4
        annset1.clear()
        assert len(annset1) == 0
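
The \U0001F4A9 character in the test text is the point of the Java-offset round trip: it is a single Python code point but two UTF-16 code units, so every offset after it differs between the two conventions. A quick illustration in plain Python, independent of gatenlp:

# one astral code point occupies two UTF-16 code units, so Java-style
# offsets past it are shifted by one relative to Python offsets
s = "a\U0001F4A9b"
print(len(s))                           # 3 code points (Python offsets)
print(len(s.encode("utf-16-le")) // 2)  # 4 code units (Java offsets)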
Example #2
def makedoc1():
    from gatenlp.document import Document

    # DOC1_TEXT is a module-level constant defined elsewhere in the test file
    doc1 = Document(DOC1_TEXT)
    doc1.features["feat1"] = "value1"
    anns = doc1.annset()
    anns.add(0, 2, "Type1", dict(a=1, b=True, c="some string"))
    doc1.annset("Set2").add(2, 8, "Type2")
    return doc1
Example #3
def makedoc1():
    import re
    from gatenlp.document import Document

    # DOC1_TEXT is a module-level constant defined elsewhere in the test file
    doc1 = Document(DOC1_TEXT)
    set1 = doc1.annset()
    whitespaces = list(re.finditer(r"[\s,.!?]+|^[\s,.!?]*|[\s,.!?]*$", DOC1_TEXT))
    nrtokens = len(whitespaces) - 1
    for k in range(nrtokens):
        fromoff = whitespaces[k].end()
        tooff = whitespaces[k + 1].start()
        set1.add(fromoff, tooff, "Token")
    return doc1
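
To see what the regex tokenization produces, here is a hypothetical run; the real DOC1_TEXT is defined elsewhere in the test module, so the value below is made up purely for illustration:

# hypothetical illustration only; the real DOC1_TEXT is defined elsewhere
DOC1_TEXT = "It is a test"
doc = makedoc1()
for ann in doc.annset():
    print(ann.start, ann.end, doc[ann])  # 0 2 It / 3 5 is / 6 7 a / 8 12 test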
Example #4
    def test_document01m01(self):
        from gatenlp.document import Document, OFFSET_TYPE_JAVA
        from gatenlp.span import Span

        doc1 = Document(
            "This is a \U0001F4A9 document.\n이것은 문서입니다 \U0001F4A9\nЭто \U0001F4A9 документ\nاین یک سند \U0001F4A9 است"
        )
        annset1 = doc1.annset("")
        ann1 = annset1.add(8, 9, "Type1", {"f1": 1, "f2": 2})
        ann1id = ann1.id
        assert len(annset1) == 1
        assert ann1.features["f1"] == 1
        ann2id = annset1.add(0, 4, "Type1", {"f1": 13, "f2": 12}).id
        inorder = list(annset1.iter())
        assert len(inorder) == 2
        assert inorder[0].features["f1"] == 13
        assert inorder[1].features["f1"] == 1
        ann3 = annset1.add(0, 22, "Type2", {"feat1": True})
        ann3id = ann3.id
        assert ann3id in annset1
        assert annset1.span == Span(0, 22)
        retset1 = annset1.within(0, 10)
        # print("\n!!!!!!!!!!!!!!!!!!!!!!!!!!DEBUG: ", retset1)
        assert retset1.isdetached()
        assert retset1.immutable
        assert retset1.size == 2
        assert len(retset1) == 2
        assert len(annset1.within(0, 10)) == 2
        assert len(annset1.within(1, 3)) == 0
        assert len(annset1.within(0, 22)) == 3
        doc1.features["docfeat1"] = 33
        assert doc1.features["docfeat1"] == 33
        # print("DOC: {}".format(doc1), file=sys.stderr)
        jsonstr = doc1.save_mem(offset_type=OFFSET_TYPE_JAVA)
        # print("JSON JAVA: {}".format(jsonstr), file=sys.stderr)
        doc2 = Document.load_mem(jsonstr)
        # print("DOC BACK: {}".format(doc2), file=sys.stderr)
        assert doc2.features["docfeat1"] == 33
        d2annset1 = doc2.annset("")
        assert len(d2annset1) == 3
        at8 = d2annset1.startingat(8)
        # print("AT8: {}".format(at8), file=sys.stderr)
        assert len(at8) == 1
Example #5
    def test_annotationset01m01(self):
        from gatenlp.document import Document

        txt = " ".join([f"word{i:02d}" for i in range(10)])
        doc = Document(txt)
        annset = doc.annset()
        # create a Token annotation for each word
        # create "At3_1" annotations for a single token whenever i is a multiple of 3
        # create "At3_2" annotations for two tokens whenever i is a multiple of 3
        for i in range(10):
            annset.add(i * 7, i * 7 + 6, "Token", features={"i": i})
            if i % 3 == 0:
                annset.add(i * 7, i * 7 + 6, "At3_1", features={"i": i})
                # cannot span two tokens at the very end
                if i < 9:
                    annset.add(i * 7, i * 7 + 6 + 7, "At3_2", features={"i": i})
        # check: get all Token annotations
        ret = annset.with_type("Token")
        assert len(ret) == 10
        # check get all At3_1 annotations
        ret = annset.with_type("At3_1")
        assert len(ret) == 4
        ret = annset.with_type("At3_2")
        assert len(ret) == 3
        ret = annset.with_type("Token", "At3_1")
        assert len(ret) == 14
        ret = annset.with_type("At3_1", "Token")
        assert len(ret) == 14
        ret = annset.with_type("Token", "At3_1", non_overlapping=True)
        # print(f"\n!!!!!!!!!!!!DEBUG: anns for Token/At3_1={ret}")
        assert len(ret) == 10
        ret = annset.with_type("Token", "At3_2", non_overlapping=True)
        # print(f"\n!!!!!!!!!!!!DEBUG: anns for Token/At3_2={ret}")
        assert len(ret) == 10
        ret = annset.with_type("At3_1", "Token", non_overlapping=True)
        # print(f"\n!!!!!!!!!!!!DEBUG: anns for At3_1/Token={ret}")
        assert len(ret) == 10
        ret = annset.with_type("At3_2", "Token", non_overlapping=True)
        # print(f"\n!!!!!!!!!!!!DEBUG: anns for At3_2/Token={ret}")
        assert len(ret) == 7
Example #6
    def append(
        self,
        source,
        fmt="gate-def",
        source_sep="\t",
        source_encoding="UTF-8",
        listfeatures=None,
        listtype=None,
    ):
        """
        This method appends more entries to gazetteer.

        Args:
            source: where to load the gazetteer from. What is actually expected here depends on the fmt
              parameter.
            fmt: defines what is expected as the format and/or content of the source parameter. One of:
               *  "gate-def" (default): source must be a string, a pathlib Path or a parsed urllib url and
                  point to a GATE-style "def" file. See https://gate.ac.uk/userguide/chap:gazetteers
               * "gazlist": a list of tuples or lists where the first element of the tuple/list
                  is a list of strings, the second element is a dictionary containing the features to assign and
                  the third element, if it exists, is the index of an element in the listfeatures array.
            listfeatures: a list of dictionaries containing the features to set for all matches witch have the
              list index set, this list gets appended to the existing listfeatures. If what gets appended specifies
              its own list features, this is ignored.
            listtype: the output annotation type to use for the list that gets appended. If what gets appended
               specifies its own list type or list types, this is ignored.
        """
        if fmt == "gazlist":
            if listfeatures is not None:
                self.listfeatures.append(listfeatures)
            else:
                self.listfeatures.append({})
            if listtype is not None:
                self.listtypes.append(listtype)
            else:
                self.listtypes.append(self.outtype)
            listidx = len(self.listfeatures) - 1
            for el in source:
                entry = el[0]
                data = el[1]
                self.add(entry, data, listidx=listidx)
        elif fmt == "gate-def":
            if listfeatures is None:
                listfeatures = {}
            if listtype is None:
                listtype = self.outtype
            with open(source, "rt", encoding=source_encoding) as infp:
                for line in infp:
                    line = line.rstrip("\n\r")
                    fields = line.split(":")
                    fields.extend(["", "", "", ""])
                    listFile = fields[0]
                    majorType = fields[1]
                    minorType = fields[2]
                    languages = fields[3]
                    anntype = fields[4]
                    this_listfeatures = listfeatures.copy()
                    this_outtype = listtype
                    if majorType:
                        this_listfeatures["majorType"] = majorType
                    if minorType:
                        this_listfeatures["minorType"] = minorType
                    if languages:
                        this_listfeatures["lang"] = languages
                    if anntype:
                        this_outtype = anntype
                    # read in the actual list
                    listfile = os.path.join(os.path.dirname(source), listFile)
                    self.logger.info(f"Reading list file {listfile}")
                    with open(listfile, "rt", encoding=source_encoding) as inlistfile:
                        self.listtypes.append(this_outtype)
                        self.listfeatures.append(this_listfeatures)
                        linenr = 0
                        for listline in inlistfile:
                            linenr += 1
                            listline = listline.rstrip("\n\r")
                            fields = listline.split(source_sep)
                            entry = fields[0]
                            if self.tokenizer:
                                tmpdoc = Document(entry)
                                self.tokenizer(tmpdoc)
                                # TODO: include and handle SpaceToken if we use the separator annotations!
                                # TODO: maybe have a different way to retrieve the token annotations based
                                # on the tokenizer????
                                tokenanns = list(tmpdoc.annset().with_type("Token"))
                                if self.getterfunc:
                                    tokenstrings = [
                                        self.getterfunc(a, doc=tmpdoc)
                                        for a in tokenanns
                                    ]
                                else:
                                    tokenstrings = [tmpdoc[a] for a in tokenanns]
                                if self.mapfunc:
                                    tokenstrings = [
                                        self.mapfunc(s) for s in tokenstrings
                                    ]
                                if self.ignorefunc:
                                    tokenstrings = [
                                        s
                                        for s in tokenstrings
                                        if not self.ignorefunc(s)
                                    ]
                            else:
                                tokenstrings = entry.split()  # just split on whitespace
                            if len(tokenstrings) == 0:
                                self.logger.warning(
                                    f"File {listfile}, skipping line {linenr}, no tokens left: {listline}"
                                )
                                continue
                            if len(fields) > 1:  # feature assignments follow the entry
                                feats = {}
                                for fspec in fields[1:]:
                                    fname, fval = fspec.split("=")
                                    feats[fname] = fval
                            else:
                                feats = None
                            listidx = len(self.listfeatures) - 1
                            self.add(tokenstrings, feats, listidx=listidx)
        else:
            raise Exception(f"TokenGazetteer format {fmt} not known")
Example #7
    def load(
        clazz,
        from_ext=None,
        from_mem=None,
        parser=None,
        markup_set_name="Original markups",
        process_soup=None,
        offset_mapper=None,
        **kwargs,
    ):
        """Load a HTML file.

        Args:
          clazz: param from_ext:
          from_mem: param parser: one of "html.parser", "lxml", "lxml-xml", "html5lib" (default is "lxml")
          markup_set_name: the annotation set name for the set to contain the HTML annotations (Default value = "Original markups")
          process_soup: a function to run on the parsed HTML soup before converting (Default value = None)
          offset_mapper: param kwargs: (Default value = None)
          from_ext: (Default value = None)
          parser: (Default value = None)
          **kwargs:

        Returns:

        """
        # NOTE: for now we have a simple heuristic for adding newlines to the text:
        # before and after a block element, a newline is added unless there is already one
        # NOTE: for now we use  multi_valued_attributes=None which prevents attributes of the
        # form "class='val1 val2'" to get converted into features with a list of values.
        isurl, extstr = is_url(from_ext)
        if from_ext is not None:
            if isurl:
                from_mem = get_str_from_url(extstr)
        if from_mem:
            bs = BeautifulSoup(from_mem, parser, multi_valued_attributes=None)
        else:
            bs = BeautifulSoup(extstr, parser, multi_valued_attributes=None)
        # we recursively iterate the tree depth first, going through the children
        # and adding to a list that either contains the text or a dict with the information
        # about annotations we want to add
        nlels = {
            "pre", "br", "p", "div", "tr", "h1", "h2", "h3", "h4", "h5", "h6",
            "li", "address", "article", "aside", "blockquote", "del", "figure",
            "figcaption", "footer", "header", "hr", "ins", "main", "nav",
            "section", "summary", "input", "legend", "option", "textarea",
            "bdi", "bdo", "center", "code", "dfn", "menu", "dir", "caption",
        }
        ignoreels = {"script", "style"}
        docinfo = {"anninfos": [], "curoffset": 0, "curid": 0, "text": ""}

        def walktree(el):
            """Recursively walk the parse tree, accumulating text and annotation info in docinfo."""
            # print("DEBUG: type=", type(el))
            if isinstance(el, bs4.element.Doctype):
                # print("DEBUG: got doctype", type(el))
                pass
            elif isinstance(el, bs4.element.Comment):
                # print("DEBUG: got Comment", type(el))
                pass
            elif isinstance(el, bs4.element.Script):
                # print("DEBUG: got Script", type(el))
                pass
            elif isinstance(el, bs4.element.Tag):
                # print("DEBUG: got tag: ", type(el), " name=",el.name)
                # some tags we ignore completely:
                if el.name in ignoreels:
                    return
                # for some tags we insert a new line before, but only if we do not already have one
                if not docinfo["text"].endswith("\n") and el.name in nlels:
                    docinfo["text"] += "\n"
                    # print("DEBUG: adding newline before at ", docinfo["curoffset"])
                    docinfo["curoffset"] += 1
                ann = {
                    "type": el.name,
                    "features": el.attrs,
                    "id": docinfo["curid"],
                    "event": "start",
                    "start": docinfo["curoffset"],
                }
                thisid = docinfo["curid"]
                docinfo["anninfos"].append(ann)
                docinfo["curid"] += 1
                for child in el.children:
                    walktree(child)
                # for some tags we insert a new line after
                if not docinfo["text"].endswith("\n") and el.name in nlels:
                    docinfo["text"] += "\n"
                    # print("DEBUG: adding newline after at ", docinfo["curoffset"])
                    docinfo["curoffset"] += 1
                docinfo["anninfos"].append({
                    "event": "end",
                    "id": thisid,
                    "end": docinfo["curoffset"]
                })
            elif isinstance(el, bs4.element.NavigableString):
                # print("DEBUG: got text: ", el)
                text = str(el)
                if text == "\n" and docinfo["text"].endswith("\n"):
                    return
                docinfo["text"] += text
                docinfo["curoffset"] += len(el)
            else:
                print("WARNING: odd element type", type(el))

        walktree(bs)
        # need to add the end corresponding to bs
        # print("DEBUG: got docinfo:\n",docinfo)
        id2anninfo = {}  # from id to anninfo
        nstart = 0
        for anninfo in docinfo["anninfos"]:
            if anninfo["event"] == "start":
                nstart += 1
                id2anninfo[anninfo["id"]] = anninfo
        nend = 0
        for anninfo in docinfo["anninfos"]:
            if anninfo["event"] == "end":
                nend += 1
                end = anninfo["end"]
                annid = anninfo["id"]
                anninfo = id2anninfo[annid]
                anninfo["end"] = end
        # print("DEBUG: got nstart/nend", nstart, nend)
        assert nstart == nend
        # print("DEBUG: got id2anninfo:\n", id2anninfo)
        doc = Document(docinfo["text"])
        annset = doc.annset(markup_set_name)
        for i in range(nstart):
            anninfo = id2anninfo[i]
            annset.add(
                anninfo["start"],
                anninfo["end"],
                anntype=anninfo["type"],
                features=anninfo["features"],
            )
        return doc
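
A sketch of how this loader might be invoked; the enclosing class is not part of this excerpt, so HtmlLoader below is a placeholder name for it:

# hypothetical usage; HtmlLoader stands in for the enclosing class
html = "<html><body><h1>Title</h1><p>Some <b>bold</b> text.</p></body></html>"
doc = HtmlLoader.load(from_mem=html, parser="html.parser")
print(doc.text)                        # text with newlines around block elements
print(doc.annset("Original markups"))  # html, body, h1, p, b ... annotations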
Example #8
    def load(clazz,
             from_ext=None,
             from_mem=None,
             include_fields=None,
             include_entities=True,
             include_quote=False,
             outsetname="Original markups",
             tweet_ann="Tweet"):
        """
        Load a tweet from Twitter JSON format.

        IMPORTANT: this is still very experimental, will change in the future!

        Args:
            clazz: internal use
            from_ext: the file/url to load from
            from_mem: string to load from
            include_fields: a list of fields to include where nested field names are dot-separated, e.g.
               "user.location". All these fields are included using the nested field name in either the
               features of the tweet annotation with the Type specified, or the features of the document
               if `tweet_ann` is None.
            include_entities: create annotations for the tweet entities in the set with outsetname
            include_quote: if True, add the text of the quoted tweet after a newline and treat it as a
               separate tweet just like the original tweet.
            outsetname: the annotation set where to put entity annotations and the tweet annotation(s)
            tweet_ann: the annotation type to use to span the tweet and contain all the features.

        Returns:
            document representing the tweet
        """
        if from_ext is not None:
            isurl, extstr = is_url(from_ext)
            if isurl:
                jsonstr = get_str_from_url(extstr, encoding="utf-8")
                tweet = json.loads(jsonstr)
            else:
                with open(extstr, "rt", encoding="utf-8") as infp:
                    tweet = json.load(infp)
        elif from_mem is not None:
            tweet = json.loads(from_mem)
        else:
            raise Exception("Cannot load from None")
        if tweet is None:
            raise Exception("Could not decode Tweet JSON")
        if tweet.get("truncated"):
            text = get_nested(tweet, "extended_tweet.full_text")
        else:
            text = get_nested(tweet, "text")
        if text is None:
            raise Exception("No text field found")
        quoted_status = None
        if include_quote:
            quoted_status = tweet.get("quoted_status")
            if quoted_status is not None:
                qtext = quoted_status.get("text", "")
                text += "\n" + qtext
        doc = Document(text)
        anns = doc.annset(outsetname)
        if tweet_ann:
            ann = anns.add(0, len(text), tweet_ann)
            features = ann.features
        else:
            features = doc.features
        if include_fields is None:
            include_fields = TWITTER_DEFAULT_INCLUDE_FIELDS
        for field in include_fields:
            if field.startswith("$"):
                if field == "$is_retweet_status":
                    rs = get_nested(tweet, "retweeted_status", silent=True)
                    if rs is not None:
                        features[field] = True
                continue
            val = get_nested(tweet, field, silent=True)
            if val is not None:
                features[field] = val
        if include_entities:
            if tweet.get("truncated"):
                entities = get_nested(tweet,
                                      "extended_tweet.entities",
                                      default={})
            else:
                entities = get_nested(tweet, "entities", default={})
            # entities is only defined when include_entities is True, so the
            # annotation loop must stay inside this block
            for etype, elist in entities.items():
                for ent in elist:
                    start, end = ent["indices"]
                    anns.add(start, end, etype)
        # TODO: if we have a quoted_status, add features and entities from there:
        # Essentially the same processing as for the original tweet, but at document offset
        # len(tweet)+1 (2?)
        return doc
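
A sketch of invoking this loader on a stored tweet; TweetLoader is a placeholder for the enclosing class, which is not shown in this excerpt:

# hypothetical usage; TweetLoader stands in for the enclosing class
doc = TweetLoader.load(from_ext="tweet.json", include_entities=True)
anns = doc.annset("Original markups")
print(anns.with_type("Tweet"))     # the tweet annotation carrying the selected fields
print(anns.with_type("hashtags"))  # entity annotations are typed by the entity key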
Example #9
from gatenlp.document import Document
from gatenlp.docformats import simplejson
from gatenlp.offsetmapper import OFFSET_TYPE_JAVA

doc1 = Document("Just a simple \U0001F4A9 document.")
annset1 = doc1.annset("")
ann1 = annset1.add(0, 4, "Token", {"n": 1, "upper": True})
ann2 = annset1.add(5, 6, "Token", {"n": 2, "upper": False})
ann3 = annset1.add(7, 13, "Token", {"n": 3, "upper": False})
ann4 = annset1.add(14, 15, "Token", {"n": 4, "upper": False, "isshit": True})
ann5 = annset1.add(16, 24, "Token", {"n": 5})
annset2 = doc1.annset("Set2")
annset2.add(0, 12, "Ann1", None)
annset1.remove(ann2.id)
# note: set_feature and the docformats.simplejson module belong to an older
# gatenlp API; newer releases use dict-style assignment on .features instead
annset1.get(ann3.id).set_feature("str", "simple")
doc1.set_feature("docfeature1", "value1")
doc1.set_feature("docfeature1", "value1b")
simplejson.dump_file(doc1, "doc1.bdocjson")
simplejson.dump_file(doc1, "doc1.bdocjson.gz")