Example 1
    def test_interaction01_01(self):
        @GateNlpPr
        def do_it(doc: Document, **kwargs):
            set1 = doc.annset("Set1")
            set1.add(2, 3, "test1", {"f1": "value1"})
            # return nothing

        doc1 = Document("Just a simple document")
        doc1.changelog = ChangeLog()
        mypr = gatenlp.gate_python_plugin_pr
        mypr.start({"k1": "v1"})  # set the script parms
        mypr.execute(doc1)
        assert doc1._annotation_sets is not None
        assert len(doc1._annotation_sets) == 1
        assert "Set1" in doc1._annotation_sets
        myset = doc1.annset("Set1")
        assert len(myset) == 1
        myanns = myset.start_ge(0)
        assert len(myanns) == 1
        myann = next(iter(myanns))
        assert myann is not None
        assert myann.start == 2
        assert myann.end == 3
        assert myann.type == "test1"
        # assert myann.id == 1
        assert "f1" in myann.features
        assert myann.features["f1"] == "value1"
        mychlog = doc1.changelog
        assert mychlog is not None
        assert len(mychlog) == 1
        mypr.finish()
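
This test exercises the full lifecycle of a decorated processing resource: @GateNlpPr wraps do_it and publishes the wrapper as gatenlp.gate_python_plugin_pr, which is then driven through start(), execute() and finish(). A minimal standalone sketch of the same pattern, assuming GateNlpPr is importable from the gatenlp package as in this test module:

import gatenlp
from gatenlp import Document, GateNlpPr  # assumption: GateNlpPr is exported by gatenlp

@GateNlpPr
def annotate(doc: Document, **kwargs):
    # add a single annotation, mirroring do_it() above
    doc.annset("Set1").add(2, 3, "test1", {"f1": "value1"})

pr = gatenlp.gate_python_plugin_pr  # module-level handle set by the decorator
pr.start({"k1": "v1"})              # script parameters, as in the test
pr.execute(Document("Just a simple document"))
pr.finish()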
Example 2
    def test05(self):
        # Rules and Pampac

        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann1")  # 0
        doc.annset().add(2, 4, "Ann2")  # 1
        doc.annset().add(3, 5, "Ann2")  # 2
        doc.annset().add(4, 5, "Ann2")  # 3
        doc.annset().add(8, 10, "Ann2")  # 4
        annset = doc.annset()
        orig_len = len(annset)
        annlist = list(doc.annset())

        # first make sure the pattern works as we want
        ctx = Context(doc=doc, anns=annlist)
        pat1 = AnnAt("Ann2", name="a1") >> AnnAt("Ann2", name="a2")
        loc = ctx.inc_location(Location(0, 0), by_offset=1)
        ret = pat1.parse(location=loc, context=ctx)

        def r1_action(succ, context=None, **kwargs):
            span = succ[0].span
            ann = succ.context.outset.add(span.start, span.end, "NEW")
            return ann

        r1 = Rule(AnnAt("Ann2") >> AnnAt("Ann2"), r1_action)
        # pass skip/select as constructor arguments; assigning set_skip/set_select
        # attributes after construction has no effect
        pampac = Pampac(r1, skip="longest", select="first")
        outset = doc.annset()
        ret = pampac.run(doc, annlist, outset=outset, debug=True)
        assert len(ret) == 1
        assert len(ret[0]) == 2
        idx, retlist = ret[0]
        assert idx == 1
        assert len(retlist) == 1
        a = retlist[0]
        assert isinstance(a, Annotation)
        assert a.start == 2
        assert a.end == 5
        assert a.type == "NEW"
        assert len(outset) == orig_len + 1
Example 3
    def test01(self):
        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann")
        doc.annset().add(0, 1, "Ann")
        doc.annset().add(1, 2, "Ann")
        doc.annset().add(1, 2, "Token")
        doc.annset().add(2, 3, "Ann")
        annlist = list(doc.annset())

        ctx = Context(doc, annlist)
        parser = Ann(name="a1")
        ret = parser.parse(Location(), ctx)
        assert isinstance(ret, Success)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 2
        assert loc.ann_location == 1
        assert len(ret[0].data) == 1

        # do this with the match method
        ret = parser(doc, annlist)
        assert isinstance(ret, Success)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 2
        assert loc.ann_location == 1
        assert len(ret[0].data) == 1

        # With useoffset=False the parser does NOT first advance the annotation index to an
        # annotation whose start offset is at least the current text offset (2). So it matches
        # the annotation at index 1, which ends at offset 1, BEFORE the current text offset.
        assert loc == Location(2, 1)
        ret = Ann(name="tmp1", useoffset=False).parse(loc, ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc == Location(1, 2)
        assert len(ret[0].data) == 1

        # by default we do advance, so we match the last annotation and end up at text
        # position 3 looking for annotation index 5
        loc = Location(2, 1)
        ret = Ann(name="tmp1", useoffset=True).parse(loc, ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc == Location(3, 5)
        assert len(ret[0].data) == 1

        # Try to fail
        parser = Ann("Token")
        ret = parser(doc, annlist)
        assert isinstance(ret, Failure)

        # Same without a name: should generate the same locations, but no data
        parser = Ann()
        ret = parser.parse(Location(), ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 2
        assert loc.ann_location == 1
        assert len(ret[0].data) == 0

        ret = Ann().parse(loc, ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 3
        assert loc.ann_location == 5
        assert len(ret[0].data) == 0

        parser = AnnAt(name="a2")
        ret = parser.parse(Location(), ctx)
        assert len(ret) == 1
        assert len(ret[0].data) == 1

        parser = AnnAt(matchtype="all", name="a3")
        ret = parser.parse(Location(), ctx)
        assert len(ret) == 2
        assert len(ret[0].data) == 1
        assert len(ret[1].data) == 1

        # Try Rule
        parser = Ann(name="a1")
        tmp = dict(i=0)

        def rhs1(succ, **kwargs):
            tmp["i"] = 1

        rule = Call(parser, rhs1)
        ret = rule.parse(Location(), ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 2
        assert loc.ann_location == 1
        assert len(ret[0].data) == 1
        assert tmp["i"] == 1

        # use the call method instead
        def rhs2(succ, **kwargs):
            tmp["i"] = 2

        parser = Ann(name="a1").call(rhs2)
        ret = parser.parse(Location(), ctx)
        print(ret)
        assert tmp["i"] == 2

        parser = Find(AnnAt(type="Token", name="at"), by_anns=False)
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Find(AnnAt(type="Token", name="at"), by_anns=True)
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Find(Text("document", name="t1"), by_anns=False)
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Seq(Ann("Ann", name="a1"),
                     Ann("Ann", name="a2"),
                     matchtype="longest")
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = N(AnnAt("Ann", name="a1"), 1, 5, matchtype="first")
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Or(Ann("X", name="x1"), Ann("Ann", name="a1"))
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Ann("X", name="x1") | Ann("Y", name="y1") | Ann("Ann",
                                                                 name="a1")
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Ann("Ann", name="a1") >> Ann("Ann", name="a2")
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Ann("Ann", name="a1") * 2
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Ann("Ann", name="a1") * (1, 3)
        ret = parser.parse(Location(), ctx)
        print(ret)
Example 4
    def test04(self):
        # Test multiple result matches with N, with and without the until clause

        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann")  # 0
        doc.annset().add(0, 2, "Ann")  # 1
        doc.annset().add(0, 2, "Token")  # 2
        doc.annset().add(2, 4, "Ann")  # 3
        doc.annset().add(2, 4, "Ann")  # 4
        doc.annset().add(4, 6, "Ann")  # 5
        doc.annset().add(4, 6, "Ann")  # 6
        doc.annset().add(4, 6, "Person")  # 7
        doc.annset().add(6, 8, "Ann")  # 8
        doc.annset().add(6, 8, "Ann")  # 9
        doc.annset().add(8, 10, "XXXX")  # 10
        annlist = list(doc.annset())

        # multiple Anns, single result from N: first
        # This should find 0,3,5
        ret = N(
            AnnAt("Ann", name="a1", matchtype="all"),
            min=2,
            max=3,
            select="all",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 3
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5

        # multiple Anns, all results from N
        # should return 0,1
        ret = N(
            AnnAt("Ann", name="a1", matchtype="all"),
            min=1,
            max=1,
            select="all",
            matchtype="all",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 2
        assert len(ret[0].data) == 1
        assert len(ret[1].data) == 1
        assert ret[0].data[0]["ann"].id == 0
        assert ret[1].data[0]["ann"].id == 1

        # multiple Anns, all results from N
        ret = N(
            AnnAt("Ann", name="a1", matchtype="all"),
            min=1,
            max=2,
            select="all",
            matchtype="all",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 4
        assert len(ret[0].data) == 2
        assert len(ret[1].data) == 2
        assert len(ret[2].data) == 2
        assert len(ret[3].data) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[1].data[0]["ann"].id == 0
        assert ret[1].data[1]["ann"].id == 4
        assert ret[2].data[0]["ann"].id == 1
        assert ret[2].data[1]["ann"].id == 3
        assert ret[3].data[0]["ann"].id == 1
        assert ret[3].data[1]["ann"].id == 4

        # multiple Anns, all results from N
        # just three for the first ann: 0,1,2
        ret = N(
            AnnAt(name="a1", matchtype="all"),
            min=1,
            max=1,
            select="all",
            matchtype="all",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 3
        assert len(ret[0].data) == 1
        assert len(ret[1].data) == 1
        assert len(ret[2].data) == 1
        assert ret[0].data[0]["ann"].id == 0
        assert ret[1].data[0]["ann"].id == 1
        assert ret[2].data[0]["ann"].id == 2

        # This should just find the Token as the first and only match!
        ret = N(AnnAt("Ann", name="a1", matchtype="all"),
                until=AnnAt("Token", name="t", matchtype="first"),
                min=0,
                max=3,
                select="all",
                matchtype="all").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 1
        assert ret[0].data[0]["ann"].id == 2

        # This should terminate with Person and find all paths that can lead up to Person:
        # 0,3 0,4 1,3 1,4
        ret = N(AnnAt("Ann", name="a1", matchtype="all"),
                until=AnnAt("Person", name="t", matchtype="first"),
                min=1,
                max=3,
                select="all",
                matchtype="all").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 4
        assert len(ret[0].data) == 3
        assert len(ret[1].data) == 3
        assert len(ret[2].data) == 3
        assert len(ret[3].data) == 3
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 7
        assert ret[1].data[0]["ann"].id == 0
        assert ret[1].data[1]["ann"].id == 4
        assert ret[1].data[2]["ann"].id == 7
        assert ret[2].data[0]["ann"].id == 1
        assert ret[2].data[1]["ann"].id == 3
        assert ret[2].data[2]["ann"].id == 7
        assert ret[3].data[0]["ann"].id == 1
        assert ret[3].data[1]["ann"].id == 4
        assert ret[3].data[2]["ann"].id == 7
Example 5
    def test03(self):
        # Test single result matches with N, with and without the until clause

        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann")  # 0
        doc.annset().add(0, 2, "Ann")  # 1
        doc.annset().add(0, 2, "Token")  # 2
        doc.annset().add(2, 4, "Ann")  # 3
        doc.annset().add(2, 4, "Ann")  # 4
        doc.annset().add(4, 6, "Ann")  # 5
        doc.annset().add(4, 6, "Ann")  # 6
        doc.annset().add(4, 6, "Person")  # 7
        doc.annset().add(6, 8, "Ann")  # 8
        doc.annset().add(6, 8, "Ann")  # 9
        doc.annset().add(8, 10, "XXXX")  # 10
        annlist = list(doc.annset())

        # single Ann, single result from N
        # this should return annotation ids 0, 3, 5
        ret = N(
            AnnAt("Ann", name="a1", matchtype="first"),
            min=2,
            max=3,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 3
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5

        # Same as before, but with a name, so we get one additional data entry for the
        # whole sequence, carrying its span
        ret = N(AnnAt("Ann", name="a1", matchtype="first"),
                min=2,
                max=3,
                select="first",
                matchtype="first",
                name="n1").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 4
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5
        assert ret[0].data[3]["span"] == Span(0, 6)

        # single Ann, single result from N
        # this should return annotation ids 0, 3, 5, 8
        ret = N(
            AnnAt("Ann", name="a1", matchtype="first"),
            min=2,
            max=99,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 4
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5
        assert ret[0].data[3]["ann"].id == 8

        # single Ann, single result from N, with early stopping at Person
        # this should return annotation ids 0, 3, 7
        ret = N(
            AnnAt("Ann", name="a1", matchtype="first"),
            until=AnnAt("Person", name="p"),
            min=2,
            max=99,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 3
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 7

        # Try a match with min=0 and max=99 that does not succeed
        # single Ann, single result from N
        # this should return an empty list for data
        ret = N(
            AnnAt("NotThere", name="a1", matchtype="first"),
            min=0,
            max=99,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 0

        # Now a match with min=0 and max=99 that does succeed:
        # single Ann, single result from N
        # this should return the same four annotations as the min=2 case above
        ret = N(
            AnnAt("Ann", name="a1", matchtype="first"),
            min=0,
            max=99,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 4
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5
        assert ret[0].data[3]["ann"].id == 8
Example 6
    def test02(self):
        # Test multiple result matches

        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann")  # 0
        doc.annset().add(0, 2, "Ann")  # 1
        doc.annset().add(0, 2, "Token")  # 2
        doc.annset().add(2, 4, "Ann")  # 3
        doc.annset().add(2, 4, "Ann")  # 4
        annlist = list(doc.annset())

        # match all annotations at the document start
        ret = AnnAt(matchtype="all").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 3

        # match sequence Token/Ann, take first at each point
        # this should match annotation ids 2 and 3
        ret = Seq(AnnAt("Token", name="1"),
                  AnnAt("Ann", name="2")).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 2
        assert ret[0].data[0]["ann"].id == 2
        assert ret[0].data[1]["ann"].id == 3

        # match sequence Ann/Ann, take first at each point
        ret = Seq(AnnAt("Ann", name="1"), AnnAt("Ann",
                                                name="2")).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3

        # match sequence Ann/Ann, take first at each point, but set useoffset=False so we
        # do not skip to the end offset of the previous match before matching the next one.
        # In that case the next ann we match is the second one at offset 0
        ret = Seq(AnnAt("Ann", name="1"),
                  AnnAt("Ann", name="2", useoffset=False)).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 1

        # Make sure we get the correct set of annotations at position 0 and 2
        ret = AnnAt("Ann", name="a", matchtype="all").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[1].data[0]["ann"].id == 1
        # ret.pprint()
        ret = AnnAt("Ann", name="a",
                    matchtype="all").match(doc,
                                           annlist,
                                           location=Location(2, 2))
        assert ret.issuccess()
        assert len(ret) == 2
        assert ret[0].data[0]["ann"].id == 3
        assert ret[1].data[0]["ann"].id == 4
        # ret.pprint()

        # Match sequence of two anns in order, take all results
        ret = Seq(
            AnnAt("Ann", name="1", matchtype="all"),
            AnnAt("Ann", name="2", matchtype="all"),
            select="all",
            matchtype="all",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 4
        assert len(ret[0].data) == 2
        assert len(ret[1].data) == 2
        assert len(ret[2].data) == 2
        assert len(ret[3].data) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[1].data[0]["ann"].id == 0
        assert ret[1].data[1]["ann"].id == 4
        assert ret[2].data[0]["ann"].id == 1
        assert ret[2].data[1]["ann"].id == 3
        assert ret[3].data[0]["ann"].id == 1
        assert ret[3].data[1]["ann"].id == 4
Example 7
def stanfordnlp2gatenlp(
    stanfordnlpdoc,
    gatenlpdoc=None,
    setname="",
    word_type="Word",
    sentence_type="Sentence",
):
    """Convert a StanfordNLP document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the StanfordNLP document to it. In this case the
    original gatenlpdoc is used and gets modified.

    Args:
      stanfordnlpdoc: a StanfordNLP document
      gatenlpdoc: if None, a new gatenlp document is created, otherwise this
         document is added to. (Default value = None)
      setname: the annotation set name to which the annotations get added, empty string
         for the default annotation set.
      word_type: the annotation type to use for word annotations (Default value = "Word")
      sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")

    Returns:
      the new or modified gatenlp document

    """
    if gatenlpdoc is None:
        retdoc = Document(stanfordnlpdoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    # StanfordNLP processes text in sentence chunks, so we do everything per sentence.
    # NOTE: the StanfordNLP elements do not contain any text offsets, so we have to match and
    # find them ourselves. For this we keep an index of the first character in the text that
    # has not been matched yet.
    notmatchedidx = 0
    for sent in stanfordnlpdoc.sentences:
        # a sentence is a list of tokens and a list of words; some tokens consist of several words.
        # dependency parses are over words, so we create annotations per word and set
        # the features on those word annotations.
        offsetinfos = utils.match_substrings(
            stanfordnlpdoc.text[notmatchedidx:],
            sent.words,
            getstr=lambda x: x.text)
        idx2annid = {}
        for oinfo in offsetinfos:
            word = oinfo[2]
            fm = {
                "string": word.text,
                "lemma": word.lemma,
                "upos": word.upos,
                "xpos": word.xpos,
                "dependency_relation": word.dependency_relation,
                "governor": int(word.governor),
            }
            for feat in word.feats.split("|"):
                if feat and feat != "_":
                    k, v = feat.split("=")
                    # TODO: maybe try to detect and convert bool/int values
                    fm["feat_" + k] = v
            snlp_idx = int(word.index)
            annid = annset.add(oinfo[0] + notmatchedidx,
                               oinfo[1] + notmatchedidx, word_type, fm).id
            idx2annid[snlp_idx] = annid
        # create a sentence annotation from beginning of first word to end of last
        sentid = annset.add(
            offsetinfos[0][0] + notmatchedidx,
            offsetinfos[-1][1] + notmatchedidx,
            sentence_type,
        ).id
        # now replace the governor index with the corresponding annid, the governor index is
        # mapped to the sentence annotation
        idx2annid[0] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            gov = ann.features.get("governor")
            if gov is not None:
                ann.features["governor"] = idx2annid[gov]
        notmatchedidx = offsetinfos[-1][1] + notmatchedidx + 1
    return retdoc
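
A hypothetical end-to-end sketch of this converter (the pipeline call follows the standard stanfordnlp API; it assumes the stanfordnlp package is installed and a default English model has been downloaded):

import stanfordnlp  # assumption: package installed, English model downloaded

nlp = stanfordnlp.Pipeline()  # tokenize, pos, lemma, depparse by default
snlpdoc = nlp("Barack Obama was born in Hawaii.")
gdoc = stanfordnlp2gatenlp(snlpdoc)  # new gatenlp Document with Word/Sentence annotations
for ann in gdoc.annset():
    print(ann.start, ann.end, ann.type)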
Example 8
def do_it(doc: Document, **kwargs):
    set1 = doc.annset("Set1")
    set1.add(2, 3, "test1", {"f1": "value1"})
Example 9
def stanza2gatenlp(
    stanzadoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    sentence_type="Sentence",
    add_entities=True,
    ent_prefix=None,
):
    """
    Convert a Stanford Stanza document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the Stanford Stanza document to it. In this case the
    original gatenlpdoc is used and gets modified.

    Args:
      stanzadoc: a Stanford Stanza document
      gatenlpdoc: if None, a new gatenlp document is created otherwise this
         document is added to. (Default value = None)
      setname: the annotation set name to which the annotations get added, empty string
         for the default annotation set.
      token_type: the annotation type to use for tokens, if needed (Default value = "Token")
      sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
      add_entities: if True, add any entities as well (Default value = True)
      ent_prefix: if None, use the original entity type as annotation type, otherwise add the given string
    to the annotation type as a prefix. (Default value = None)

    Returns:
      the new or modified gatenlp document

    """
    if gatenlpdoc is None:
        retdoc = Document(stanzadoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    # Stanza processes text in sentence chunks, so we do everything per sentence
    notmatchedidx = 0
    for sent in stanzadoc.sentences:
        # go through the tokens: in Stanza, each token is a list of dicts; normally there is
        # one dict which also has the offset information in "misc", but for multiword tokens
        # there is one "header" dict for the range of words (with the offset info and NER
        # label) followed by one additional element per word with all the rest.
        # For our purposes we create a list of dicts: for normal tokens we just copy the
        # element, for multiword tokens we create entries with fake offsets and all the features.
        newtokens = []
        for t in sent.tokens:
            t = t.to_dict()
            if len(t) == 1:
                newtokens.append(tok2tok(t[0]))
            else:
                tokinfo = tok2tok(t[0])
                words = t[1:]
                fm = tokinfo.get("fm")
                ner = fm.get("ner")
                text = fm.get("text")
                start = tokinfo["start"]
                end = tokinfo["end"]
                for i, w in enumerate(words):
                    tok = tok2tok(w)
                    tok["fm"]["ner"] = ner
                    tok["fm"]["token_text"] = text
                    os = min(start + i, end - 1)
                    tok["start"] = os
                    if i == len(words) - 1:
                        tok["end"] = end
                    else:
                        tok["end"] = os + 1
                    newtokens.append(tok)
        # print(f"\n!!!!!!DEBUG: newtokens={newtokens}")
        # now go through the new token list and create annotations
        idx2annid = {}  # map stanza word id to annotation id
        starts = []
        ends = []
        for t in newtokens:
            start = t["start"]
            end = t["end"]
            stanzaid = t["id"]
            starts.append(start)
            ends.append(end)
            annid = annset.add(start, end, token_type, features=t["fm"]).id
            idx2annid[str(stanzaid)] = annid
        # print(f"\n!!!!!!DEBUG: idx2annid={idx2annid}")
        # create a sentence annotation from beginning of first word to end of last
        sentid = annset.add(starts[0], ends[-1], sentence_type).id
        # now replace the head index with the corresponding annid, the head index "0" is
        # mapped to the sentence annotation
        idx2annid["0"] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            hd = ann.features.get("head")
            if hd is not None:
                hd = str(hd)
                head_id = idx2annid.get(hd)
                if head_id is None:
                    # use retdoc here: gatenlpdoc may be None
                    logger.error(
                        f"Could not find head id: {hd} for {ann} in document {retdoc.name}"
                    )
                else:
                    ann.features["head"] = head_id

    # add the entities
    if add_entities:
        for e in stanzadoc.entities:
            if ent_prefix:
                anntype = ent_prefix + e.type
            else:
                anntype = e.type
            annset.add(e.start_char, e.end_char, anntype)
    return retdoc
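
A hypothetical usage sketch for the Stanza converter (assumes the stanza package is installed and an English model has been fetched with stanza.download("en")):

import stanza  # assumption: package installed, model fetched via stanza.download("en")

nlp = stanza.Pipeline("en")
sdoc = nlp("Barack Obama was born in Hawaii.")
gdoc = stanza2gatenlp(sdoc, add_entities=True)
print({a.type for a in gdoc.annset()})  # e.g. Token, Sentence and the entity types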
Example 10
def spacy2gatenlp(
    spacydoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    spacetoken_type="SpaceToken",
    sentence_type="Sentence",
    nounchunk_type="NounChunk",
    add_tokens=True,
    # add_spacetokens=True, # not sure how to do this yet
    add_ents=True,
    add_sents=True,
    add_nounchunks=True,
    add_dep=True,
    ent_prefix=None,
):
    """Convert a spacy document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the spacy document to it. In this case the
    original gatenlpdoc is used and gets modified.

    Args:
      spacydoc: a spacy document
      gatenlpdoc: if None, a new gatenlp document is created, otherwise this
         document is added to. (Default value = None)
      setname: the annotation set name to which the annotations get added, empty string
         for the default annotation set.
      token_type: the annotation type to use for tokens (Default value = "Token")
      spacetoken_type: the annotation type to use for space tokens (Default value = "SpaceToken")
      sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
      nounchunk_type: the annotation type to use for noun chunk annotations (Default value = "NounChunk")
      add_tokens: should annotations for tokens get added? If not, dependency parser
         info cannot be added either. (Default value = True)
      add_ents: should annotations for entities get added (Default value = True)
      add_sents: should sentence annotations get added (Default value = True)
      add_nounchunks: should noun chunk annotations get added (Default value = True)
      add_dep: should dependency parser information get added (Default value = True)
      ent_prefix: if None, use the original entity label as the annotation type, otherwise
         prefix the label with the given string. (Default value = None)

    Returns:
      the new or modified gatenlp document

    """
    if gatenlpdoc is None:
        retdoc = Document(spacydoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    for tok in spacydoc:
        from_off = tok.idx
        to_off = tok.idx + len(tok)
        is_space = tok.is_space
        fm = {
            "_i": tok.i,
            "is_alpha": tok.is_alpha,
            "is_bracket": tok.is_bracket,
            "is_currency": tok.is_currency,
            "is_digit": tok.is_digit,
            "is_left_punct": tok.is_left_punct,
            "is_lower": tok.is_lower,
            "is_oov": tok.is_oov,
            "is_punct": tok.is_punct,
            "is_quote": tok.is_quote,
            "is_right_punct": tok.is_right_punct,
            "is_sent_start": tok.is_sent_start,
            "is_space": tok.is_space,
            "is_stop": tok.is_stop,
            "is_title": tok.is_title,
            "is_upper": tok.is_upper,
            "lang": tok.lang_,
            "lemma": tok.lemma_,
            "like_email": tok.like_email,
            "like_num": tok.like_num,
            "like_url": tok.like_url,
            "orth": tok.orth,
            "pos": tok.pos_,
            "prefix": tok.prefix_,
            "prob": tok.prob,
            "rank": tok.rank,
            "sentiment": tok.sentiment,
            "tag": tok.tag_,
            "shape": tok.shape_,
            "suffix": tok.suffix_,
        }
        if spacydoc.is_nered and add_ents:
            fm["ent_type"] = tok.ent_type_
        if spacydoc.is_parsed and add_dep:
            fm["dep"] = tok.dep_
        if tok.is_space:
            anntype = spacetoken_type
        else:
            anntype = token_type
        annid = annset.add(from_off, to_off, anntype, fm).id
        toki2annid[tok.i] = annid
        # print("Added annotation with id: {} for token {}".format(annid, tok.i))
        ws = tok.whitespace_
        if len(ws) > 0:
            annset.add(to_off, to_off + len(ws), spacetoken_type, {"is_space": True})
    # if we have a dependency parse, now also add the parse edges
    if spacydoc.is_parsed and add_tokens and add_dep:
        for tok in spacydoc:
            ann = annset.get(toki2annid[tok.i])
            ann.features["head"] = toki2annid[tok.head.i]
            ann.features["left_edge"] = toki2annid[tok.left_edge.i]
            ann.features["right_edge"] = toki2annid[tok.right_edge.i]
    if spacydoc.ents and add_ents:
        for ent in spacydoc.ents:
            if ent_prefix:
                entname = ent_prefix + ent.label_
            else:
                entname = ent.label_
            annset.add(ent.start_char, ent.end_char, entname, {"lemma": ent.lemma_})
    if spacydoc.sents and add_sents:
        for sent in spacydoc.sents:
            annset.add(sent.start_char, sent.end_char, sentence_type, {})
    if spacydoc.noun_chunks and add_nounchunks:
        for chunk in spacydoc.noun_chunks:
            annset.add(chunk.start_char, chunk.end_char, nounchunk_type, {})
    return retdoc
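
A hypothetical usage sketch for the spaCy converter (assumes spaCy and the en_core_web_sm model are installed):

import spacy  # assumption: spaCy installed with en_core_web_sm available

nlp = spacy.load("en_core_web_sm")
sdoc = nlp("Barack Obama was born in Hawaii.")
gdoc = spacy2gatenlp(sdoc, setname="spacy")
print({a.type for a in gdoc.annset("spacy")})  # Token, Sentence, NounChunk, entity types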