def test_interaction01_01(self):
    """End-to-end check of the gate plugin PR life cycle (start/execute/finish).

    Registers a ``do_it`` callback via the ``@GateNlpPr`` decorator, runs the
    module-level ``gate_python_plugin_pr`` over a fresh document, and verifies
    both the annotation the callback adds and the change log entry recorded
    for that addition.
    """
    # The decorator registers do_it as the processing callback used by
    # gatenlp.gate_python_plugin_pr below.
    @GateNlpPr
    def do_it(doc: Document, **kwargs):
        set1 = doc.annset("Set1")
        set1.add(2, 3, "test1", {"f1": "value1"})
        # return nothing

    doc1 = Document("Just a simple document")
    # Attach a change log so the annotation addition gets recorded.
    doc1.changelog = ChangeLog()
    mypr = gatenlp.gate_python_plugin_pr
    mypr.start({"k1": "v1"})  # set the script parms
    mypr.execute(doc1)
    # The callback must have created exactly one annotation set, "Set1".
    assert doc1._annotation_sets is not None
    assert len(doc1._annotation_sets) == 1
    assert "Set1" in doc1._annotation_sets
    myset = doc1.annset("Set1")
    assert len(myset) == 1
    myanns = myset.start_ge(0)
    assert len(myanns) == 1
    myann = next(iter(myanns))
    assert myann is not None
    # Verify offsets, type and features of the single annotation added.
    assert myann.start == 2
    assert myann.end == 3
    assert myann.type == "test1"
    # assert myann.id == 1
    assert "f1" in myann.features
    assert myann.features["f1"] == "value1"
    # The one annotation addition must be reflected as one change log entry.
    mychlog = doc1.changelog
    assert mychlog is not None
    assert len(mychlog) == 1
    mypr.finish()
def test05(self):
    """Test Rules and Pampac: a single rule that adds a NEW annotation
    covering the span of each Ann2>>Ann2 sequence match."""
    # Rules and Pampac
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann1")  # 0
    doc.annset().add(2, 4, "Ann2")  # 1
    doc.annset().add(3, 5, "Ann2")  # 2
    doc.annset().add(4, 5, "Ann2")  # 3
    doc.annset().add(8, 10, "Ann2")  # 4
    annset = doc.annset()
    orig_len = len(annset)
    annlist = list(doc.annset())
    # first make sure the pattern works as we want
    ctx = Context(doc=doc, anns=annlist)
    pat1 = AnnAt("Ann2", name="a1") >> AnnAt("Ann2", name="a2")
    loc = ctx.inc_location(Location(0, 0), by_offset=1)
    ret = pat1.parse(location=loc, context=ctx)

    def r1_action(succ, context=None, **kwargs):
        # Add a NEW annotation spanning the whole match to the output set.
        span = succ[0].span
        ann = succ.context.outset.add(span.start, span.end, "NEW")
        return ann

    r1 = Rule(AnnAt("Ann2") >> AnnAt("Ann2"), r1_action)
    pampac = Pampac(r1)
    # NOTE(review): these assignments overwrite the attributes set_skip /
    # set_select with plain strings; if they are meant to configure skipping
    # and result selection they should probably be constructor arguments or
    # method calls -- confirm against the Pampac API.
    pampac.set_skip = "longest"
    pampac.set_select = "first"
    outset = doc.annset()
    ret = pampac.run(doc, annlist, outset=outset, debug=True)
    # One firing, reported as (annotation index, result list).
    assert len(ret) == 1
    assert len(ret[0]) == 2
    idx, retlist = ret[0]
    assert idx == 1
    assert len(retlist) == 1
    a = retlist[0]
    assert isinstance(a, Annotation)
    # The NEW annotation covers Ann2 at 2-4 followed by Ann2 at 4-5 -> 2-5.
    assert a.start == 2
    assert a.end == 5
    assert a.type == "NEW"
    # Exactly one annotation was added to the (shared) output set.
    assert len(outset) == orig_len + 1
def test01(self):
    """Exercise the basic Pampac parsers (Ann, AnnAt, Call, Find, Seq, N, Or,
    Text) and their operator forms (|, >>, *) on a small fixed document."""
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann")
    doc.annset().add(0, 1, "Ann")
    doc.annset().add(1, 2, "Ann")
    doc.annset().add(1, 2, "Token")
    doc.annset().add(2, 3, "Ann")
    annlist = list(doc.annset())
    ctx = Context(doc, annlist)
    parser = Ann(name="a1")
    ret = parser.parse(Location(), ctx)
    assert isinstance(ret, Success)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 2
    assert loc.ann_location == 1
    assert len(ret[0].data) == 1
    # do this with the match method
    ret = parser(doc, annlist)
    assert isinstance(ret, Success)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 2
    assert loc.ann_location == 1
    assert len(ret[0].data) == 1
    # this does NOT first advance the annotation index so the annotation start index
    # is at least 2. So it matches the annotation at index 1 which ends at 1 which is
    # BEFORE the text index we have now.
    assert loc == Location(2, 1)
    ret = Ann(name="tmp1", useoffset=False).parse(loc, ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc == Location(1, 2)
    assert len(ret[0].data) == 1
    # by default we do advance, so we match the last annotation and end up at text
    # position 4 looking for annotation index 5
    loc = Location(2, 1)
    ret = Ann(name="tmp1", useoffset=True).parse(loc, ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc == Location(3, 5)
    assert len(ret[0].data) == 1
    # Try to fail
    parser = Ann("Token")
    ret = parser(doc, annlist)
    assert isinstance(ret, Failure)
    # Same without a name: should generate the same locations, but no data
    parser = Ann()
    ret = parser.parse(Location(), ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 2
    assert loc.ann_location == 1
    assert len(ret[0].data) == 0
    ret = Ann().parse(loc, ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 3
    assert loc.ann_location == 5
    assert len(ret[0].data) == 0
    parser = AnnAt(name="a2")
    ret = parser.parse(Location(), ctx)
    assert len(ret) == 1
    assert len(ret[0].data) == 1
    parser = AnnAt(matchtype="all", name="a3")
    ret = parser.parse(Location(), ctx)
    assert len(ret) == 2
    assert len(ret[0].data) == 1
    assert len(ret[1].data) == 1
    # Try Rule
    parser = Ann(name="a1")
    tmp = dict(i=0)

    def rhs1(succ, **kwargs):
        # Side-effect action: proves the rule's right-hand side was invoked.
        tmp["i"] = 1

    rule = Call(parser, rhs1)
    ret = rule.parse(Location(), ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 2
    assert loc.ann_location == 1
    assert len(ret[0].data) == 1
    assert tmp["i"] == 1

    # use the call method instead
    def rhs2(succ, **kwargs):
        tmp["i"] = 2

    parser = Ann(name="a1").call(rhs2)
    ret = parser.parse(Location(), ctx)
    print(ret)
    assert tmp["i"] == 2
    # The remaining parsers are only smoke-tested (printed, not asserted).
    parser = Find(AnnAt(type="Token", name="at"), by_anns=False)
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = Find(AnnAt(type="Token", name="at"), by_anns=True)
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = Find(Text("document", name="t1"), by_anns=False)
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = Seq(Ann("Ann", name="a1"), Ann("Ann", name="a2"), matchtype="longest")
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = N(AnnAt("Ann", name="a1"), 1, 5, matchtype="first")
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = Or(Ann("X", name="x1"), Ann("Ann", name="a1"))
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = Ann("X", name="x1") | Ann("Y", name="y1") | Ann("Ann", name="a1")
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = Ann("Ann", name="a1") >> Ann("Ann", name="a2")
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = Ann("Ann", name="a1") * 2
    ret = parser.parse(Location(), ctx)
    print(ret)
    parser = Ann("Ann", name="a1") * (1, 3)
    ret = parser.parse(Location(), ctx)
    print(ret)
def test04(self):
    """Test multiple result matches with N, with and without the until clause."""
    # Test multiple result matches with N, with and without the until clause
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann")  # 0
    doc.annset().add(0, 2, "Ann")  # 1
    doc.annset().add(0, 2, "Token")  # 2
    doc.annset().add(2, 4, "Ann")  # 3
    doc.annset().add(2, 4, "Ann")  # 4
    doc.annset().add(4, 6, "Ann")  # 5
    doc.annset().add(4, 6, "Ann")  # 6
    doc.annset().add(4, 6, "Person")  # 7
    doc.annset().add(6, 8, "Ann")  # 8
    doc.annset().add(6, 8, "Ann")  # 9
    doc.annset().add(8, 10, "XXXX")  # 10
    annlist = list(doc.annset())
    # multiple Anns, single result from N: first
    # This should find 0,3,5
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        min=2,
        max=3,
        select="all",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 3
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5
    # multiple Anns, all results from N
    # should return 0,1
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        min=1,
        max=1,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 2
    assert len(ret[0].data) == 1
    assert len(ret[1].data) == 1
    assert ret[0].data[0]["ann"].id == 0
    assert ret[1].data[0]["ann"].id == 1
    # multiple Anns, all results from N
    # all combinations of the two Anns at 0-2 with the two Anns at 2-4
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        min=1,
        max=2,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 4
    assert len(ret[0].data) == 2
    assert len(ret[1].data) == 2
    assert len(ret[2].data) == 2
    assert len(ret[3].data) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[1].data[0]["ann"].id == 0
    assert ret[1].data[1]["ann"].id == 4
    assert ret[2].data[0]["ann"].id == 1
    assert ret[2].data[1]["ann"].id == 3
    assert ret[3].data[0]["ann"].id == 1
    assert ret[3].data[1]["ann"].id == 4
    # multiple Anns, all results from N
    # just three for the first ann: 0,1,2
    ret = N(
        AnnAt(name="a1", matchtype="all"),
        min=1,
        max=1,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 3
    assert len(ret[0].data) == 1
    assert len(ret[1].data) == 1
    assert len(ret[2].data) == 1
    assert ret[0].data[0]["ann"].id == 0
    assert ret[1].data[0]["ann"].id == 1
    assert ret[2].data[0]["ann"].id == 2
    # This should just find the Token as the first and only match!
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        until=AnnAt("Token", name="t", matchtype="first"),
        min=0,
        max=3,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 1
    assert ret[0].data[0]["ann"].id == 2
    # This should terminate with Person and find all paths that can lead up to PErson:
    # 0,3 0,4 1,3 1,4
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        until=AnnAt("Person", name="t", matchtype="first"),
        min=1,
        max=3,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 4
    assert len(ret[0].data) == 3
    assert len(ret[1].data) == 3
    assert len(ret[2].data) == 3
    assert len(ret[3].data) == 3
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 7
    assert ret[1].data[0]["ann"].id == 0
    assert ret[1].data[1]["ann"].id == 4
    assert ret[1].data[2]["ann"].id == 7
    assert ret[2].data[0]["ann"].id == 1
    assert ret[2].data[1]["ann"].id == 3
    assert ret[2].data[2]["ann"].id == 7
    assert ret[3].data[0]["ann"].id == 1
    assert ret[3].data[1]["ann"].id == 4
    assert ret[3].data[2]["ann"].id == 7
def test03(self):
    """Test single result matches with N, with and without the until clause."""
    # Test single result matches with N, with and without the until clause
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann")  # 0
    doc.annset().add(0, 2, "Ann")  # 1
    doc.annset().add(0, 2, "Token")  # 2
    doc.annset().add(2, 4, "Ann")  # 3
    doc.annset().add(2, 4, "Ann")  # 4
    doc.annset().add(4, 6, "Ann")  # 5
    doc.annset().add(4, 6, "Ann")  # 6
    doc.annset().add(4, 6, "Person")  # 7
    doc.annset().add(6, 8, "Ann")  # 8
    doc.annset().add(6, 8, "Ann")  # 9
    doc.annset().add(8, 10, "XXXX")  # 10
    annlist = list(doc.annset())
    # single Ann, single result from N
    # this should return annotation ids 0, 3, 5
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        min=2,
        max=3,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 3
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5
    # Same as before, but with a name, so we should get one additional data for the whole sequence
    # with a span
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        min=2,
        max=3,
        select="first",
        matchtype="first",
        name="n1",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 4
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5
    assert ret[0].data[3]["span"] == Span(0, 6)
    # single Ann, single result from N
    # this should return annotation ids 0, 3, 5, 8
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        min=2,
        max=99,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 4
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5
    assert ret[0].data[3]["ann"].id == 8
    # single Ann, single result from N, with early stopping at Person
    # this should return annotation ids 0, 3, 7
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        until=AnnAt("Person", name="p"),
        min=2,
        max=99,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 3
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 7
    # Try a match with min=0 and max=99 that does not succeed
    # single Ann, single result from N
    # this should return an empty list for data
    ret = N(
        AnnAt("NotThere", name="a1", matchtype="first"),
        min=0,
        max=99,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 0
    # Try a match with min=0 and max=99 that DOES succeed:
    # same as the unbounded case above, ids 0, 3, 5, 8
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        min=0,
        max=99,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 4
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5
    assert ret[0].data[3]["ann"].id == 8
def test02(self):
    """Test multiple result matches with AnnAt and Seq (first vs. all)."""
    # Test multiple result matches
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann")  # 0
    doc.annset().add(0, 2, "Ann")  # 1
    doc.annset().add(0, 2, "Token")  # 2
    doc.annset().add(2, 4, "Ann")  # 3
    doc.annset().add(2, 4, "Ann")  # 4
    annlist = list(doc.annset())
    # match all annotations at the document start
    ret = AnnAt(matchtype="all").match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 3
    # match sequence Token/Ann, take first at each point
    # this should match annotation ids 2 and 3
    ret = Seq(AnnAt("Token", name="1"), AnnAt("Ann", name="2")).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 2
    assert ret[0].data[0]["ann"].id == 2
    assert ret[0].data[1]["ann"].id == 3
    # match sequence Ann/Ann, take first at each point
    ret = Seq(AnnAt("Ann", name="1"), AnnAt("Ann", name="2")).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    # match sequence Ann/Ann, take first at each point, set useoffset=False so we do not skip to the
    # end offset of the previous before matching the next
    # In that case the next ann we match is the second one at offset 0
    ret = Seq(AnnAt("Ann", name="1"), AnnAt("Ann", name="2", useoffset=False)).match(
        doc, annlist
    )
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 1
    # Make sure we get the correct set of annotations at position 0 and 2
    ret = AnnAt("Ann", name="a", matchtype="all").match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[1].data[0]["ann"].id == 1
    # ret.pprint()
    ret = AnnAt("Ann", name="a", matchtype="all").match(
        doc, annlist, location=Location(2, 2)
    )
    assert ret.issuccess()
    assert len(ret) == 2
    assert ret[0].data[0]["ann"].id == 3
    assert ret[1].data[0]["ann"].id == 4
    # ret.pprint()
    # Match sequence of two anns in order, take all results
    ret = Seq(
        AnnAt("Ann", name="1", matchtype="all"),
        AnnAt("Ann", name="2", matchtype="all"),
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 4
    assert len(ret[0].data) == 2
    assert len(ret[1].data) == 2
    assert len(ret[2].data) == 2
    assert len(ret[3].data) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[1].data[0]["ann"].id == 0
    assert ret[1].data[1]["ann"].id == 4
    assert ret[2].data[0]["ann"].id == 1
    assert ret[2].data[1]["ann"].id == 3
    assert ret[3].data[0]["ann"].id == 1
    assert ret[3].data[1]["ann"].id == 4
def stanfordnlp2gatenlp(
    stanfordnlpdoc,
    gatenlpdoc=None,
    setname="",
    word_type="Word",
    sentence_type="Sentence",
):
    """Convert a StanfordNLP document to a gatenlp document. If a gatenlp document
    is already provided, add the annotations from the StanfordNLP document to it.
    In this case the original gatenlpdoc is used and gets modified.

    Args:
        stanfordnlpdoc: a StanfordNLP document
        gatenlpdoc: if None, a new gatenlp document is created otherwise this
            document is added to. (Default value = None)
        setname: the annotation set name to which the annotations get added, empty
            string for the default annotation set.
        word_type: the annotation type to use for word annotations
            (Default value = "Word")
        sentence_type: the annotation type to use for sentence annotations
            (Default value = "Sentence")

    Returns:
        the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(stanfordnlpdoc.text)
    else:
        retdoc = gatenlpdoc
    annset = retdoc.annset(setname)
    # stanford nlp processes text in sentence chunks, so we do everything per sentence
    # NOTE: the stanford elements do not contain any text offsets, so we have to match and find
    # them ourselves. for this we keep an index to first character in the text which has not
    # been matched yet
    notmatchedidx = 0
    for sent in stanfordnlpdoc.sentences:
        # a sentence is a list of tokens and a list of words. Some tokens consist of several words.
        # dependency parsers are over words, so we create Word and Token annotations, but we only
        # set the features per Word annotation for now.
        offsetinfos = utils.match_substrings(
            stanfordnlpdoc.text[notmatchedidx:], sent.words, getstr=lambda x: x.text
        )
        idx2annid = {}  # map stanfordnlp word index -> gatenlp annotation id
        for oinfo in offsetinfos:
            word = oinfo[2]
            fm = {
                "string": word.text,
                "lemma": word.lemma,
                "upos": word.upos,
                "xpos": word.xpos,
                "dependency_relation": word.dependency_relation,
                "governor": int(word.governor),
            }
            # unpack the "k=v|k=v|..." morphological feature string into features
            for feat in word.feats.split("|"):
                if feat and feat != "_":
                    k, v = feat.split("=")
                    # TODO: maybe try to detect and convert bool/int values
                    fm["feat_" + k] = v
            snlp_idx = int(word.index)
            annid = annset.add(
                oinfo[0] + notmatchedidx, oinfo[1] + notmatchedidx, word_type, fm
            ).id
            idx2annid[snlp_idx] = annid
        # create a sentence annotation from beginning of first word to end of last
        sentid = annset.add(
            offsetinfos[0][0] + notmatchedidx,
            offsetinfos[-1][1] + notmatchedidx,
            sentence_type,
        ).id
        # now replace the governor index with the corresponding annid, the governor index is
        # mapped to the sentence annotation
        idx2annid[0] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            gov = ann.features.get("governor")
            if gov is not None:
                # Robustness fix (consistent with stanza2gatenlp): a governor
                # index may be missing from idx2annid if substring matching
                # failed for a word -- log instead of raising KeyError.
                govid = idx2annid.get(gov)
                if govid is None:
                    logger.error(
                        f"Could not find governor id: {gov} for {ann} in document {retdoc.name}"
                    )
                else:
                    ann.features["governor"] = govid
        notmatchedidx = offsetinfos[-1][1] + notmatchedidx + 1
    return retdoc
def do_it(doc: Document, **kwargs):
    """Add one 'test1' annotation (offsets 2-3, feature f1=value1) to set 'Set1'."""
    doc.annset("Set1").add(2, 3, "test1", {"f1": "value1"})
def stanza2gatenlp(
    stanzadoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    sentence_type="Sentence",
    add_entities=True,
    ent_prefix=None,
):
    """
    Convert a Stanford Stanza document to a gatenlp document. If a gatenlp document
    is already provided, add the annotations from the Stanford Stanza document to it.
    In this case the original gatenlpdoc is used and gets modified.

    Args:
        stanzadoc: a Stanford Stanza document
        gatenlpdoc: if None, a new gatenlp document is created otherwise this
            document is added to. (Default value = None)
        setname: the annotation set name to which the annotations get added, empty
            string for the default annotation set.
        token_type: the annotation type to use for tokens, if needed
            (Default value = "Token")
        sentence_type: the annotation type to use for sentence anntoations
            (Default value = "Sentence")
        add_entities: if True, add any entities as well (Default value = True)
        ent_prefix: if None, use the original entity type as annotation type,
            otherwise add the given string to the annotation type as a prefix.
            (Default value = None)

    Returns:
        the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(stanzadoc.text)
    else:
        retdoc = gatenlpdoc
    annset = retdoc.annset(setname)
    # stanford nlp processes text in sentence chunks, so we do everything per sentence
    for sent in stanzadoc.sentences:
        # go through the tokens: in stanza, each token is a list of dicts, normally there is one dict
        # which also has the offset information in "misc", but for multiword tokens, there seems to be
        # one "header" dict for the range of words which has the offset info and NER label and then
        # one additional element per word which has all the rest.
        # For our purposes we create a list of dicts where for normal tokens we just copy the element,
        # but for multiword tokens we copy over something that has fake offsets and all the features
        newtokens = []
        for t in sent.tokens:
            t = t.to_dict()
            if len(t) == 1:
                newtokens.append(tok2tok(t[0]))
            else:
                # multiword token: the header element carries offsets/NER,
                # the remaining elements carry the per-word features
                tokinfo = tok2tok(t[0])
                words = t[1:]
                fm = tokinfo.get("fm")
                ner = fm.get("ner")
                text = fm.get("text")
                start = tokinfo["start"]
                end = tokinfo["end"]
                for i, w in enumerate(words):
                    tok = tok2tok(w)
                    tok["fm"]["ner"] = ner
                    tok["fm"]["token_text"] = text
                    # give each word a fake 1-char offset range inside the token,
                    # clamped so it never goes past the token end
                    # (renamed from `os` to avoid shadowing the os module)
                    wstart = min(start + i, end - 1)
                    tok["start"] = wstart
                    if i == len(words) - 1:
                        tok["end"] = end
                    else:
                        tok["end"] = wstart + 1
                    newtokens.append(tok)
        # now go through the new token list and create annotations
        idx2annid = {}  # map stanza word id to annotation id
        starts = []
        ends = []
        for t in newtokens:
            start = t["start"]
            end = t["end"]
            stanzaid = t["id"]
            starts.append(start)
            ends.append(end)
            annid = annset.add(start, end, token_type, features=t["fm"]).id
            idx2annid[str(stanzaid)] = annid
        # create a sentence annotation from beginning of first word to end of last
        sentid = annset.add(starts[0], ends[-1], sentence_type).id
        # now replace the head index with the corresponding annid, the head index "0" is
        # mapped to the sentence annotation
        idx2annid["0"] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            hd = ann.features.get("head")
            if hd is not None:
                hd = str(hd)
                head_annid = idx2annid.get(hd)
                if head_annid is None:
                    # BUGFIX: report the document via retdoc -- gatenlpdoc may be
                    # None here (a fresh document was created above), in which
                    # case gatenlpdoc.name raised AttributeError.
                    logger.error(
                        f"Could not find head id: {hd} for {ann} in document {retdoc.name}"
                    )
                else:
                    # reuse the already-computed lookup instead of a second
                    # (previously inconsistent) idx2annid[hd] access
                    ann.features["head"] = head_annid
    # add the entities
    if add_entities:
        for e in stanzadoc.entities:
            if ent_prefix:
                anntype = ent_prefix + e.type
            else:
                anntype = e.type
            annset.add(e.start_char, e.end_char, anntype)
    return retdoc
def spacy2gatenlp(
    spacydoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    spacetoken_type="SpaceToken",
    sentence_type="Sentence",
    nounchunk_type="NounChunk",
    add_tokens=True,
    # add_spacetokens=True,  # not sure how to do this yet
    add_ents=True,
    add_sents=True,
    add_nounchunks=True,
    add_dep=True,
    ent_prefix=None,
):
    """Convert a spacy document to a gatenlp document. If a gatenlp document is
    already provided, add the annotations from the spacy document to it. In this
    case the original gatenlpdoc is used and gets modified.

    Args:
        spacydoc: a spacy document
        gatenlpdoc: if None, a new gatenlp document is created otherwise this
            document is added to. (Default value = None)
        setname: the annotation set name to which the annotations get added, empty
            string for the default annotation set.
        token_type: the annotation type to use for tokens (Default value = "Token")
        spacetoken_type: the annotation type to use for space tokens
            (Default value = "SpaceToken")
        sentence_type: the annotation type to use for sentence annotations
            (Default value = "Sentence")
        nounchunk_type: the annotation type to use for noun chunk annotations
            (Default value = "NounChunk")
        add_tokens: should dependency parse edges get attached to the token
            annotations? NOTE(review): token annotations themselves are always
            added below regardless of this flag; it only gates the dependency
            edge pass together with add_dep -- confirm intended semantics.
            (Default value = True)
        add_ents: should annotations for entities get added (Default value = True)
        add_sents: should sentence annotations get added (Default value = True)
        add_nounchunks: should noun chunk annotations get added
            (Default value = True)
        add_dep: should dependency parser information get added
            (Default value = True)
        ent_prefix: if not None, prefix added to entity annotation type names
            (Default value = None)

    Returns:
        the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(spacydoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}  # map spacy token index (tok.i) -> gatenlp annotation id
    annset = retdoc.annset(setname)
    for tok in spacydoc:
        from_off = tok.idx
        to_off = tok.idx + len(tok)
        is_space = tok.is_space
        # copy the spacy token attributes into a gatenlp feature map
        fm = {
            "_i": tok.i,
            "is_alpha": tok.is_alpha,
            "is_bracket": tok.is_bracket,
            "is_currency": tok.is_currency,
            "is_digit": tok.is_digit,
            "is_left_punct": tok.is_left_punct,
            "is_lower": tok.is_lower,
            "is_oov": tok.is_oov,
            "is_punct": tok.is_punct,
            "is_quote": tok.is_quote,
            "is_right_punct": tok.is_right_punct,
            "is_sent_start": tok.is_sent_start,
            "is_space": tok.is_space,
            "is_stop": tok.is_stop,
            "is_title": tok.is_title,
            "is_upper": tok.is_upper,
            "lang": tok.lang_,
            "lemma": tok.lemma_,
            "like_email": tok.like_email,
            "like_num": tok.like_num,
            "like_url": tok.like_url,
            "orth": tok.orth,
            "pos": tok.pos_,
            "prefix": tok.prefix_,
            "prob": tok.prob,
            "rank": tok.rank,
            "sentiment": tok.sentiment,
            "tag": tok.tag_,
            "shape": tok.shape_,
            "suffix": tok.suffix_,
        }
        if spacydoc.is_nered and add_ents:
            fm["ent_type"] = tok.ent_type_
        if spacydoc.is_parsed and add_dep:
            fm["dep"] = tok.dep_
        # whitespace-only tokens get the space-token type instead
        if tok.is_space:
            anntype = spacetoken_type
        else:
            anntype = token_type
        annid = annset.add(from_off, to_off, anntype, fm).id
        toki2annid[tok.i] = annid
        # print("Added annotation with id: {} for token {}".format(annid, tok.i))
        # trailing whitespace after the token becomes its own SpaceToken
        ws = tok.whitespace_
        if len(ws) > 0:
            annset.add(to_off, to_off + len(ws), spacetoken_type, {"is_space": True})
    # if we have a dependency parse, now also add the parse edges
    if spacydoc.is_parsed and add_tokens and add_dep:
        for tok in spacydoc:
            ann = annset.get(toki2annid[tok.i])
            ann.features["head"] = toki2annid[tok.head.i]
            ann.features["left_edge"] = toki2annid[tok.left_edge.i]
            ann.features["right_edge"] = toki2annid[tok.right_edge.i]
    if spacydoc.ents and add_ents:
        for ent in spacydoc.ents:
            if ent_prefix:
                entname = ent_prefix + ent.label_
            else:
                entname = ent.label_
            annset.add(ent.start_char, ent.end_char, entname, {"lemma": ent.lemma_})
    if spacydoc.sents and add_sents:
        for sent in spacydoc.sents:
            annset.add(sent.start_char, sent.end_char, sentence_type, {})
    if spacydoc.noun_chunks and add_nounchunks:
        for chunk in spacydoc.noun_chunks:
            annset.add(chunk.start_char, chunk.end_char, nounchunk_type, {})
    return retdoc