def test_interaction01_01(self):
    @GateNlpPr
    def do_it(doc: Document, **kwargs):
        set1 = doc.annset("Set1")
        set1.add(2, 3, "test1", {"f1": "value1"})
        # return nothing

    doc1 = Document("Just a simple document")
    doc1.changelog = ChangeLog()
    mypr = gatenlp.gate_python_plugin_pr
    mypr.start({"k1": "v1"})  # set the script parms
    mypr.execute(doc1)
    assert doc1._annotation_sets is not None
    assert len(doc1._annotation_sets) == 1
    assert "Set1" in doc1._annotation_sets
    myset = doc1.annset("Set1")
    assert len(myset) == 1
    myanns = myset.start_ge(0)
    assert len(myanns) == 1
    myann = next(iter(myanns))
    assert myann is not None
    assert myann.start == 2
    assert myann.end == 3
    assert myann.type == "test1"
    # assert myann.id == 1
    assert "f1" in myann.features
    assert myann.features["f1"] == "value1"
    mychlog = doc1.changelog
    assert mychlog is not None
    assert len(mychlog) == 1
    mypr.finish()
def test_interaction01_01(self):
    # first: use the DefaultPr
    mypr = _pr_decorator(DefaultPr())
    doc1 = Document("Just a simple document")
    mypr.start({"k1": "v1"})  # set the script parms
    mypr.execute(doc1)
    mypr.finish()
def gdoc2pdoc(self, gdoc):
    """
    Convert the GATE document to a python document and return it.

    Args:
        gdoc: the handle to a GATE document

    Returns:
        a gatenlp Document instance
    """
    bjs = self.slave.getBdocJson(gdoc)
    return Document.load_mem(bjs, fmt="bdocjs")
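# The conversion above goes through the Bdoc JSON interchange format. Below is a
# small, self-contained sketch of the same round trip done purely on the Python
# side (no Java GATE process involved), using save_mem()/load_mem() with the
# "bdocjs" format; it only illustrates the mechanism gdoc2pdoc() relies on.
from gatenlp import Document

doc = Document("A tiny example document")
doc.annset().add(2, 6, "Token", {"kind": "word"})
bjs = doc.save_mem(fmt="bdocjs")             # same representation getBdocJson() returns
doc2 = Document.load_mem(bjs, fmt="bdocjs")  # same call gdoc2pdoc() uses
assert doc2.text == doc.text
assert len(doc2.annset()) == 1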
def test01(self):
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann")
    doc.annset().add(0, 1, "Ann")
    doc.annset().add(1, 2, "Ann")
    doc.annset().add(1, 2, "Token")
    doc.annset().add(2, 3, "Ann")
    annlist = list(doc.annset())
    ctx = Context(doc, annlist)
    parser = Ann(name="a1")
    ret = parser.parse(Location(), ctx)
    assert isinstance(ret, Success)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 2
    assert loc.ann_location == 1
    assert len(ret[0].data) == 1

    # do this with the match method
    ret = parser(doc, annlist)
    assert isinstance(ret, Success)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 2
    assert loc.ann_location == 1
    assert len(ret[0].data) == 1

    # this does NOT first advance the annotation index so the annotation start index
    # is at least 2. So it matches the annotation at index 1 which ends at 1 which is
    # BEFORE the text index we have now.
    assert loc == Location(2, 1)
    ret = Ann(name="tmp1", useoffset=False).parse(loc, ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc == Location(1, 2)
    assert len(ret[0].data) == 1

    # by default we do advance, so we match the last annotation and end up at text
    # position 4 looking for annotation index 5
    loc = Location(2, 1)
    ret = Ann(name="tmp1", useoffset=True).parse(loc, ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc == Location(3, 5)
    assert len(ret[0].data) == 1

    # Try to fail
    parser = Ann("Token")
    ret = parser(doc, annlist)
    assert isinstance(ret, Failure)

    # Same without a name: should generate the same locations, but no data
    parser = Ann()
    ret = parser.parse(Location(), ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 2
    assert loc.ann_location == 1
    assert len(ret[0].data) == 0
    ret = Ann().parse(loc, ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 3
    assert loc.ann_location == 5
    assert len(ret[0].data) == 0

    parser = AnnAt(name="a2")
    ret = parser.parse(Location(), ctx)
    assert len(ret) == 1
    assert len(ret[0].data) == 1

    parser = AnnAt(matchtype="all", name="a3")
    ret = parser.parse(Location(), ctx)
    assert len(ret) == 2
    assert len(ret[0].data) == 1
    assert len(ret[1].data) == 1

    # Try Rule
    parser = Ann(name="a1")
    tmp = dict(i=0)

    def rhs1(succ, **kwargs):
        tmp["i"] = 1

    rule = Call(parser, rhs1)
    ret = rule.parse(Location(), ctx)
    assert len(ret) == 1
    loc = ret[0].location
    assert loc.text_location == 2
    assert loc.ann_location == 1
    assert len(ret[0].data) == 1
    assert tmp["i"] == 1

    # use the call method instead
    def rhs2(succ, **kwargs):
        tmp["i"] = 2

    parser = Ann(name="a1").call(rhs2)
    ret = parser.parse(Location(), ctx)
    print(ret)
    assert tmp["i"] == 2

    parser = Find(AnnAt(type="Token", name="at"), by_anns=False)
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = Find(AnnAt(type="Token", name="at"), by_anns=True)
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = Find(Text("document", name="t1"), by_anns=False)
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = Seq(Ann("Ann", name="a1"), Ann("Ann", name="a2"), matchtype="longest")
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = N(AnnAt("Ann", name="a1"), 1, 5, matchtype="first")
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = Or(Ann("X", name="x1"), Ann("Ann", name="a1"))
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = Ann("X", name="x1") | Ann("Y", name="y1") | Ann("Ann", name="a1")
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = Ann("Ann", name="a1") >> Ann("Ann", name="a2")
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = Ann("Ann", name="a1") * 2
    ret = parser.parse(Location(), ctx)
    print(ret)

    parser = Ann("Ann", name="a1") * (1, 3)
    ret = parser.parse(Location(), ctx)
    print(ret)
def test05(self):
    # Rules and Pampac
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann1")   # 0
    doc.annset().add(2, 4, "Ann2")   # 1
    doc.annset().add(3, 5, "Ann2")   # 2
    doc.annset().add(4, 5, "Ann2")   # 3
    doc.annset().add(8, 10, "Ann2")  # 4
    annset = doc.annset()
    orig_len = len(annset)
    annlist = list(doc.annset())

    # first make sure the pattern works as we want
    ctx = Context(doc=doc, anns=annlist)
    pat1 = AnnAt("Ann2", name="a1") >> AnnAt("Ann2", name="a2")
    loc = ctx.inc_location(Location(0, 0), by_offset=1)
    ret = pat1.parse(location=loc, context=ctx)

    def r1_action(succ, context=None, **kwargs):
        span = succ[0].span
        ann = succ.context.outset.add(span.start, span.end, "NEW")
        return ann

    r1 = Rule(AnnAt("Ann2") >> AnnAt("Ann2"), r1_action)
    pampac = Pampac(r1)
    pampac.set_skip = "longest"
    pampac.set_select = "first"
    outset = doc.annset()
    ret = pampac.run(doc, annlist, outset=outset, debug=True)
    assert len(ret) == 1
    assert len(ret[0]) == 2
    idx, retlist = ret[0]
    assert idx == 1
    assert len(retlist) == 1
    a = retlist[0]
    assert isinstance(a, Annotation)
    assert a.start == 2
    assert a.end == 5
    assert a.type == "NEW"
    assert len(outset) == orig_len + 1
def test04(self):
    # Test multiple result matches with N, with and without the until clause
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann")      # 0
    doc.annset().add(0, 2, "Ann")      # 1
    doc.annset().add(0, 2, "Token")    # 2
    doc.annset().add(2, 4, "Ann")      # 3
    doc.annset().add(2, 4, "Ann")      # 4
    doc.annset().add(4, 6, "Ann")      # 5
    doc.annset().add(4, 6, "Ann")      # 6
    doc.annset().add(4, 6, "Person")   # 7
    doc.annset().add(6, 8, "Ann")      # 8
    doc.annset().add(6, 8, "Ann")      # 9
    doc.annset().add(8, 10, "XXXX")    # 10
    annlist = list(doc.annset())

    # multiple Anns, single result from N: first
    # This should find 0,3,5
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        min=2,
        max=3,
        select="all",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 3
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5

    # multiple Anns, all results from N
    # should return 0,1
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        min=1,
        max=1,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 2
    assert len(ret[0].data) == 1
    assert len(ret[1].data) == 1
    assert ret[0].data[0]["ann"].id == 0
    assert ret[1].data[0]["ann"].id == 1

    # multiple Anns, all results from N
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        min=1,
        max=2,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 4
    assert len(ret[0].data) == 2
    assert len(ret[1].data) == 2
    assert len(ret[2].data) == 2
    assert len(ret[3].data) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[1].data[0]["ann"].id == 0
    assert ret[1].data[1]["ann"].id == 4
    assert ret[2].data[0]["ann"].id == 1
    assert ret[2].data[1]["ann"].id == 3
    assert ret[3].data[0]["ann"].id == 1
    assert ret[3].data[1]["ann"].id == 4

    # multiple Anns, all results from N
    # just three for the first ann: 0,1,2
    ret = N(
        AnnAt(name="a1", matchtype="all"),
        min=1,
        max=1,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 3
    assert len(ret[0].data) == 1
    assert len(ret[1].data) == 1
    assert len(ret[2].data) == 1
    assert ret[0].data[0]["ann"].id == 0
    assert ret[1].data[0]["ann"].id == 1
    assert ret[2].data[0]["ann"].id == 2

    # This should just find the Token as the first and only match!
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        until=AnnAt("Token", name="t", matchtype="first"),
        min=0,
        max=3,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 1
    assert ret[0].data[0]["ann"].id == 2

    # This should terminate with Person and find all paths that can lead up to Person:
    # 0,3 0,4 1,3 1,4
    ret = N(
        AnnAt("Ann", name="a1", matchtype="all"),
        until=AnnAt("Person", name="t", matchtype="first"),
        min=1,
        max=3,
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 4
    assert len(ret[0].data) == 3
    assert len(ret[1].data) == 3
    assert len(ret[2].data) == 3
    assert len(ret[3].data) == 3
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 7
    assert ret[1].data[0]["ann"].id == 0
    assert ret[1].data[1]["ann"].id == 4
    assert ret[1].data[2]["ann"].id == 7
    assert ret[2].data[0]["ann"].id == 1
    assert ret[2].data[1]["ann"].id == 3
    assert ret[2].data[2]["ann"].id == 7
    assert ret[3].data[0]["ann"].id == 1
    assert ret[3].data[1]["ann"].id == 4
    assert ret[3].data[2]["ann"].id == 7
def test03(self):
    # Test single result matches with N, with and without the until clause
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann")      # 0
    doc.annset().add(0, 2, "Ann")      # 1
    doc.annset().add(0, 2, "Token")    # 2
    doc.annset().add(2, 4, "Ann")      # 3
    doc.annset().add(2, 4, "Ann")      # 4
    doc.annset().add(4, 6, "Ann")      # 5
    doc.annset().add(4, 6, "Ann")      # 6
    doc.annset().add(4, 6, "Person")   # 7
    doc.annset().add(6, 8, "Ann")      # 8
    doc.annset().add(6, 8, "Ann")      # 9
    doc.annset().add(8, 10, "XXXX")    # 10
    annlist = list(doc.annset())

    # single Ann, single result from N
    # this should return annotation ids 0, 3, 5
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        min=2,
        max=3,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 3
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5

    # Same as before, but with a name, so we should get one additional data for the whole sequence
    # with a span
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        min=2,
        max=3,
        select="first",
        matchtype="first",
        name="n1",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 4
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5
    assert ret[0].data[3]["span"] == Span(0, 6)

    # single Ann, single result from N
    # this should return annotation ids 0, 3, 5, 8
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        min=2,
        max=99,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 4
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5
    assert ret[0].data[3]["ann"].id == 8

    # single Ann, single result from N, with early stopping at Person
    # this should return annotation ids 0, 3, 7
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        until=AnnAt("Person", name="p"),
        min=2,
        max=99,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 3
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 7

    # Try a match with min=0 and max=99 where the inner parser never matches
    # single Ann, single result from N
    # this should still succeed and return an empty list for data
    ret = N(
        AnnAt("NotThere", name="a1", matchtype="first"),
        min=0,
        max=99,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 0

    # Try a match with min=0 and max=99 where the inner parser does match
    # single Ann, single result from N
    # this should return annotation ids 0, 3, 5, 8
    ret = N(
        AnnAt("Ann", name="a1", matchtype="first"),
        min=0,
        max=99,
        select="first",
        matchtype="first",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 4
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[0].data[2]["ann"].id == 5
    assert ret[0].data[3]["ann"].id == 8
def test02(self):
    # Test multiple result matches
    doc = Document("Some test document")
    doc.annset().add(0, 2, "Ann")    # 0
    doc.annset().add(0, 2, "Ann")    # 1
    doc.annset().add(0, 2, "Token")  # 2
    doc.annset().add(2, 4, "Ann")    # 3
    doc.annset().add(2, 4, "Ann")    # 4
    annlist = list(doc.annset())

    # match all annotations at the document start
    ret = AnnAt(matchtype="all").match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 3

    # match sequence Token/Ann, take first at each point
    # this should match annotation ids 2 and 3
    ret = Seq(AnnAt("Token", name="1"), AnnAt("Ann", name="2")).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 2
    assert ret[0].data[0]["ann"].id == 2
    assert ret[0].data[1]["ann"].id == 3

    # match sequence Ann/Ann, take first at each point
    ret = Seq(AnnAt("Ann", name="1"), AnnAt("Ann", name="2")).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3

    # match sequence Ann/Ann, take first at each point, set useoffset=False so we do not skip to the
    # end offset of the previous before matching the next
    # In that case the next ann we match is the second one at offset 0
    ret = Seq(AnnAt("Ann", name="1"), AnnAt("Ann", name="2", useoffset=False)).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 1
    assert len(ret[0].data) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 1

    # Make sure we get the correct set of annotations at position 0 and 2
    ret = AnnAt("Ann", name="a", matchtype="all").match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[1].data[0]["ann"].id == 1
    # ret.pprint()
    ret = AnnAt("Ann", name="a", matchtype="all").match(doc, annlist, location=Location(2, 2))
    assert ret.issuccess()
    assert len(ret) == 2
    assert ret[0].data[0]["ann"].id == 3
    assert ret[1].data[0]["ann"].id == 4
    # ret.pprint()

    # Match sequence of two anns in order, take all results
    ret = Seq(
        AnnAt("Ann", name="1", matchtype="all"),
        AnnAt("Ann", name="2", matchtype="all"),
        select="all",
        matchtype="all",
    ).match(doc, annlist)
    assert ret.issuccess()
    assert len(ret) == 4
    assert len(ret[0].data) == 2
    assert len(ret[1].data) == 2
    assert len(ret[2].data) == 2
    assert len(ret[3].data) == 2
    assert ret[0].data[0]["ann"].id == 0
    assert ret[0].data[1]["ann"].id == 3
    assert ret[1].data[0]["ann"].id == 0
    assert ret[1].data[1]["ann"].id == 4
    assert ret[2].data[0]["ann"].id == 1
    assert ret[2].data[1]["ann"].id == 3
    assert ret[3].data[0]["ann"].id == 1
    assert ret[3].data[1]["ann"].id == 4
def stanfordnlp2gatenlp(
    stanfordnlpdoc,
    gatenlpdoc=None,
    setname="",
    word_type="Word",
    sentence_type="Sentence",
):
    """Convert a StanfordNLP document to a gatenlp document. If a gatenlp document is
    already provided, add the annotations from the StanfordNLP document to it. In this
    case the original gatenlpdoc is used and gets modified.

    Args:
        stanfordnlpdoc: a StanfordNLP document
        gatenlpdoc: if None, a new gatenlp document is created, otherwise this
            document is added to. (Default value = None)
        setname: the annotation set name to which the annotations get added, empty string
            for the default annotation set.
        word_type: the annotation type to use for words (Default value = "Word")
        sentence_type: the annotation type to use for sentence annotations
            (Default value = "Sentence")

    Returns:
        the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(stanfordnlpdoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    # stanford nlp processes text in sentence chunks, so we do everything per sentence
    # NOTE: the stanford elements do not contain any text offsets, so we have to match and find
    # them ourselves. For this we keep an index to the first character in the text which has not
    # been matched yet
    notmatchedidx = 0
    for sent in stanfordnlpdoc.sentences:
        # a sentence is a list of tokens and a list of words. Some tokens consist of several words.
        # dependency parses are over words, so we create Word and Token annotations, but we only
        # set the features per Word annotation for now.
        offsetinfos = utils.match_substrings(
            stanfordnlpdoc.text[notmatchedidx:], sent.words, getstr=lambda x: x.text
        )
        idx2annid = {}
        for oinfo in offsetinfos:
            word = oinfo[2]
            fm = {
                "string": word.text,
                "lemma": word.lemma,
                "upos": word.upos,
                "xpos": word.xpos,
                "dependency_relation": word.dependency_relation,
                "governor": int(word.governor),
            }
            for feat in word.feats.split("|"):
                if feat and feat != "_":
                    k, v = feat.split("=")
                    # TODO: maybe try to detect and convert bool/int values
                    fm["feat_" + k] = v
            snlp_idx = int(word.index)
            annid = annset.add(
                oinfo[0] + notmatchedidx, oinfo[1] + notmatchedidx, word_type, fm
            ).id
            idx2annid[snlp_idx] = annid
        # create a sentence annotation from beginning of first word to end of last
        sentid = annset.add(
            offsetinfos[0][0] + notmatchedidx,
            offsetinfos[-1][1] + notmatchedidx,
            sentence_type,
        ).id
        # now replace the governor index with the corresponding annid, the governor index 0 is
        # mapped to the sentence annotation
        idx2annid[0] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            gov = ann.features.get("governor")
            if gov is not None:
                ann.features["governor"] = idx2annid[gov]
        notmatchedidx = offsetinfos[-1][1] + notmatchedidx + 1
    return retdoc
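# Hedged usage sketch for stanfordnlp2gatenlp(): the pipeline setup below follows the
# stanfordnlp library's usual pattern (model download plus Pipeline()), but the exact
# setup is an assumption here, not something defined in this module.
import stanfordnlp

# stanfordnlp.download("en")            # one-time model download (assumed prerequisite)
nlp = stanfordnlp.Pipeline()            # default English pipeline
sdoc = nlp("Barack Obama was born in Hawaii.")
gdoc = stanfordnlp2gatenlp(sdoc)        # new gatenlp Document with Word/Sentence annotations
print(len(gdoc.annset()))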
def do_it(doc: Document, **kwargs):
    set1 = doc.annset("Set1")
    set1.add(2, 3, "test1", {"f1": "value1"})
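# Hedged sketch of how a function like do_it() is typically exposed to GATE's Python
# plugin: it gets wrapped with the @GateNlpPr decorator (as in the tests above) and the
# script calls an interact() entry point when run by the plugin. Treat the exact import
# of interact() as an assumption if your gatenlp version differs.
from gatenlp import Document, GateNlpPr, interact

@GateNlpPr
def do_it(doc: Document, **kwargs):
    set1 = doc.annset("Set1")
    set1.add(2, 3, "test1", {"f1": "value1"})

if __name__ == "__main__":
    interact()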
def stanza2gatenlp(
    stanzadoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    sentence_type="Sentence",
    add_entities=True,
    ent_prefix=None,
):
    """
    Convert a Stanford Stanza document to a gatenlp document. If a gatenlp document is
    already provided, add the annotations from the Stanford Stanza document to it. In this
    case the original gatenlpdoc is used and gets modified.

    Args:
        stanzadoc: a Stanford Stanza document
        gatenlpdoc: if None, a new gatenlp document is created, otherwise this
            document is added to. (Default value = None)
        setname: the annotation set name to which the annotations get added, empty string
            for the default annotation set.
        token_type: the annotation type to use for tokens, if needed (Default value = "Token")
        sentence_type: the annotation type to use for sentence annotations
            (Default value = "Sentence")
        add_entities: if True, add any entities as well (Default value = True)
        ent_prefix: if None, use the original entity type as annotation type, otherwise add the
            given string to the annotation type as a prefix. (Default value = None)

    Returns:
        the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(stanzadoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    # stanza processes text in sentence chunks, so we do everything per sentence
    notmatchedidx = 0
    for sent in stanzadoc.sentences:
        # go through the tokens: in stanza, each token is a list of dicts, normally there is one dict
        # which also has the offset information in "misc", but for multiword tokens, there seems to be
        # one "header" dict for the range of words which has the offset info and NER label and then
        # one additional element per word which has all the rest.
        # For our purposes we create a list of dicts where for normal tokens we just copy the element,
        # but for multiword tokens we copy over something that has fake offsets and all the features
        newtokens = []
        for t in sent.tokens:
            t = t.to_dict()
            if len(t) == 1:
                newtokens.append(tok2tok(t[0]))
            else:
                tokinfo = tok2tok(t[0])
                words = t[1:]
                fm = tokinfo.get("fm")
                ner = fm.get("ner")
                text = fm.get("text")
                start = tokinfo["start"]
                end = tokinfo["end"]
                for i, w in enumerate(words):
                    tok = tok2tok(w)
                    tok["fm"]["ner"] = ner
                    tok["fm"]["token_text"] = text
                    os = min(start + i, end - 1)
                    tok["start"] = os
                    if i == len(words) - 1:
                        tok["end"] = end
                    else:
                        tok["end"] = os + 1
                    newtokens.append(tok)
        # print(f"\n!!!!!!DEBUG: newtokens={newtokens}")
        # now go through the new token list and create annotations
        idx2annid = {}  # map stanza word id to annotation id
        starts = []
        ends = []
        for t in newtokens:
            start = t["start"]
            end = t["end"]
            stanzaid = t["id"]
            starts.append(start)
            ends.append(end)
            annid = annset.add(start, end, token_type, features=t["fm"]).id
            idx2annid[str(stanzaid)] = annid
        # print(f"\n!!!!!!DEBUG: idx2annid={idx2annid}")
        # create a sentence annotation from beginning of first word to end of last
        sentid = annset.add(starts[0], ends[-1], sentence_type).id
        # now replace the head index with the corresponding annid, the head index "0" is
        # mapped to the sentence annotation
        idx2annid["0"] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            hd = ann.features.get("head")
            if hd is not None:
                hd = str(hd)
                headId = idx2annid.get(hd)
                if headId is None:
                    logger.error(
                        f"Could not find head id: {hd} for {ann} in document {gatenlpdoc.name}"
                    )
                else:
                    ann.features["head"] = idx2annid[hd]
    # add the entities
    if add_entities:
        for e in stanzadoc.entities:
            if ent_prefix:
                anntype = ent_prefix + e.type
            else:
                anntype = e.type
            annset.add(e.start_char, e.end_char, anntype)
    return retdoc
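# Hedged usage sketch for stanza2gatenlp(): the Stanza pipeline construction follows the
# library's documented pattern, but the model download and processor defaults are
# assumptions here rather than part of this module.
import stanza

# stanza.download("en")                  # one-time model download (assumed prerequisite)
nlp = stanza.Pipeline("en")
sdoc = nlp("Barack Obama was born in Hawaii.")
gdoc = stanza2gatenlp(sdoc, setname="stanza", add_entities=True)
for ann in gdoc.annset("stanza"):
    print(ann.start, ann.end, ann.type)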
def spacy2gatenlp(
    spacydoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    spacetoken_type="SpaceToken",
    sentence_type="Sentence",
    nounchunk_type="NounChunk",
    add_tokens=True,
    # add_spacetokens=True,  # not sure how to do this yet
    add_ents=True,
    add_sents=True,
    add_nounchunks=True,
    add_dep=True,
    ent_prefix=None,
):
    """Convert a spacy document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the spacy document to it. In this case the original
    gatenlpdoc is used and gets modified.

    Args:
        spacydoc: a spacy document
        gatenlpdoc: if None, a new gatenlp document is created, otherwise this
            document is added to. (Default value = None)
        setname: the annotation set name to which the annotations get added, empty string
            for the default annotation set.
        token_type: the annotation type to use for tokens (Default value = "Token")
        spacetoken_type: the annotation type to use for space tokens (Default value = "SpaceToken")
        sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
        nounchunk_type: the annotation type to use for noun chunk annotations (Default value = "NounChunk")
        add_tokens: should annotations for tokens get added? If not, dependency parser
            info cannot be added either. (Default value = True)
        add_ents: should annotations for entities get added (Default value = True)
        add_sents: should sentence annotations get added (Default value = True)
        add_nounchunks: should noun chunk annotations get added (Default value = True)
        add_dep: should dependency parser information get added (Default value = True)
        ent_prefix: if not None, a prefix to add to the entity annotation types (Default value = None)

    Returns:
        the new or modified gatenlp document
    """
    if gatenlpdoc is None:
        retdoc = Document(spacydoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    for tok in spacydoc:
        from_off = tok.idx
        to_off = tok.idx + len(tok)
        is_space = tok.is_space
        fm = {
            "_i": tok.i,
            "is_alpha": tok.is_alpha,
            "is_bracket": tok.is_bracket,
            "is_currency": tok.is_currency,
            "is_digit": tok.is_digit,
            "is_left_punct": tok.is_left_punct,
            "is_lower": tok.is_lower,
            "is_oov": tok.is_oov,
            "is_punct": tok.is_punct,
            "is_quote": tok.is_quote,
            "is_right_punct": tok.is_right_punct,
            "is_sent_start": tok.is_sent_start,
            "is_space": tok.is_space,
            "is_stop": tok.is_stop,
            "is_title": tok.is_title,
            "is_upper": tok.is_upper,
            "lang": tok.lang_,
            "lemma": tok.lemma_,
            "like_email": tok.like_email,
            "like_num": tok.like_num,
            "like_url": tok.like_url,
            "orth": tok.orth,
            "pos": tok.pos_,
            "prefix": tok.prefix_,
            "prob": tok.prob,
            "rank": tok.rank,
            "sentiment": tok.sentiment,
            "tag": tok.tag_,
            "shape": tok.shape_,
            "suffix": tok.suffix_,
        }
        if spacydoc.is_nered and add_ents:
            fm["ent_type"] = tok.ent_type_
        if spacydoc.is_parsed and add_dep:
            fm["dep"] = tok.dep_
        if tok.is_space:
            anntype = spacetoken_type
        else:
            anntype = token_type
        annid = annset.add(from_off, to_off, anntype, fm).id
        toki2annid[tok.i] = annid
        # print("Added annotation with id: {} for token {}".format(annid, tok.i))
        ws = tok.whitespace_
        if len(ws) > 0:
            annset.add(to_off, to_off + len(ws), spacetoken_type, {"is_space": True})
    # if we have a dependency parse, now also add the parse edges
    if spacydoc.is_parsed and add_tokens and add_dep:
        for tok in spacydoc:
            ann = annset.get(toki2annid[tok.i])
            ann.features["head"] = toki2annid[tok.head.i]
            ann.features["left_edge"] = toki2annid[tok.left_edge.i]
            ann.features["right_edge"] = toki2annid[tok.right_edge.i]
    if spacydoc.ents and add_ents:
        for ent in spacydoc.ents:
            if ent_prefix:
                entname = ent_prefix + ent.label_
            else:
                entname = ent.label_
            annset.add(ent.start_char, ent.end_char, entname, {"lemma": ent.lemma_})
    if spacydoc.sents and add_sents:
        for sent in spacydoc.sents:
            annset.add(sent.start_char, sent.end_char, sentence_type, {})
    if spacydoc.noun_chunks and add_nounchunks:
        for chunk in spacydoc.noun_chunks:
            annset.add(chunk.start_char, chunk.end_char, nounchunk_type, {})
    return retdoc
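# Hedged usage sketch for spacy2gatenlp(): the model name "en_core_web_sm" is an
# assumption; any installed spaCy pipeline would do.
import spacy

nlp = spacy.load("en_core_web_sm")       # assumes the model has been downloaded
sdoc = nlp("Barack Obama was born in Hawaii.")
gdoc = spacy2gatenlp(sdoc, setname="spacy", ent_prefix="spacy_")
print(sorted({a.type for a in gdoc.annset("spacy")}))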
if not os.path.exists(args.indir):
    raise Exception("Does not exist: {}".format(args.indir))
if not os.path.exists(args.outdir):
    raise Exception("Does not exist: {}".format(args.outdir))
gen = Path(args.indir).rglob("*.bdocjs")
total_readorig = 0
total_save = 0
total_read = 0
newfiles = []
for f in gen:
    relpath = str(f)
    start = time.time()
    doc = Document.load(relpath, fmt=args.infmt)
    total_readorig += time.time() - start
    relpath = relpath.replace(os.path.sep, "_")
    relpath = relpath.replace(".bdocjs", args.fmt)
    newfile = os.path.join(args.outdir, relpath)
    newfiles.append(newfile)
    start = time.time()
    doc.save(newfile, fmt=args.fmt)
    total_save += time.time() - start
for f in newfiles:
    start = time.time()
    doc = Document.load(f, fmt=args.fmt)
    total_read += time.time() - start
n = len(newfiles)
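# The timing loop above reads indir/outdir/infmt/fmt from an argparse namespace that is
# not shown in this snippet. A minimal sketch of a parser that would provide those fields
# (the option names and defaults are assumptions, not taken from the original script):
import argparse

argparser = argparse.ArgumentParser(
    description="Convert *.bdocjs documents to another format and time load/save")
argparser.add_argument("indir", help="input directory, searched recursively for *.bdocjs")
argparser.add_argument("outdir", help="output directory for the converted files")
argparser.add_argument("--infmt", default="bdocjs", help="format used to read the input files")
argparser.add_argument("--fmt", default="bdocjs", help="format/extension used to save the output files")
args = argparser.parse_args()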
def annotate(anno_list, patient_id, start_date, num_of_day,
             whether_print=True, anno_specific=False):
    input_for_anno_path = 'Clinical_Note/' + str(patient_id) + '/output_no_sid/' + str(start_date) + '_o.txt'
    # Load document and create a new anno set
    doc = Document.load(input_for_anno_path)
    annset_target = doc.annset()

    def find_pos(target_string):
        if target_string == "V/S":
            f = open(input_for_anno_path, "r")
            tmp_cnt = f.read()
            if tmp_cnt.find("V/S") != -1:
                start_pos = tmp_cnt.find("V/S") + 4
                end_pos = tmp_cnt.find("\n", start_pos)
                return start_pos, end_pos
            else:
                return 0, 0
        elif target_string == "Vital signs":
            f = open(input_for_anno_path, "r")
            tmp_cnt = f.read()
            if tmp_cnt.find("Vital signs") != -1:
                start_pos = tmp_cnt.find("Vital signs") + 14
                end_pos = tmp_cnt.find("\n", start_pos)
                return start_pos, end_pos
            else:
                return 0, 0
        elif target_string == "VS Comment":
            f = open(input_for_anno_path, "r")
            tmp_cnt = f.read()
            if tmp_cnt.find("VS Comment") != -1 or tmp_cnt.find("VS comment") != -1:
                if tmp_cnt.find("VS Comment") != -1:
                    start_pos = tmp_cnt.find("VS Comment") + 10
                    end_pos = len(tmp_cnt)
                elif tmp_cnt.find("VS comment") != -1:
                    start_pos = tmp_cnt.find("VS comment") + 10
                    end_pos = len(tmp_cnt)
                return start_pos, end_pos
            else:
                return 0, 0
        elif target_string == "P":
            # first check whether the first line starts with "S:"
            f1 = open(input_for_anno_path, "r")
            f2 = open(input_for_anno_path, "r")
            exam_cnt = f1.readline()
            tmp_cnt = f2.read()
            if exam_cnt.find("S:") != -1:
                if tmp_cnt.find("P:") != -1:
                    start_pos = tmp_cnt.find("P:") + 2
                    # sen_splir = tmp_cmt[tmp_cmt.find("P "):len(tmp_cmt)]
                    end_pos = len(tmp_cnt)
                    return start_pos, end_pos
                else:
                    return 0, 0
            elif (tmp_cnt.find("Plan") != -1) and (tmp_cnt.find("VS comment") == -1
                                                   and tmp_cnt.find("VS Comment") == -1):
                start_pos = tmp_cnt.find("Plan") + 4
                end_pos = len(tmp_cnt)
                return start_pos, end_pos
            else:
                return 0, 0
        else:
            return 0, 0

    # convert a (line, column) position into a character offset in the file
    def rowcol_to_position(line, start_pos, length):
        f = open(input_for_anno_path, "r")
        cur_pos = 0
        # cur_pos = 7
        cur_line = 1
        tmp_content = f.readline()
        # keep looping, reading in new lines
        while True:
            if tmp_content:
                if line == 1:
                    break
                elif cur_line < line:
                    cur_pos += len(tmp_content)
                    cur_line += 1
                    tmp_content = f.readline()
                else:
                    break
            else:
                break
        start_pos = start_pos + cur_pos
        end_pos = start_pos + length
        return start_pos, end_pos
    # ------ end of rowcol to position ------

    # annotate every element in the anno list
    while anno_list:
        # check whether there is only one span
        if anno_list[0][1].find(';') != -1:
            # anno_list[0][1].replace(';', ',')
            pos_start1 = int(anno_list[0][1].split(';')[0].split('/')[0])
            len1 = int(anno_list[0][1].split(';')[0].split('/')[1])
            start_pos1, end_pos1 = rowcol_to_position(int(anno_list[0][0]), pos_start1, len1)
            annset_target.add(start_pos1, end_pos1, "New Info")
            pos_start2 = int(anno_list[0][1].split(';')[1].split('/')[0])
            len2 = int(anno_list[0][1].split(';')[1].split('/')[1])
            start_pos2, end_pos2 = rowcol_to_position(int(anno_list[0][0]), pos_start2, len2)
            annset_target.add(start_pos2, end_pos2, "New Info")
        elif anno_list[0][1].find(',') == -1:
            # line = int(anno_list[0][0])
            # start position = int(anno_list[0][1].split('/')[0])
            # length = int(anno_list[0][1].split('/')[1])
            start_pos, end_pos = rowcol_to_position(
                int(anno_list[0][0]),
                int(anno_list[0][1].split('/')[0]),
                int(anno_list[0][1].split('/')[1]))
            annset_target.add(start_pos, end_pos, "New Info")
        elif (anno_list[0][1].find(',') != -1) and (anno_list[0][1].find('[') != -1):
            info_process = anno_list[0][1].replace('[', '', 2).replace(']', '', 2)
            pos_start1 = int(info_process.split(',')[0].split('/')[0])
            len1 = int(info_process.split(',')[0].split('/')[1])
            start_pos1, end_pos1 = rowcol_to_position(int(anno_list[0][0]), pos_start1, len1)
            annset_target.add(start_pos1, end_pos1, "New Info")
            pos_start2 = int(info_process.split(',')[1].split('/')[0])
            len2 = int(info_process.split(',')[1].split('/')[1])
            start_pos2, end_pos2 = rowcol_to_position(int(anno_list[0][0]), pos_start2, len2)
            annset_target.add(start_pos2, end_pos2, "New Info")
        else:  # anno_list[0][1].find(',') != -1
            pos_start1 = int(anno_list[0][1].split(',')[0].split('/')[0])
            len1 = int(anno_list[0][1].split(',')[0].split('/')[1])
            start_pos1, end_pos1 = rowcol_to_position(int(anno_list[0][0]), pos_start1, len1)
            annset_target.add(start_pos1, end_pos1, "New Info")
            pos_start2 = int(anno_list[0][1].split(',')[1].split('/')[0])
            len2 = int(anno_list[0][1].split(',')[1].split('/')[1])
            start_pos2, end_pos2 = rowcol_to_position(int(anno_list[0][0]), pos_start2, len2)
            annset_target.add(start_pos2, end_pos2, "New Info")
        del anno_list[0]

    if anno_specific:
        s, e = find_pos("P")
        if (s != 0) or (e != 0):
            annset_target.add(s, e, "New Info")
        s, e = find_pos("VS Comment")
        if (s != 0) or (e != 0):
            annset_target.add(s, e, "New Info")
        s, e = find_pos("V/S")
        if (s != 0) or (e != 0):
            annset_target.add(s, e, "New Info")
        s, e = find_pos("Vital signs")
        if (s != 0) or (e != 0):
            annset_target.add(s, e, "New Info")

    if whether_print:
        print("Annotation result:", annset_target)
    save_path = 'Clinical_Note/' + str(patient_id) + '/annotated/' + str(start_date) + '_' + str(num_of_day) + '.bdocjs'
    doc.save(save_path)
    print("annotate complete!" + str(patient_id), str(start_date), str(num_of_day))
def validate(
    patient_id,
    start_date,
    num_of_day,
):
    input_path = 'Clinical_Note/' + str(patient_id) + '/annotated/' + str(start_date) + '_' + str(num_of_day) + '.bdocjs'
    # absolute paths were used here at first, this will be changed later
    # gold standard
    # doc_gold = Document.load('/home/feng/'+str(start_date)+'_1_o.xml', fmt = "gatexml")
    doc_gold = Document.load('Gold_Standard/' + str(patient_id) + '/' + str(start_date) + '_o.xml',
                             fmt="gatexml")
    annset_gold = doc_gold.annset('').with_type('New Info')
    annset_whole_tk = doc_gold.annset('Token').with_type('Token')

    # document fully annotated by mm
    doc_mm = Document.load('Clinical_Note/' + str(patient_id) + '/annotated/' + str(start_date) + '_0' + '.bdocjs')
    annset_mm = doc_mm.annset('')
    annset_mm_mapping_tk = doc_mm.annset("mmtk")
    tmp3 = annset_mm.copy()

    # -------------- preprocessing of annset_mm ----------
    annset_mmanno_dup = doc_mm.annset("duplicate_mmanno")
    while annset_mm.size:
        annset_mmanno_first = annset_mm.first()
        annset_mm.remove(annset_mmanno_first)
        for i in annset_mm:
            while annset_mmanno_first.iswithin(i) or annset_mmanno_first.iscovering(i) \
                    or annset_mmanno_first.iscoextensive(i):
                annset_mmanno_dup.add_ann(i, i.id)
                annset_mm.remove(i)
                break
            break
    annset_mm = tmp3.copy()
    annset_mm.remove(annset_mmanno_dup)
    # ----------------------------------

    annset_mm_tk = 0
    for i in annset_mm:
        for j in annset_whole_tk:
            if i.iscovering(j) or i.iswithin(j):
                annset_mm_mapping_tk.add_ann(j)
                annset_mm_tk += 1
    tmp = annset_mm_mapping_tk.copy()

    # ------------ preprocessing of annset_mm_mapping_tk ---------------
    mm_duplicate = 0
    annset_mm_dup = doc_mm.annset("duplicate_mmtk")
    while annset_mm_mapping_tk.size:
        annset_mmtki_first = annset_mm_mapping_tk.first()
        annset_mm_mapping_tk.remove(annset_mmtki_first)
        for i in annset_mm_mapping_tk:
            while annset_mmtki_first.iswithin(i) or annset_mmtki_first.iscovering(i) \
                    or annset_mmtki_first.iscoextensive(i):
                annset_mm_dup.add_ann(i, i.id)
                annset_mm_mapping_tk.remove(i)
                break
            break
    annset_mm_mapping_tk = tmp.copy()
    annset_mm_mapping_tk.remove(annset_mm_dup)
    # ----------------------------------

    # document annotated by the system
    doc_target = Document.load(input_path)
    # annotation set produced by the system
    annset_target = doc_target.annset('')
    tmp2 = annset_target.copy()

    # ---------- preprocessing of annset_target -------------
    annset_tgano_dup = doc_target.annset("duplicate_tg_anno")
    while annset_target.size:
        annset_tg_first = annset_target.first()
        annset_target.remove(annset_tg_first)
        for i in annset_target:
            while annset_tg_first.iswithin(i) or annset_tg_first.iscovering(i) \
                    or annset_tg_first.iscoextensive(i):
                annset_tgano_dup.add_ann(i, i.id)
                annset_target.remove(i)
                break
            break
    annset_target = tmp2.copy()
    annset_target.remove(annset_tgano_dup)
    # ----------------------------------

    # total number of tokens in the text
    text_all_tk = annset_whole_tk.size

    # map the system annotation set to tokens via mm
    # count how many tokens were annotated in total (mm based)
    annset_target_tk = 0
    annset_target_tk2 = doc_target.annset("tgtk")
    for ann_t in annset_target:
        for ann_m in annset_mm_mapping_tk:
            if ann_t.iscovering(ann_m) or ann_t.iswithin(ann_m):
                annset_target_tk2.add_ann(ann_m)
                annset_target_tk += 1
    tmp4 = annset_target_tk2.copy()
    # annset_tg_tk2_orgset2 = annset_target_tk2.copy()

    # ---------- preprocessing of annset_target_tk2 ------------------
    target_duplicate = 0
    annset_tg_dup = doc_target.annset("duplicate_tg")
    while annset_target_tk2.size:
        annset_tgtk2_first = annset_target_tk2.first()
        annset_target_tk2.remove(annset_tgtk2_first)
        for i in annset_target_tk2:
            while annset_tgtk2_first.iswithin(i) or annset_tgtk2_first.iscovering(i) \
                    or annset_tgtk2_first.iscoextensive(i):
                annset_tg_dup.add_ann(i, i.id)
                annset_target_tk2.remove(i)
                break
            break
    annset_target_tk2 = tmp4.copy()
    annset_target_tk2.remove(annset_tg_dup)
    # ----------------------------------

    # map the gold standard annotation set to tokens
    # annset_gold_tk = 0
    # for i in annset_gold:
    #     for j in annset_tk:
    #         if i.iscovering(j):
    #             annset_gold_tk += 1
    annset_gold_tk = 0
    annset_gold_tk2 = doc_gold.annset("goldtk")
    for i in annset_gold:
        for j in annset_mm_mapping_tk:
            if i.iscovering(j) or i.iswithin(j):
                annset_gold_tk2.add_ann(j.copy())
                annset_gold_tk += 1
    # print("annset_gold_tk", annset_gold_tk)

    annset_score = doc_target.annset("score")
    score = 0
    # scoring section
    for i in annset_target_tk2:
        for j in annset_gold_tk2:
            if i.iscoextensive(j):
                annset_score.add_ann(i.copy())
                score += 1
    score_tk = 0
    for i in annset_score:
        for j in annset_mm_mapping_tk:
            if i.iscovering(j) or i.iswithin(j):
                score_tk += 1

    # ---------- preprocessing of annset_score ------------------
    score_duplicate = 0
    tmp5 = annset_score.copy()
    annset_sc_dup = doc_target.annset("duplicate")
    annset_score_original = annset_score.size  # total count (including duplicates)
    while annset_score.size:
        annset_score_first = annset_score.first()
        annset_score.remove(annset_score_first)
        for i in annset_score:
            while annset_score_first.iswithin(i) or annset_score_first.iscovering(i) \
                    or annset_score_first.iscoextensive(i):
                annset_sc_dup.add_ann(i, i.id)
                annset_score.remove(i)
                break
            break
    annset_score = tmp5.copy()
    annset_score.remove(annset_sc_dup)
    # annset_score_cali = annset_score_original - annset_sc_dup.size
    # ----------------------------------

    print(
        "gold standard tokens (mapped via mm)", annset_gold_tk2.size,
        "system annotations (mapped via mm, deduplicated)",
        annset_target_tk2.size - math.sqrt(annset_target_tk2.size),
        "scored tokens", annset_score_original,
        "score after calibration:", annset_score.size,
    )
    if annset_gold_tk != 0 and annset_target_tk2.size != 0:
        print(
            "Precision:",
            round((annset_score.size /
                   (annset_target_tk2.size - math.sqrt(annset_target_tk2.size)) * 100), 2),
            "Recall:",
            round((annset_score.size / annset_gold_tk2.size * 100), 2))

        # write the report
        f = open("Clinical_Note/Result/" + str(patient_id) + '_result.csv', 'a')
        str_in = str(start_date) + "," + str(num_of_day)
        f.writelines(str_in)
        if round((annset_score_original * 2 /
                  (annset_target_tk2.size - math.sqrt(annset_target_tk2.size)) * 100), 2) > 100:
            res_p = 100
        else:
            res_p = round((annset_score_original * 2 /
                           (annset_target_tk2.size - math.sqrt(annset_target_tk2.size)) * 100), 2)
        if round((annset_score.size / annset_gold_tk2.size * 100), 2) > 100:
            res_r = 100
        else:
            res_r = round((annset_score.size / annset_gold_tk2.size * 100), 2)
        str_in = "," + str(res_p) + "," + str(res_r) + '\n'
        f.writelines(str_in)
#!/usr/bin/env python
"""
Simple demo implementation for generating an HTML viewer for a bdoc document
"""
import sys
import os
import argparse
from gatenlp import Document

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", help="input bdoc document")
    parser.add_argument("outfile", help="output html file")
    parser.add_argument("--offline", action="store_true", help="Generate for offline use")
    parser.add_argument(
        "--notebook",
        action="store_true",
        help="Generate for HTML embedding in notebook",
    )
    args = parser.parse_args()
    doc = Document.load(args.infile, fmt="json")
    html = doc.save_mem(fmt="html-ann-viewer", offline=args.offline, notebook=args.notebook)
    with open(args.outfile, "wt", encoding="utf-8") as outfp:
        outfp.write(html)