Example 1
    def test_interaction01_01(self):
        @GateNlpPr
        def do_it(doc: Document, **kwargs):
            set1 = doc.annset("Set1")
            set1.add(2, 3, "test1", {"f1": "value1"})
            # return nothing

        doc1 = Document("Just a simple document")
        doc1.changelog = ChangeLog()
        mypr = gatenlp.gate_python_plugin_pr
        mypr.start({"k1": "v1"})  # set the script parms
        mypr.execute(doc1)
        assert doc1._annotation_sets is not None
        assert len(doc1._annotation_sets) == 1
        assert "Set1" in doc1._annotation_sets
        myset = doc1.annset("Set1")
        assert len(myset) == 1
        myanns = myset.start_ge(0)
        assert len(myanns) == 1
        myann = next(iter(myanns))
        assert myann is not None
        assert myann.start == 2
        assert myann.end == 3
        assert myann.type == "test1"
        # assert myann.id == 1
        assert "f1" in myann.features
        assert myann.features["f1"] == "value1"
        mychlog = doc1.changelog
        assert mychlog is not None
        assert len(mychlog) == 1
        mypr.finish()
Example 2
 def test_interaction01_01(self):
     # first: use the DefaultPr
     mypr = _pr_decorator(DefaultPr())
     doc1 = Document("Just a simple document")
     mypr.start({"k1": "v1"})  # set the script parms
     mypr.execute(doc1)
     mypr.finish()
Example 3
    def gdoc2pdoc(self, gdoc):
        """
        Convert the GATE document to a python document and return it.

        Args:
          gdoc: the handle to a GATE document

        Returns:
          a gatenlp Document instance
        """
        bjs = self.slave.getBdocJson(gdoc)
        return Document.load_mem(bjs, fmt="bdocjs")
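
A minimal usage sketch for this method (the surrounding class wraps a Java GATE slave process; the variable names gs and gdoc below are assumptions, not part of the snippet):

# Hypothetical usage: gs is an instance of the class that defines gdoc2pdoc,
# gdoc a handle to a document already loaded on the Java GATE side.
pdoc = gs.gdoc2pdoc(gdoc)  # serialize as Bdoc JSON on the Java side, parse here
print(pdoc.text)           # pdoc is a regular gatenlp Document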
Example 4
    def test01(self):
        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann")
        doc.annset().add(0, 1, "Ann")
        doc.annset().add(1, 2, "Ann")
        doc.annset().add(1, 2, "Token")
        doc.annset().add(2, 3, "Ann")
        annlist = list(doc.annset())

        ctx = Context(doc, annlist)
        parser = Ann(name="a1")
        ret = parser.parse(Location(), ctx)
        assert isinstance(ret, Success)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 2
        assert loc.ann_location == 1
        assert len(ret[0].data) == 1

        # do this with the match method
        ret = parser(doc, annlist)
        assert isinstance(ret, Success)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 2
        assert loc.ann_location == 1
        assert len(ret[0].data) == 1

        # With useoffset=False (below) the parser does NOT first advance the annotation
        # index so that the next annotation starts at or after text offset 2. It therefore
        # matches the annotation at index 1, which ends at 1, BEFORE the text offset we have now.
        assert loc == Location(2, 1)
        ret = Ann(name="tmp1", useoffset=False).parse(loc, ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc == Location(1, 2)
        assert len(ret[0].data) == 1

        # by default we do advance, so we match the last annotation and end up at text
        # position 3 looking for annotation index 5
        loc = Location(2, 1)
        ret = Ann(name="tmp1", useoffset=True).parse(loc, ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc == Location(3, 5)
        assert len(ret[0].data) == 1

        # Try to fail
        parser = Ann("Token")
        ret = parser(doc, annlist)
        assert isinstance(ret, Failure)

        # Same without a name: should generate the same locations, but no data
        parser = Ann()
        ret = parser.parse(Location(), ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 2
        assert loc.ann_location == 1
        assert len(ret[0].data) == 0

        ret = Ann().parse(loc, ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 3
        assert loc.ann_location == 5
        assert len(ret[0].data) == 0

        parser = AnnAt(name="a2")
        ret = parser.parse(Location(), ctx)
        assert len(ret) == 1
        assert len(ret[0].data) == 1

        parser = AnnAt(matchtype="all", name="a3")
        ret = parser.parse(Location(), ctx)
        assert len(ret) == 2
        assert len(ret[0].data) == 1
        assert len(ret[1].data) == 1

        # Try Rule
        parser = Ann(name="a1")
        tmp = dict(i=0)

        def rhs1(succ, **kwargs):
            tmp["i"] = 1

        rule = Call(parser, rhs1)
        ret = rule.parse(Location(), ctx)
        assert len(ret) == 1
        loc = ret[0].location
        assert loc.text_location == 2
        assert loc.ann_location == 1
        assert len(ret[0].data) == 1
        assert tmp["i"] == 1

        # use the call method instead
        def rhs2(succ, **kwargs):
            tmp["i"] = 2

        parser = Ann(name="a1").call(rhs2)
        ret = parser.parse(Location(), ctx)
        print(ret)
        assert tmp["i"] == 2

        parser = Find(AnnAt(type="Token", name="at"), by_anns=False)
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Find(AnnAt(type="Token", name="at"), by_anns=True)
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Find(Text("document", name="t1"), by_anns=False)
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Seq(Ann("Ann", name="a1"),
                     Ann("Ann", name="a2"),
                     matchtype="longest")
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = N(AnnAt("Ann", name="a1"), 1, 5, matchtype="first")
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Or(Ann("X", name="x1"), Ann("Ann", name="a1"))
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Ann("X", name="x1") | Ann("Y", name="y1") | Ann("Ann",
                                                                 name="a1")
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Ann("Ann", name="a1") >> Ann("Ann", name="a2")
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Ann("Ann", name="a1") * 2
        ret = parser.parse(Location(), ctx)
        print(ret)

        parser = Ann("Ann", name="a1") * (1, 3)
        ret = parser.parse(Location(), ctx)
        print(ret)
Example 5
    def test05(self):
        # Rules and Pampac

        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann1")  # 0
        doc.annset().add(2, 4, "Ann2")  # 1
        doc.annset().add(3, 5, "Ann2")  # 2
        doc.annset().add(4, 5, "Ann2")  # 3
        doc.annset().add(8, 10, "Ann2")  # 4
        annset = doc.annset()
        orig_len = len(annset)
        annlist = list(doc.annset())

        # first make sure the pattern works as we want
        ctx = Context(doc=doc, anns=annlist)
        pat1 = AnnAt("Ann2", name="a1") >> AnnAt("Ann2", name="a2")
        loc = ctx.inc_location(Location(0, 0), by_offset=1)
        ret = pat1.parse(location=loc, context=ctx)

        def r1_action(succ, context=None, **kwargs):
            span = succ[0].span
            ann = succ.context.outset.add(span.start, span.end, "NEW")
            return ann

        r1 = Rule(AnnAt("Ann2") >> AnnAt("Ann2"), r1_action)
        pampac = Pampac(r1)
        pampac.set_skip = "longest"
        pampac.set_select = "first"
        outset = doc.annset()
        ret = pampac.run(doc, annlist, outset=outset, debug=True)
        assert len(ret) == 1
        assert len(ret[0]) == 2
        idx, retlist = ret[0]
        assert idx == 1
        assert len(retlist) == 1
        a = retlist[0]
        assert isinstance(a, Annotation)
        assert a.start == 2
        assert a.end == 5
        assert a.type == "NEW"
        assert len(outset) == orig_len + 1
Example 6
    def test04(self):
        # Test multiple result matches with N, with and without the until clause

        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann")  # 0
        doc.annset().add(0, 2, "Ann")  # 1
        doc.annset().add(0, 2, "Token")  # 2
        doc.annset().add(2, 4, "Ann")  # 3
        doc.annset().add(2, 4, "Ann")  # 4
        doc.annset().add(4, 6, "Ann")  # 5
        doc.annset().add(4, 6, "Ann")  # 6
        doc.annset().add(4, 6, "Person")  # 7
        doc.annset().add(6, 8, "Ann")  # 8
        doc.annset().add(6, 8, "Ann")  # 9
        doc.annset().add(8, 10, "XXXX")  # 10
        annlist = list(doc.annset())

        # multiple Anns, single result from N: first
        # This should find 0,3,5
        ret = N(
            AnnAt("Ann", name="a1", matchtype="all"),
            min=2,
            max=3,
            select="all",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 3
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5

        # multiple Anns, all results from N
        # should return 0,1
        ret = N(
            AnnAt("Ann", name="a1", matchtype="all"),
            min=1,
            max=1,
            select="all",
            matchtype="all",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 2
        assert len(ret[0].data) == 1
        assert len(ret[1].data) == 1
        assert ret[0].data[0]["ann"].id == 0
        assert ret[1].data[0]["ann"].id == 1

        # multiple Anns, all results from N
        ret = N(
            AnnAt("Ann", name="a1", matchtype="all"),
            min=1,
            max=2,
            select="all",
            matchtype="all",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 4
        assert len(ret[0].data) == 2
        assert len(ret[1].data) == 2
        assert len(ret[2].data) == 2
        assert len(ret[3].data) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[1].data[0]["ann"].id == 0
        assert ret[1].data[1]["ann"].id == 4
        assert ret[2].data[0]["ann"].id == 1
        assert ret[2].data[1]["ann"].id == 3
        assert ret[3].data[0]["ann"].id == 1
        assert ret[3].data[1]["ann"].id == 4

        # multiple Anns, all results from N
        # just three for the first ann: 0,1,2
        ret = N(
            AnnAt(name="a1", matchtype="all"),
            min=1,
            max=1,
            select="all",
            matchtype="all",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 3
        assert len(ret[0].data) == 1
        assert len(ret[1].data) == 1
        assert len(ret[2].data) == 1
        assert ret[0].data[0]["ann"].id == 0
        assert ret[1].data[0]["ann"].id == 1
        assert ret[2].data[0]["ann"].id == 2

        # This should just find the Token as the first and only match!
        ret = N(AnnAt("Ann", name="a1", matchtype="all"),
                until=AnnAt("Token", name="t", matchtype="first"),
                min=0,
                max=3,
                select="all",
                matchtype="all").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 1
        assert ret[0].data[0]["ann"].id == 2

        # This should terminate with Person and find all paths that can lead up to Person:
        # 0,3 0,4 1,3 1,4
        ret = N(AnnAt("Ann", name="a1", matchtype="all"),
                until=AnnAt("Person", name="t", matchtype="first"),
                min=1,
                max=3,
                select="all",
                matchtype="all").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 4
        assert len(ret[0].data) == 3
        assert len(ret[1].data) == 3
        assert len(ret[2].data) == 3
        assert len(ret[3].data) == 3
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 7
        assert ret[1].data[0]["ann"].id == 0
        assert ret[1].data[1]["ann"].id == 4
        assert ret[1].data[2]["ann"].id == 7
        assert ret[2].data[0]["ann"].id == 1
        assert ret[2].data[1]["ann"].id == 3
        assert ret[2].data[2]["ann"].id == 7
        assert ret[3].data[0]["ann"].id == 1
        assert ret[3].data[1]["ann"].id == 4
        assert ret[3].data[2]["ann"].id == 7
Example 7
    def test03(self):
        # Test single result matches with N, with and without the until clause

        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann")  # 0
        doc.annset().add(0, 2, "Ann")  # 1
        doc.annset().add(0, 2, "Token")  # 2
        doc.annset().add(2, 4, "Ann")  # 3
        doc.annset().add(2, 4, "Ann")  # 4
        doc.annset().add(4, 6, "Ann")  # 5
        doc.annset().add(4, 6, "Ann")  # 6
        doc.annset().add(4, 6, "Person")  # 7
        doc.annset().add(6, 8, "Ann")  # 8
        doc.annset().add(6, 8, "Ann")  # 9
        doc.annset().add(8, 10, "XXXX")  # 10
        annlist = list(doc.annset())

        # single Ann, single result from N
        # this should return annotation ids 0, 3, 5
        ret = N(
            AnnAt("Ann", name="a1", matchtype="first"),
            min=2,
            max=3,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 3
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5

        # Same as before, but with a name, so we should get one additional data entry for the
        # whole sequence, with a span
        ret = N(AnnAt("Ann", name="a1", matchtype="first"),
                min=2,
                max=3,
                select="first",
                matchtype="first",
                name="n1").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 4
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5
        assert ret[0].data[3]["span"] == Span(0, 6)

        # single Ann, single result from N
        # this should return annotation ids 0, 3, 5, 8
        ret = N(
            AnnAt("Ann", name="a1", matchtype="first"),
            min=2,
            max=99,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 4
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5
        assert ret[0].data[3]["ann"].id == 8

        # single Ann, single result from N, with early stopping at Person
        # this should return annotation ids 0, 3, 7
        ret = N(
            AnnAt("Ann", name="a1", matchtype="first"),
            until=AnnAt("Person", name="p"),
            min=2,
            max=99,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 3
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 7

        # Try a match with min=0 and max=99 that does not succeed
        # single Ann, single result from N
        # this should return an empty list for data
        ret = N(
            AnnAt("NotThere", name="a1", matchtype="first"),
            min=0,
            max=99,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 0

        # Same match with min=0 and max=99, but with a type that does occur
        # single Ann, single result from N
        # this should return annotation ids 0, 3, 5, 8
        ret = N(
            AnnAt("Ann", name="a1", matchtype="first"),
            min=0,
            max=99,
            select="first",
            matchtype="first",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 4
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[0].data[2]["ann"].id == 5
        assert ret[0].data[3]["ann"].id == 8
Example 8
    def test02(self):
        # Test multiple result matches

        doc = Document("Some test document")
        doc.annset().add(0, 2, "Ann")  # 0
        doc.annset().add(0, 2, "Ann")  # 1
        doc.annset().add(0, 2, "Token")  # 2
        doc.annset().add(2, 4, "Ann")  # 3
        doc.annset().add(2, 4, "Ann")  # 4
        annlist = list(doc.annset())

        # match all annotations at the document start
        ret = AnnAt(matchtype="all").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 3

        # match sequence Token/Ann, take first at each point
        # this should match annotation ids 2 and 3
        ret = Seq(AnnAt("Token", name="1"),
                  AnnAt("Ann", name="2")).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 2
        assert ret[0].data[0]["ann"].id == 2
        assert ret[0].data[1]["ann"].id == 3

        # match sequence Ann/Ann, take first at each point
        ret = Seq(AnnAt("Ann", name="1"), AnnAt("Ann",
                                                name="2")).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3

        # match sequence Ann/Ann, take first at each point, setting useoffset=False so we do
        # not skip to the end offset of the previous annotation before matching the next one.
        # In that case the next ann we match is the second one at offset 0
        ret = Seq(AnnAt("Ann", name="1"),
                  AnnAt("Ann", name="2", useoffset=False)).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 1
        assert len(ret[0].data) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 1

        # Make sure we get the correct set of annotations at positions 0 and 2
        ret = AnnAt("Ann", name="a", matchtype="all").match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[1].data[0]["ann"].id == 1
        # ret.pprint()
        ret = AnnAt("Ann", name="a",
                    matchtype="all").match(doc,
                                           annlist,
                                           location=Location(2, 2))
        assert ret.issuccess()
        assert len(ret) == 2
        assert ret[0].data[0]["ann"].id == 3
        assert ret[1].data[0]["ann"].id == 4
        # ret.pprint()

        # Match sequence of two anns in order, take all results
        ret = Seq(
            AnnAt("Ann", name="1", matchtype="all"),
            AnnAt("Ann", name="2", matchtype="all"),
            select="all",
            matchtype="all",
        ).match(doc, annlist)
        assert ret.issuccess()
        assert len(ret) == 4
        assert len(ret[0].data) == 2
        assert len(ret[1].data) == 2
        assert len(ret[2].data) == 2
        assert len(ret[3].data) == 2
        assert ret[0].data[0]["ann"].id == 0
        assert ret[0].data[1]["ann"].id == 3
        assert ret[1].data[0]["ann"].id == 0
        assert ret[1].data[1]["ann"].id == 4
        assert ret[2].data[0]["ann"].id == 1
        assert ret[2].data[1]["ann"].id == 3
        assert ret[3].data[0]["ann"].id == 1
        assert ret[3].data[1]["ann"].id == 4
Example 9
def stanfordnlp2gatenlp(
    stanfordnlpdoc,
    gatenlpdoc=None,
    setname="",
    word_type="Word",
    sentence_type="Sentence",
):
    """Convert a StanfordNLP document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the StanfordNLP document to it. In this case the
    original gatenlpdoc is used and gets modified.

    Args:
      stanfordnlpdoc: a StanfordNLP document
      gatenlpdoc: if None, a new gatenlp document is created otherwise this
    document is added to. (Default value = None)
      setname: the annotation set name to which the annotations get added, empty string
    for the default annotation set.
      word_type: the annotation type to use for word annotations (Default value = "Word")
      sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")

    Returns:
      the new or modified gatenlp document

    """
    if gatenlpdoc is None:
        retdoc = Document(stanfordnlpdoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    # stanford nlp processes text in sentence chunks, so we do everything per sentence
    # NOTE: the stanford elements do not contain any text offsets, so we have to match and find
    # them ourselves. For this we keep an index to the first character in the text which has
    # not yet been matched
    notmatchedidx = 0
    for sent in stanfordnlpdoc.sentences:
        # a sentence is a list of tokens and a list of words; some tokens consist of several words.
        # dependency parses are over words, so we create Word annotations and, for now, only
        # set the features on the Word annotations.
        offsetinfos = utils.match_substrings(
            stanfordnlpdoc.text[notmatchedidx:],
            sent.words,
            getstr=lambda x: x.text)
        idx2annid = {}
        for oinfo in offsetinfos:
            word = oinfo[2]
            fm = {
                "string": word.text,
                "lemma": word.lemma,
                "upos": word.upos,
                "xpos": word.xpos,
                "dependency_relation": word.dependency_relation,
                "governor": int(word.governor),
            }
            for feat in word.feats.split("|"):
                if feat and feat != "_":
                    k, v = feat.split("=")
                    # TODO: maybe try to detect and convert bool/int values
                    fm["feat_" + k] = v
            snlp_idx = int(word.index)
            annid = annset.add(oinfo[0] + notmatchedidx,
                               oinfo[1] + notmatchedidx, word_type, fm).id
            idx2annid[snlp_idx] = annid
        # create a sentence annotation from beginning of first word to end of last
        sentid = annset.add(
            offsetinfos[0][0] + notmatchedidx,
            offsetinfos[-1][1] + notmatchedidx,
            sentence_type,
        ).id
        # now replace the governor index with the corresponding annid; governor index 0 is
        # mapped to the sentence annotation
        idx2annid[0] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            gov = ann.features.get("governor")
            if gov is not None:
                ann.features["governor"] = idx2annid[gov]
        notmatchedidx = offsetinfos[-1][1] + notmatchedidx + 1
    return retdoc
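
A minimal usage sketch, assuming the (now superseded) stanfordnlp package and its English models are installed; stanfordnlp.Pipeline() is the package's standard entry point, everything else mirrors the converter above:

import stanfordnlp

nlp = stanfordnlp.Pipeline()  # assumes stanfordnlp.download("en") was run beforehand
sdoc = nlp("Barack Obama was born in Hawaii.")
gdoc = stanfordnlp2gatenlp(sdoc, setname="stanfordnlp")
# Word and Sentence annotations now live in the "stanfordnlp" set
print(gdoc.annset("stanfordnlp"))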
Example 10
 def do_it(doc: Document, **kwargs):
     set1 = doc.annset("Set1")
     set1.add(2, 3, "test1", {"f1": "value1"})
Example 11
def stanza2gatenlp(
    stanzadoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    sentence_type="Sentence",
    add_entities=True,
    ent_prefix=None,
):
    """
    Convert a Stanford Stanza document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the Stanford Stanza document to it. In this case the
    original gatenlpdoc is used and gets modified.

    Args:
      stanzadoc: a Stanford Stanza document
      gatenlpdoc: if None, a new gatenlp document is created otherwise this
         document is added to. (Default value = None)
      setname: the annotation set name to which the annotations get added, empty string
         for the default annotation set.
      token_type: the annotation type to use for tokens, if needed (Default value = "Token")
      sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
      add_entities: if True, add any entities as well (Default value = True)
      ent_prefix: if None, use the original entity type as annotation type, otherwise add the given string
    to the annotation type as a prefix. (Default value = None)

    Returns:
      the new or modified gatenlp document

    """
    if gatenlpdoc is None:
        retdoc = Document(stanzadoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    # stanford nlp processes text in sentence chunks, so we do everything per sentence
    notmatchedidx = 0
    for sent in stanzadoc.sentences:
        # go through the tokens: in stanza, each token is a list of dicts, normally there is one dict
        # which also has the offset information in "misc", but for multiword tokens, there seems to be
        # one "header" dict for the range of words which has the offset info and NER label and then
        # one additional element per word which has all the rest.
        # For our purposes we create a list of dicts where for normal tokens we just copy the element, but for
        # multiword tokens we copy over something that has fake offsets and all the features
        newtokens = []
        for t in sent.tokens:
            t = t.to_dict()
            if len(t) == 1:
                newtokens.append(tok2tok(t[0]))
            else:
                tokinfo = tok2tok(t[0])
                words = t[1:]
                fm = tokinfo.get("fm")
                ner = fm.get("ner")
                text = fm.get("text")
                start = tokinfo["start"]
                end = tokinfo["end"]
                for i, w in enumerate(words):
                    tok = tok2tok(w)
                    tok["fm"]["ner"] = ner
                    tok["fm"]["token_text"] = text
                    os = min(start + i, end - 1)
                    tok["start"] = os
                    if i == len(words) - 1:
                        tok["end"] = end
                    else:
                        tok["end"] = os + 1
                    newtokens.append(tok)
        # print(f"\n!!!!!!DEBUG: newtokens={newtokens}")
        # now go through the new token list and create annotations
        idx2annid = {}  # map stanza word id to annotation id
        starts = []
        ends = []
        for t in newtokens:
            start = t["start"]
            end = t["end"]
            stanzaid = t["id"]
            starts.append(start)
            ends.append(end)
            annid = annset.add(start, end, token_type, features=t["fm"]).id
            idx2annid[str(stanzaid)] = annid
        # print(f"\n!!!!!!DEBUG: idx2annid={idx2annid}")
        # create a sentence annotation from beginning of first word to end of last
        sentid = annset.add(starts[0], ends[-1], sentence_type).id
        # now replace the head index with the corresponding annid, the head index "0" is
        # mapped to the sentence annotation
        idx2annid["0"] = sentid
        for annid in list(idx2annid.values()):
            ann = annset.get(annid)
            hd = ann.features.get("head")
            if hd is not None:
                hd = str(hd)
                headId = idx2annid.get(hd)
                if headId is None:
                    logger.error(
                        f"Could not find head id: {hd} for {ann} in document {gatenlpdoc.name}"
                    )
                else:
                    ann.features["head"] = idx2annid[hd]

    # add the entities
    if add_entities:
        for e in stanzadoc.entities:
            if ent_prefix:
                anntype = ent_prefix + e.type
            else:
                anntype = e.type
            annset.add(e.start_char, e.end_char, anntype)
    return retdoc
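
A minimal usage sketch, assuming the stanza package and its English models are installed:

import stanza

nlp = stanza.Pipeline("en")  # assumes stanza.download("en") was run beforehand
gdoc = stanza2gatenlp(nlp("Barack Obama was born in Hawaii."), setname="stanza")
# Token, Sentence and (optionally prefixed) entity annotations are in the "stanza" set
print(gdoc.annset("stanza"))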
Example 12
def spacy2gatenlp(
    spacydoc,
    gatenlpdoc=None,
    setname="",
    token_type="Token",
    spacetoken_type="SpaceToken",
    sentence_type="Sentence",
    nounchunk_type="NounChunk",
    add_tokens=True,
    # add_spacetokens=True, # not sure how to do this yet
    add_ents=True,
    add_sents=True,
    add_nounchunks=True,
    add_dep=True,
    ent_prefix=None,
):
    """Convert a spacy document to a gatenlp document. If a gatenlp document is already
    provided, add the annotations from the spacy document to it. In this case the
    original gatenlpdoc is used and gets modified.

    Args:
      spacydoc: a spacy document
      gatenlpdoc: if None, a new gatenlp document is created otherwise this
    document is added to. (Default value = None)
      setname: the annotation set name to which the annotations get added, empty string
    for the default annotation set.
      token_type: the annotation type to use for tokens (Default value = "Token")
      spacetoken_type: the annotation type to use for space tokens (Default value = "SpaceToken")
      sentence_type: the annotation type to use for sentence annotations (Default value = "Sentence")
      nounchunk_type: the annotation type to use for noun chunk annotations (Default value = "NounChunk")
      add_tokens: should annotations for tokens get added? If not, dependency parser
    info cannot be added either. (Default value = True)
      add_ents: should annotations for entities get added (Default value = True)
      add_sents: should sentence annotations get added (Default value = True)
      add_nounchunks: should noun chunk annotations get added (Default value = True)
      add_dep: should dependency parser information get added (Default value = True)
      ent_prefix: if None, use the original entity type as annotation type, otherwise add the given string
    to the annotation type as a prefix. (Default value = None)

    Returns:
      the new or modified gatenlp document

    """
    if gatenlpdoc is None:
        retdoc = Document(spacydoc.text)
    else:
        retdoc = gatenlpdoc
    toki2annid = {}
    annset = retdoc.annset(setname)
    for tok in spacydoc:
        from_off = tok.idx
        to_off = tok.idx + len(tok)
        is_space = tok.is_space
        fm = {
            "_i": tok.i,
            "is_alpha": tok.is_alpha,
            "is_bracket": tok.is_bracket,
            "is_currency": tok.is_currency,
            "is_digit": tok.is_digit,
            "is_left_punct": tok.is_left_punct,
            "is_lower": tok.is_lower,
            "is_oov": tok.is_oov,
            "is_punct": tok.is_punct,
            "is_quote": tok.is_quote,
            "is_right_punct": tok.is_right_punct,
            "is_sent_start": tok.is_sent_start,
            "is_space": tok.is_space,
            "is_stop": tok.is_stop,
            "is_title": tok.is_title,
            "is_upper": tok.is_upper,
            "lang": tok.lang_,
            "lemma": tok.lemma_,
            "like_email": tok.like_email,
            "like_num": tok.like_num,
            "like_url": tok.like_url,
            "orth": tok.orth,
            "pos": tok.pos_,
            "prefix": tok.prefix_,
            "prob": tok.prob,
            "rank": tok.rank,
            "sentiment": tok.sentiment,
            "tag": tok.tag_,
            "shape": tok.shape_,
            "suffix": tok.suffix_,
        }
        if spacydoc.is_nered and add_ents:
            fm["ent_type"] = tok.ent_type_
        if spacydoc.is_parsed and add_dep:
            fm["dep"] = tok.dep_
        if tok.is_space:
            anntype = spacetoken_type
        else:
            anntype = token_type
        annid = annset.add(from_off, to_off, anntype, fm).id
        toki2annid[tok.i] = annid
        # print("Added annotation with id: {} for token {}".format(annid, tok.i))
        ws = tok.whitespace_
        if len(ws) > 0:
            annset.add(to_off, to_off + len(ws), spacetoken_type, {"is_space": True})
    # if we have a dependency parse, now also add the parse edges
    if spacydoc.is_parsed and add_tokens and add_dep:
        for tok in spacydoc:
            ann = annset.get(toki2annid[tok.i])
            ann.features["head"] = toki2annid[tok.head.i]
            ann.features["left_edge"] = toki2annid[tok.left_edge.i]
            ann.features["right_edge"] = toki2annid[tok.right_edge.i]
    if spacydoc.ents and add_ents:
        for ent in spacydoc.ents:
            if ent_prefix:
                entname = ent_prefix + ent.label_
            else:
                entname = ent.label_
            annset.add(ent.start_char, ent.end_char, entname, {"lemma": ent.lemma_})
    if spacydoc.sents and add_sents:
        for sent in spacydoc.sents:
            annset.add(sent.start_char, sent.end_char, sentence_type, {})
    if spacydoc.noun_chunks and add_nounchunks:
        for chunk in spacydoc.noun_chunks:
            annset.add(chunk.start_char, chunk.end_char, nounchunk_type, {})
    return retdoc
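
A minimal usage sketch; note the converter reads spaCy 2.x attributes such as is_nered and is_parsed, so this assumes a spaCy 2.x pipeline with the small English model installed:

import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the model package is installed
gdoc = spacy2gatenlp(nlp("Barack Obama was born in Hawaii."), setname="spacy")
# Token, SpaceToken, Sentence, NounChunk and entity annotations are in the "spacy" set
print(gdoc.annset("spacy"))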
Example 13
    if not os.path.exists(args.indir):
        raise Exception("Does not exist: {}".format(args.indir))
    if not os.path.exists(args.outdir):
        raise Exception("Does not exist: {}".format(args.outdir))

    gen = Path(args.indir).rglob("*.bdocjs")

    total_readorig = 0
    total_save = 0
    total_read = 0
    newfiles = []
    for f in gen:
        relpath = str(f)
        start = time.time()
        doc = Document.load(relpath, fmt=args.infmt)
        total_readorig += time.time() - start
        relpath = relpath.replace(os.path.sep, "_")
        relpath = relpath.replace(".bdocjs", args.fmt)
        newfile = os.path.join(args.outdir, relpath)
        newfiles.append(newfile)
        start = time.time()
        doc.save(newfile, fmt=args.fmt)
        total_save += time.time() - start

    for f in newfiles:
        start = time.time()
        doc = Document.load(f, fmt=args.fmt)
        total_read += time.time() - start

    n = len(newfiles)
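
The snippet ends after computing n without reporting the timings it collected; a plausible continuation (an assumption, not part of the original) would print the averages, guarding against an empty input directory:

    # Assumed continuation: report average per-file timings collected above.
    if n > 0:
        print("Files converted:", n)
        print("Avg read original: {:.4f}s".format(total_readorig / n))
        print("Avg save as {}: {:.4f}s".format(args.fmt, total_save / n))
        print("Avg re-read as {}: {:.4f}s".format(args.fmt, total_read / n))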
Example 14
def annotate(anno_list,
             patient_id,
             start_date,
             num_of_day,
             whether_print=True,
             anno_specific=False):
    input_for_anno_path = 'Clinical_Note/' + str(
        patient_id) + '/output_no_sid/' + str(start_date) + '_o.txt'
    # Load document and create a new anno set
    doc = Document.load(input_for_anno_path)
    annset_target = doc.annset()

    def find_pos(target_string):

        if target_string == "V/S":
            f = open(input_for_anno_path, "r")
            tmp_cnt = f.read()
            if tmp_cnt.find("V/S") is not -1:

                start_pos = tmp_cnt.find("V/S") + 4
                end_pos = tmp_cnt.find("\n", start_pos)
                return start_pos, end_pos
            else:
                return 0, 0

        elif target_string == "Vital signs":
            f = open(input_for_anno_path, "r")
            tmp_cnt = f.read()
            if tmp_cnt.find("Vital signs") is not -1:
                start_pos = tmp_cnt.find("Vital signs") + 14
                end_pos = tmp_cnt.find("\n", start_pos)
                return start_pos, end_pos
            else:
                return 0, 0

        elif target_string == "VS Comment":
            f = open(input_for_anno_path, "r")
            tmp_cnt = f.read()
            if tmp_cnt.find("VS Comment") is not -1 or tmp_cnt.find(
                    "VS comment") is not -1:
                if tmp_cnt.find("VS Comment") is not -1:
                    start_pos = tmp_cnt.find("VS Comment") + 10
                    end_pos = len(tmp_cnt)
                elif tmp_cnt.find("VS comment") is not -1:
                    start_pos = tmp_cnt.find("VS comment") + 10
                    end_pos = len(tmp_cnt)
                return start_pos, end_pos
            else:
                return 0, 0
        elif target_string == "P":
            # first check whether the first line starts with "S:"
            f1 = open(input_for_anno_path, "r")
            f2 = open(input_for_anno_path, "r")
            exam_cnt = f1.readline()
            tmp_cnt = f2.read()
            if exam_cnt.find("S:") is not -1:
                if tmp_cnt.find("P:") is not -1:
                    start_pos = tmp_cnt.find("P:") + 2
                    # sen_splir = tmp_cmt[tmp_cmt.find("P "):len(tmp_cmt)]
                    end_pos = len(tmp_cnt)
                    return start_pos, end_pos
                else:
                    return 0, 0
            elif (tmp_cnt.find("Plan")
                  is not -1) and (tmp_cnt.find("VS comment") is -1
                                  and tmp_cnt.find("VS Comment") is -1):
                start_pos = tmp_cnt.find("Plan") + 4
                end_pos = len(tmp_cnt)
                return start_pos, end_pos

            else:
                return 0, 0

        else:
            return 0, 0

    # convert (line, column) positions into absolute character offsets
    def rowcol_to_position(line, start_pos, length):
        f = open(input_for_anno_path, "r")
        cur_pos = 0
        # cur_pos = 7
        cur_line = 1
        tmp_content = f.readline()

        # keep looping, reading in one new line per iteration
        while True:
            if tmp_content:

                if (line == 1):
                    break

                elif (cur_line < line):
                    cur_pos += len(tmp_content)
                    cur_line += 1
                    tmp_content = f.readline()

                else:
                    break
            else:
                break
        start_pos = start_pos + cur_pos
        end_pos = start_pos + length
        return start_pos, end_pos

    # ------ end of rowcol_to_position ------

    # annotate every element in anno_list
    while anno_list:
        # check whether the entry contains more than one span
        if anno_list[0][1].find(';') != -1:

            # anno_list[0][1].replace(';', ',')
            pos_start1 = int(anno_list[0][1].split(';')[0].split('/')[0])
            len1 = int(anno_list[0][1].split(';')[0].split('/')[1])
            start_pos1, end_pos1 = rowcol_to_position(int(anno_list[0][0]),
                                                      pos_start1, len1)
            annset_target.add(start_pos1, end_pos1, "New Info")

            pos_start2 = int(anno_list[0][1].split(';')[1].split('/')[0])
            len2 = int(anno_list[0][1].split(';')[1].split('/')[1])
            start_pos2, end_pos2 = rowcol_to_position(int(anno_list[0][0]),
                                                      pos_start2, len2)
            annset_target.add(start_pos2, end_pos2, "New Info")

        elif anno_list[0][1].find(',') == -1:
            # line = int(anno_list[0][0])
            # start position = int(anno_list[0][1].split('/')[0])
            # varying length = int(anno_list[0][1].split('/')[1])
            start_pos, end_pos = rowcol_to_position(
                int(anno_list[0][0]), int(anno_list[0][1].split('/')[0]),
                int(anno_list[0][1].split('/')[1]))
            annset_target.add(start_pos, end_pos, "New Info")
        elif (anno_list[0][1].find(',')
              != -1) and (anno_list[0][1].find('[') != -1):
            info_process = anno_list[0][1].replace('[', '',
                                                   2).replace(']', '', 2)
            pos_start1 = int(info_process.split(',')[0].split('/')[0])
            len1 = int(info_process.split(',')[0].split('/')[1])
            start_pos1, end_pos1 = rowcol_to_position(int(anno_list[0][0]),
                                                      pos_start1, len1)
            annset_target.add(start_pos1, end_pos1, "New Info")

            pos_start2 = int(info_process.split(',')[1].split('/')[0])
            len2 = int(info_process.split(',')[1].split('/')[1])
            start_pos2, end_pos2 = rowcol_to_position(int(anno_list[0][0]),
                                                      pos_start2, len2)
            annset_target.add(start_pos2, end_pos2, "New Info")

        else:  # (anno_list[0][1].find(',') != -1):
            pos_start1 = int(anno_list[0][1].split(',')[0].split('/')[0])
            len1 = int(anno_list[0][1].split(',')[0].split('/')[1])
            start_pos1, end_pos1 = rowcol_to_position(int(anno_list[0][0]),
                                                      pos_start1, len1)
            annset_target.add(start_pos1, end_pos1, "New Info")

            pos_start2 = int(anno_list[0][1].split(',')[1].split('/')[0])
            len2 = int(anno_list[0][1].split(',')[1].split('/')[1])
            start_pos2, end_pos2 = rowcol_to_position(int(anno_list[0][0]),
                                                      pos_start2, len2)
            annset_target.add(start_pos2, end_pos2, "New Info")

        del anno_list[0]
    if anno_specific:
        s, e = find_pos("P")
        if (s != 0) or (e != 0):
            annset_target.add(s, e, "New Info")
        s, e = find_pos("VS Comment")
        if (s != 0) or (e != 0):
            annset_target.add(s, e, "New Info")
        s, e = find_pos("V/S")
        if (s != 0) or (e != 0):
            annset_target.add(s, e, "New Info")
        s, e = find_pos("Vital signs")
        if (s != 0) or (e != 0):
            annset_target.add(s, e, "New Info")

    if whether_print:
        print("Annotation result:", annset_target)
    save_path = 'Clinical_Note/' + str(patient_id) + '/annotated/' + str(
        start_date) + '_' + str(num_of_day) + '.bdocjs'
    doc.save(save_path)
    print("annotate complete!" + str(patient_id), str(start_date),
          str(num_of_day))
Example 15
def validate(
    patient_id,
    start_date,
    num_of_day,
):
    input_path = 'Clinical_Note/' + str(patient_id) + '/annotated/' + str(
        start_date) + '_' + str(num_of_day) + '.bdocjs'
    # an absolute path is used here for now; this will be changed later
    # gold standard
    # doc_gold = Document.load('/home/feng/'+str(start_date)+'_1_o.xml', fmt = "gatexml")
    doc_gold = Document.load('Gold_Standard/' + str(patient_id) + '/' +
                             str(start_date) + '_o.xml',
                             fmt="gatexml")

    annset_gold = doc_gold.annset('').with_type('New Info')
    annset_whole_tk = doc_gold.annset('Token').with_type('Token')

    # mm full annotation
    doc_mm = Document.load('Clinical_Note/' + str(patient_id) + '/annotated/' +
                           str(start_date) + '_0' + '.bdocjs')
    annset_mm = doc_mm.annset('')

    annset_mm_mapping_tk = doc_mm.annset("mmtk")
    tmp3 = annset_mm.copy()
    # -------------- annset_mm preprocessing ----------

    annset_mmanno_dup = doc_mm.annset("duplicate_mmanno")
    while annset_mm.size:
        annset_mmanno_first = annset_mm.first()
        annset_mm.remove(annset_mmanno_first)
        for i in annset_mm:
            while annset_mmanno_first.iswithin(
                    i) or annset_mmanno_first.iscovering(
                        i) or annset_mmanno_first.iscoextensive(i):
                annset_mmanno_dup.add_ann(i, i.id)
                annset_mm.remove(i)
                break
            break
    annset_mm = tmp3.copy()
    annset_mm.remove(annset_mmanno_dup)
    # ----------------------------------

    annset_mm_tk = 0

    for i in annset_mm:
        for j in annset_whole_tk:
            if i.iscovering(j) or i.iswithin(j):
                annset_mm_mapping_tk.add_ann(j)
                annset_mm_tk += 1
    tmp = annset_mm_mapping_tk.copy()
    # ------------ annset_mm_mapping_tk preprocessing ---------------
    mm_duplicate = 0
    annset_mm_dup = doc_mm.annset("duplicate_mmtk")
    while annset_mm_mapping_tk.size:
        annset_mmtki_first = annset_mm_mapping_tk.first()
        annset_mm_mapping_tk.remove(annset_mmtki_first)
        for i in annset_mm_mapping_tk:
            while annset_mmtki_first.iswithin(
                    i) or annset_mmtki_first.iscovering(
                        i) or annset_mmtki_first.iscoextensive(i):
                annset_mm_dup.add_ann(i, i.id)
                annset_mm_mapping_tk.remove(i)
                break
            break
    annset_mm_mapping_tk = tmp.copy()
    annset_mm_mapping_tk.remove(annset_mm_dup)
    # ----------------------------------

    # annotated by the system
    doc_target = Document.load(input_path)
    # the system-annotated annotation set
    annset_target = doc_target.annset('')
    tmp2 = annset_target.copy()
    # ---------- annset_target preprocessing -------------

    annset_tgano_dup = doc_target.annset("duplicate_tg_anno")
    while annset_target.size:
        annset_tg_first = annset_target.first()
        annset_target.remove(annset_tg_first)
        for i in annset_target:
            while annset_tg_first.iswithin(i) or annset_tg_first.iscovering(
                    i) or annset_tg_first.iscoextensive(i):
                annset_tgano_dup.add_ann(i, i.id)
                annset_target.remove(i)
                break
            break

    annset_target = tmp2.copy()

    annset_target.remove(annset_tgano_dup)
    # ----------------------------------

    # total number of tokens in the text
    text_all_tk = annset_whole_tk.size

    # convert the system-annotated annotation set to tokens via mm
    # and count how many tokens were annotated (mm-based)
    annset_target_tk = 0
    annset_target_tk2 = doc_target.annset("tgtk")

    for ann_t in annset_target:
        for ann_m in annset_mm_mapping_tk:
            if ann_t.iscovering(ann_m) or ann_t.iswithin(ann_m):
                annset_target_tk2.add_ann(ann_m)
                annset_target_tk += 1

    tmp4 = annset_target_tk2.copy()
    # annset_tg_tk2_orgset2 = annset_target_tk2.copy()
    # ---------- annset_target_tk2 preprocessing ------------------
    target_duplicate = 0
    annset_tg_dup = doc_target.annset("duplicate_tg")

    while annset_target_tk2.size:
        annset_tgtk2_first = annset_target_tk2.first()
        annset_target_tk2.remove(annset_tgtk2_first)
        for i in annset_target_tk2:
            while annset_tgtk2_first.iswithin(
                    i) or annset_tgtk2_first.iscovering(
                        i) or annset_tgtk2_first.iscoextensive(i):
                annset_tg_dup.add_ann(i, i.id)
                annset_target_tk2.remove(i)
                break
            break
    annset_target_tk2 = tmp4.copy()
    annset_target_tk2.remove(annset_tg_dup)

    # ----------------------------------

    # convert the gold-standard annotation set to tokens
    # annset_gold_tk = 0
    # for i in annset_gold:
    #     for j in annset_tk:
    #         if i.iscovering(j):
    #             annset_gold_tk += 1
    annset_gold_tk = 0
    annset_gold_tk2 = doc_gold.annset("goldtk")

    for i in annset_gold:
        for j in annset_mm_mapping_tk:
            if i.iscovering(j) or i.iswithin(j):
                annset_gold_tk2.add_ann(j.copy())
                annset_gold_tk += 1
    # print("annset_gold_tk", annset_gold_tk)

    annset_score = doc_target.annset("score")
    score = 0

    # scoring section
    for i in annset_target_tk2:
        for j in annset_gold_tk2:
            if i.iscoextensive(j):
                annset_score.add_ann(i.copy())
                score += 1

    score_tk = 0
    for i in annset_score:
        for j in annset_mm_mapping_tk:
            if i.iscovering(j) or i.iswithin(j):
                score_tk += 1

    # ---------- annset_score preprocessing ------------------
    score_duplicate = 0
    tmp5 = annset_score.copy()
    annset_sc_dup = doc_target.annset("duplicate")
    annset_score_original = annset_score.size  # total count (including duplicates)
    while annset_score.size:
        annset_score_first = annset_score.first()
        annset_score.remove(annset_score_first)
        for i in annset_score:
            while annset_score_first.iswithin(
                    i) or annset_score_first.iscovering(
                        i) or annset_score_first.iscoextensive(i):
                annset_sc_dup.add_ann(i, i.id)
                annset_score.remove(i)
                break
            break
    annset_score = tmp5.copy()
    annset_score.remove(annset_sc_dup)
    # annset_score_cali = annset_score_original - annset_sc_dup.size
    # ----------------------------------

    print(
        "gold-standard tokens (vs mm):",
        annset_gold_tk2.size,
        "system annotations (vs mm, deduplicated):",
        annset_target_tk2.size - math.sqrt(annset_target_tk2.size),
        "scored tokens:",
        annset_score_original,
        "score after correction:",
        annset_score.size,
    )

    if annset_gold_tk != 0 and annset_target_tk2.size != 0:
        print(
            "Precision:",
            round(
                (annset_score.size /
                 (annset_target_tk2.size - math.sqrt(annset_target_tk2.size)) *
                 100), 2), "Recall:",
            round((annset_score.size / annset_gold_tk2.size * 100), 2))
        # write the report
        f = open("Clinical_Note/Result/" + str(patient_id) + '_result.csv',
                 'a')
        str_in = str(start_date) + "," + str(num_of_day)
        f.writelines(str_in)

        if round(
            (annset_score_original * 2 /
             (annset_target_tk2.size - math.sqrt(annset_target_tk2.size)) *
             100), 2) > 100:
            res_p = 100
        else:
            res_p = round(
                (annset_score_original * 2 /
                 (annset_target_tk2.size - math.sqrt(annset_target_tk2.size)) *
                 100), 2)

        if round((annset_score.size / annset_gold_tk2.size * 100), 2) > 100:
            res_r = 100
        else:
            res_r = round((annset_score.size / annset_gold_tk2.size * 100), 2)

        str_in = "," + str(res_p) + "," + str(res_r) + '\n'
        f.writelines(str_in)
Example 16
#!/usr/bin/env python
"""
Simple demo implementation for generating an HTML viewer for a bdoc document
"""
import sys
import os
import argparse
from gatenlp import Document

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("infile", help="input bdoc document")
    parser.add_argument("outfile", help="output html file")
    parser.add_argument("--offline",
                        action="store_true",
                        help="Generate for offline use")
    parser.add_argument(
        "--notebook",
        action="store_true",
        help="Generate for HTML embedding in notebook",
    )
    args = parser.parse_args()

    doc = Document.load(args.infile, fmt="json")
    html = doc.save_mem(fmt="html-ann-viewer",
                        offline=args.offline,
                        notebook=args.notebook)

    with open(args.outfile, "wt", encoding="utf-8") as outfp:
        outfp.write(html)
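
Assuming the script is saved as bdoc2htmlviewer.py (the file name is hypothetical), a typical invocation producing a self-contained page for offline viewing would be:

    python bdoc2htmlviewer.py mydoc.bdocjs mydoc.html --offline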