Code example #1
File: stanfordner.py  Project: neeraj196/IHP
    def process_sentence(self, out, sid, results):
        # look up the document once instead of rebuilding the id three times
        doc_id = '.'.join(sid.split('.')[:-1])
        document = results.corpus.documents[doc_id]
        sentence = document.get_sentence(sid)
        if sentence is None:
            print(sid)
            print("not found!")
            print(document)
            print([s.sid for s in document.sentences])
            sys.exit()
        tagged_tokens = self.tag_tokens(out, sentence)
        # print(tagged_tokens[0][2].text)
        sentence.tagged = tagged_tokens
        new_entity = None
        for t in tagged_tokens:
            text, tag, token = t
            if tag.startswith("S"):
                single_entity = create_entity(tokens=[token],
                                                      sid=sentence.sid, did=sentence.did,
                                                      text=text, score=1, etype=self.etype)
                eid = sentence.tag_entity(start=token.start, end=token.end, etype=self.etype,
                                            entity=single_entity, source=self.path)
                single_entity.eid = eid
                results.entities[eid] = single_entity # deepcopy
                #logging.info("new single entity: {}".format(single_entity))
            elif tag.startswith("B"):
                new_entity = create_entity(tokens=[token],
                                                   sid=sentence.sid, did=sentence.did,
                                                   text=text, score=1, etype=self.etype)
            elif tag.startswith("I"):
                if not new_entity:
                    logging.info("starting with inside...")
                    new_entity = create_entity(tokens=[token],
                                                   sid=sentence.sid, did=sentence.did,
                                                   text=text, score=1, etype=self.etype)
                else:
                    new_entity.tokens += [token]
            elif tag.startswith("E"):
                if not new_entity:
                    new_entity = create_entity(tokens=[token],
                                               sid=sentence.sid, did=sentence.did,
                                               text=text,
                                               score=1, etype=self.etype)
                    logging.debug("started from a end: {0}".format(new_entity))
                else:
                    new_entity.tokens += [token]
                    new_entity.text = sentence.text[new_entity.tokens[0].start:new_entity.tokens[-1].end]
                    new_entity.end = new_entity.start + len(new_entity.text)
                    new_entity.dend = new_entity.dstart + len(new_entity.text)

                #logging.info("%s end: %s" % (new_entity.sid, str(new_entity)))
                #logging.debug("found the end: %s", ''.join([t.text for t in new_entity.tokens]))
                eid = sentence.tag_entity(start=new_entity.tokens[0].start,
                                          end=new_entity.tokens[-1].end, etype=self.etype,
                                          entity=new_entity, source=self.path)
                new_entity.eid = eid
                results.entities[eid] = new_entity # deepcopy
                new_entity = None
                logging.debug("completed entity:{}".format(results.entities[eid]))
        return results
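
Note: the S/B/I/E branches above are a standard BIOES span decoder. For reference, here is a minimal self-contained sketch of the same decoding idea; decode_bioes and its plain tag strings are illustrative, not part of the project:

def decode_bioes(tags):
    """Collect (first, last) token-index spans from a BIOES tag sequence."""
    spans, begin = [], None
    for i, tag in enumerate(tags):
        if tag.startswith("S"):      # single-token entity
            spans.append((i, i))
        elif tag.startswith("B"):    # entity opens
            begin = i
        elif tag.startswith("E"):    # entity closes; tolerate a stray E, as the code above does
            spans.append((begin if begin is not None else i, i))
            begin = None
    return spans

# decode_bioes(["O", "B", "I", "E", "O", "S"]) -> [(1, 3), (5, 5)]

Unlike the method above, the sketch ignores a bare I tag for brevity; the project code starts a new entity there and logs "starting with inside...".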
Code example #2
File: crfsuitener.py  Project: AndreLamurias/IBEnt
 def process_sentence(self, predicted, isent, results):
     sentence = results.corpus.get_sentence(self.sids[isent])
     # check for a missing sentence before touching sentence.tokens
     if sentence is None:
         print(self.sids[isent])
         print("not found!")
         sys.exit()
     if len(predicted) != len(sentence.tokens):
         print("len(predicted) != len(sentence.tokens); {}!={}".format(len(predicted), len(sentence.tokens)))
         sys.exit()
     sentence.tagged = predicted
     new_entity = None
     for it, t in enumerate(predicted):
         token = sentence.tokens[it]
         if t == "single":
             single_entity = create_entity(tokens=[token],
                                   sid=sentence.sid, did=sentence.did,
                                   text=token.text, score=self.scores[isent][it], etype=self.etype)
             eid = sentence.tag_entity(start=token.start, end=token.end, etype=self.etype,
                                         entity=single_entity, source=self.path)
             single_entity.eid = eid
             results.entities[eid] = single_entity # deepcopy
         elif t == "start":
             new_entity = create_entity(tokens=[token],
                                                sid=sentence.sid, did=sentence.did,
                                                text=token.text, score=self.scores[isent][it], etype=self.etype)
         elif t == "middle":
             if not new_entity:
                 logging.info("starting with inside...")
                 new_entity = create_entity(tokens=[token],
                                                sid=sentence.sid, did=sentence.did,
                                                text=token.text, score=self.scores[isent][it], etype=self.etype)
             else:
                 new_entity.tokens.append(token)
                 new_entity.score += self.scores[isent][it]
         elif t == "end":
             if not new_entity:
                 new_entity = create_entity(tokens=[token],
                                            sid=sentence.sid, did=sentence.did,
                                            text=token.text,
                                            score=self.scores[isent][it], etype=self.etype)
                 logging.debug("started from a end: {0}".format(new_entity))
             else:
                 new_entity.tokens.append(token)
                 new_entity.text = sentence.text[new_entity.tokens[0].start:new_entity.tokens[-1].end]
                 new_entity.end = new_entity.start + len(new_entity.text)
                 new_entity.dend = new_entity.dstart + len(new_entity.text)
                 new_entity.score += self.scores[isent][it]
                 new_entity.score = new_entity.score/len(new_entity.tokens)
             #logging.info("%s end: %s" % (new_entity.sid, str(new_entity)))
             #logging.debug("found the end: %s", ''.join([t.text for t in new_entity.tokens]))
             eid = sentence.tag_entity(new_entity.tokens[0].start, new_entity.tokens[-1].end, self.etype,
                                       entity=new_entity, source=self.path)
             new_entity.eid = eid
             results.entities[eid] = new_entity # deepcopy
             new_entity = None
             #logging.debug("completed entity:{}".format(results.entities[eid]))
     return results
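
This crfsuite variant labels tokens "single"/"start"/"middle"/"end", which map one-to-one onto the S/B/I/E tags of example #1, and it also carries a confidence: per-token scores are summed as the entity grows and divided by the token count at the closing "end" tag. A small illustrative sketch of both conventions (the names below are hypothetical, not from the project):

# crfsuite label -> BIOES tag as used in example #1
LABEL_TO_BIOES = {"single": "S", "start": "B", "middle": "I", "end": "E"}

def entity_score(token_scores):
    # mean of the per-token marginals, matching the running sum above
    return sum(token_scores) / len(token_scores)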
Code example #3
    def process_sentence(self, out, sentence):
        sentence_entities = {}
        did = sentence.did
        sid = sentence.sid
        tagged_tokens = self.split_tag_tokens(out, sentence)
        new_entity = None
        for t in tagged_tokens:
            text, tag, token = t
            if tag.startswith("S"):
                single_entity = create_entity(tokens=[token],
                                                      sid=sid, did=did,
                                                      text=text, score=1, etype=self.etype)
                eid = sentence.tag_entity(start=token.start, end=token.end, etype=self.etype,
                                            entity=single_entity, source=self.path)
                single_entity.eid = eid
                #results.entities[eid] = single_entity # deepcopy
                sentence_entities[eid] = single_entity
                #logging.info("new single entity: {}".format(single_entity))
            elif tag.startswith("B"):
                new_entity = create_entity(tokens=[token],
                                                   sid=sid, did=did,
                                                   text=text, score=1, etype=self.etype)
            elif tag.startswith("I"):
                if not new_entity:
                    logging.info("starting with inside...")
                    new_entity = create_entity(tokens=[token],
                                                   sid=sid, did=did,
                                                   text=text, score=1, etype=self.etype)
                else:
                    new_entity.tokens += [token]
            elif tag.startswith("E"):
                if not new_entity:
                    new_entity = create_entity(tokens=[token],
                                               sid=sid, did=did,
                                               text=text,
                                               score=1, etype=self.etype)
                    logging.debug("started from a end: {0}".format(new_entity))
                else:
                    new_entity.tokens += [token]
                    new_entity.text = sentence.text[new_entity.tokens[0].start:new_entity.tokens[-1].end]
                    new_entity.end = new_entity.start + len(new_entity.text)
                    new_entity.dend = new_entity.dstart + len(new_entity.text)

                #logging.info("%s end: %s" % (new_entity.sid, str(new_entity)))
                #logging.debug("found the end: %s", ''.join([t.text for t in new_entity.tokens]))
                eid = sentence.tag_entity(start=new_entity.tokens[0].start,
                                          end=new_entity.tokens[-1].end, etype=self.etype,
                                          entity=new_entity, source=self.path)
                new_entity.eid = eid
                #results.entities[eid] = new_entity # deepcopy
                sentence_entities[eid] = new_entity
                new_entity = None
                try:
                    logging.debug("completed entity:{}".format(sentence_entities[eid]))
                except UnicodeDecodeError:
                    pass
        return sentence_entities
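
Unlike examples #1 and #2, this variant returns a per-sentence dict keyed by entity id rather than writing into a shared results object, so outputs can be merged afterwards. A hypothetical merge step (the outputs list is assumed for illustration, not shown in the source):

# merge per-sentence outputs into one entity table (illustrative)
all_entities = {}
for sentence_entities in outputs:           # one dict per call to process_sentence
    all_entities.update(sentence_entities)  # eids embed the sid, so keys do not collide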
Code example #4
 def process_entity(self, line, sentence):
     """
     Process one line of BANNER output
     :param line: list of elements of the line to be processed
     :param sentence: Sentence object associated with line
     :return:
     """
     # sid, genetype, start, end, etext = line.strip().split("\t")
     print(line)
     sid, genetype, start, end, etext = line
     tokens = sentence.find_tokens_between(int(start),
                                           int(end),
                                           relativeto="sent")
     if tokens:
         new_entity = create_entity(tokens=tokens,
                                    sid=sentence.sid,
                                    did=sentence.did,
                                    text=etext,
                                    score=1,
                                    etype=self.etype)
         eid = sentence.tag_entity(start=new_entity.tokens[0].start,
                                   end=new_entity.tokens[-1].end,
                                   etype=self.etype,
                                   entity=new_entity,
                                   source=self.path)
         new_entity.eid = eid
         return sentence, new_entity
     else:
         logging.info("No tokens found: {}-{}".format(start, end))
         logging.info(sentence.text)
         return sentence, None
Code example #5
File: banner.py  Project: AndreLamurias/IBEnt
 def process_entity(self, line, sentence):
     """
     Process one line of BANNER output
     :param line: list of elements of the line to be processed
     :param sentence: Sentence object associated with line
     :return:
     """
     # sid, genetype, start, end, etext = line.strip().split("\t")
     print(line)
     sid, genetype, start, end, etext = line
     tokens = sentence.find_tokens_between(int(start), int(end), relativeto="sent")
     if tokens:
         new_entity = create_entity(tokens=tokens,
                                    sid=sentence.sid, did=sentence.did,
                                    text=etext, score=1, etype=self.etype)
         eid = sentence.tag_entity(start=new_entity.tokens[0].start,
                                   end=new_entity.tokens[-1].end, etype=self.etype,
                                   entity=new_entity, source=self.path)
         new_entity.eid = eid
         return sentence, new_entity
     else:
         logging.info("No tokens found: {}-{}".format(start, end))
         logging.info(sentence.text)
         return sentence, None
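
Both BANNER examples receive line already split into five fields; the commented-out line above the unpacking shows the original tab-separated format. A minimal illustrative parse with made-up sample values:

raw = "doc1.s0\tgene\t10\t17\tBRCA1"       # hypothetical BANNER output line
sid, genetype, start, end, etext = raw.strip().split("\t")
start, end = int(start), int(end)           # offsets are sentence-relative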
Code example #6
File: sentence.py  Project: lasigeBioTM/IBEnt
    def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None,
                   text=None, **kwargs):
        """Find the tokens that match this entity. start and end are relative to the sentence.
           Totalchars is the offset of the sentence on the document."""
        tlist = []
        nextword = ""
        for t in self.tokens:
            # keep only tokens fully covered by the entity span; tokens that
            # merely intersect it are discarded, except exact boundary matches
            if t.start >= start and t.end <= end:
                tlist.append(t)
            elif (t.start == start and t.end > end) or (t.start < start and t.end == end):
                tlist.append(t)
                break
            elif t.start == end+1:
                nextword = t.text
        # filter out tokens that fall inside an excluded span, once, after the
        # scan, instead of redundantly on every iteration (which also shadowed t)
        if exclude is not None:
            exclude_list = []
            for tok in tlist:
                for e in exclude:
                    if tok.start >= e[0] and tok.end <= e[1]-1:
                        exclude_list.append(tok.tid)
            tlist = [t for t in tlist if t.tid not in exclude_list]
        if tlist:
            if exclude is not None:
                newtext = self.text[tlist[0].start:exclude[0][0]]
                #print self.text[exclude[0][0]:exclude[0][1]], exclude
                last_exclude = exclude[0]
                for e in exclude[1:]:
                    if not self.text[e[1]].isspace() and not newtext[-1].isspace():
                        newtext += " "
                    newtext += self.text[last_exclude[1]:e[0]]
                    last_exclude = e
                if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace():
                    newtext += " "
                newtext += self.text[exclude[-1][1]:tlist[-1].end]
                # self.text[exclude[1]:tlist[-1].end]
            else:
                newtext = self.text[tlist[0].start:tlist[-1].end]
            if entity:
                entity.text = newtext
            if "text" in kwargs and newtext != kwargs["text"]:
                if newtext not in kwargs["text"] and kwargs["text"] not in newtext:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                                                                                   start, end, self.sid,
                                                                                                   self.text))
                    logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"]))
                    #sys.exit()
                    #return None
                else:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                 start, end, self.sid, self.text))
                    #for t in self.tokens:
                    #    print (t.start, t.end, t.text),
                    #print
                    #return None
                    # print exclude, self.text[tlist[0].start:tlist[-1].end]
            #     print "tokens found:", [t.text for t in tlist]
                    # sys.exit()
            # else:
            # print "found the tokens!", start, end, kwargs["text"], self.sid

            if self.entities.elist.get(source):
                eid = self.sid + ".e" + str(len(self.entities.elist[source]))
            else:
                eid = self.sid + ".e0"
            subtype = kwargs.get("subtype", "all")
            if entity is None:
                if "text" in kwargs:
                    newtext = kwargs["text"]
                kwargs["eid"] = eid
                entity = create_entity(tlist, self.sid, did=self.did, text=newtext, score=kwargs.get("score"),
                                       etype=etype, eid=eid, subtype=kwargs.get("subtype"),
                                       original_id=kwargs.get("original_id"), nextword=nextword)

                entity.normalize()
            self.entities.add_entity(entity, source)
            # print self.entities.elist["goldstandard"]
            self.label_tokens(tlist, source, etype, subtype=subtype)
            #logging.debug("added {} to {}, now with {} entities".format(newtext, self.sid,
            #                                                                 len(self.entities.elist[source])))
            return eid
        else:
            logging.info("no tokens found:")
            logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text")))
            logging.info(str([(t.start, t.end, t.text) for t in self.tokens]))
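
Entity ids in tag_entity are the sentence id plus a per-source running index, which keeps ids unique within each source. The same scheme in isolation (next_eid is a hypothetical helper, not in the project):

def next_eid(sid, existing):
    """sid + '.e' + running index, as tag_entity does above."""
    return "{}.e{}".format(sid, len(existing))

# next_eid("doc1.s0", []) -> "doc1.s0.e0"; with three entities already -> "doc1.s0.e3"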
Code example #7
File: crfsuitener.py  Project: sdyz5210/IHP
    def process_sentence(self, predicted, sid, results):
        doc_id = '.'.join(sid.split('.')[:-1])
        sentence = results.corpus.documents[doc_id].get_sentence(sid)
        # check for a missing sentence before touching sentence.tokens
        if sentence is None:
            print(sid)
            print("not found!")
            sys.exit()
        if len(predicted) != len(sentence.tokens):
            print("len(predicted) != len(sentence.tokens); {}!={}".format(
                len(predicted), len(sentence.tokens)))
            sys.exit()
        sentence.tagged = predicted
        new_entity = None
        for it, t in enumerate(predicted):
            token = sentence.tokens[it]
            if t == "single":
                single_entity = create_entity(tokens=[token],
                                              sid=sentence.sid,
                                              did=sentence.did,
                                              text=token.text,
                                              score=1,
                                              etype=self.etype)
                eid = sentence.tag_entity(start=token.start,
                                          end=token.end,
                                          etype=self.etype,
                                          entity=single_entity,
                                          source=self.path)
                single_entity.eid = eid
                results.entities[eid] = single_entity  # deepcopy
            elif t == "start":
                new_entity = create_entity(tokens=[token],
                                           sid=sentence.sid,
                                           did=sentence.did,
                                           text=token.text,
                                           score=1,
                                           etype=self.etype)
            elif t == "middle":
                if not new_entity:
                    logging.info("starting with inside...")
                    new_entity = create_entity(tokens=[token],
                                               sid=sentence.sid,
                                               did=sentence.did,
                                               text=token.text,
                                               score=1,
                                               etype=self.etype)
                else:
                    new_entity.tokens.append(token)
            elif t == "end":
                if not new_entity:
                    new_entity = create_entity(tokens=[token],
                                               sid=sentence.sid,
                                               did=sentence.did,
                                               text=token.text,
                                               score=1,
                                               etype=self.etype)
                    logging.debug("started from a end: {0}".format(new_entity))
                else:
                    new_entity.tokens.append(token)
                    new_entity.text = sentence.text[
                        new_entity.tokens[0].start:new_entity.tokens[-1].end]
                    new_entity.end = new_entity.start + len(new_entity.text)
                    new_entity.dend = new_entity.dstart + len(new_entity.text)

                #logging.info("%s end: %s" % (new_entity.sid, str(new_entity)))
                #logging.debug("found the end: %s", ''.join([t.text for t in new_entity.tokens]))
                eid = sentence.tag_entity(new_entity.tokens[0].start,
                                          new_entity.tokens[-1].end,
                                          self.etype,
                                          entity=new_entity,
                                          source=self.path)
                new_entity.eid = eid
                results.entities[eid] = new_entity  # deepcopy
                new_entity = None
                #logging.debug("completed entity:{}".format(results.entities[eid]))
        return results
Code example #8
File: sentence.py  Project: AndreLamurias/IBEnt
    def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None, **kwargs):
        """Find the tokens that match this entity. start and end are relative to the sentence.
           Totalchars is the offset of the sentence on the document."""
        tlist = []
        nextword = ""
        for t in self.tokens:
            # keep only tokens fully covered by the entity span; tokens that
            # merely intersect it are discarded, except exact boundary matches
            if t.start >= start and t.end <= end:
                tlist.append(t)
            elif (t.start == start and t.end > end) or (t.start < start and t.end == end):
                tlist.append(t)
                break
            elif t.start == end+1:
                nextword = t.text
        # filter out tokens that fall inside an excluded span, once, after the
        # scan, instead of redundantly on every iteration (which also shadowed t)
        if exclude is not None:
            exclude_list = []
            for tok in tlist:
                for e in exclude:
                    if tok.start >= e[0] and tok.end <= e[1]-1:
                        exclude_list.append(tok.tid)
            tlist = [t for t in tlist if t.tid not in exclude_list]
        if tlist:
            if exclude is not None:
                newtext = self.text[tlist[0].start:exclude[0][0]]
                #print self.text[exclude[0][0]:exclude[0][1]], exclude
                last_exclude = exclude[0]
                for e in exclude[1:]:
                    if not self.text[e[1]].isspace() and not newtext[-1].isspace():
                        newtext += " "
                    newtext += self.text[last_exclude[1]:e[0]]
                    last_exclude = e
                if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace():
                    newtext += " "
                newtext += self.text[exclude[-1][1]:tlist[-1].end]
                # self.text[exclude[1]:tlist[-1].end]
            else:
                newtext = self.text[tlist[0].start:tlist[-1].end]
            if entity:
                entity.text = newtext
            if "text" in kwargs and newtext != kwargs["text"]:
                if newtext not in kwargs["text"] and kwargs["text"] not in newtext:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                                                                                   start, end, self.sid,
                                                                                                   self.text))
                    logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"]))
                    #sys.exit()
                    return None
                else:
                    logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(tlist[0].start, tlist[-1].end, newtext, kwargs["text"],
                                 start, end, self.sid, self.text))
                    return None
                    # print exclude, self.text[tlist[0].start:tlist[-1].end]
            #     print "tokens found:", [t.text for t in tlist]
                    # sys.exit()
            # else:
            # print "found the tokens!", start, end, kwargs["text"], self.sid

            if self.entities.elist.get(source):
                eid = self.sid + ".e" + str(len(self.entities.elist[source]))
            else:
                eid = self.sid + ".e0"
            if entity is None:
                if "text" in kwargs:
                    newtext = kwargs["text"]
                entity = create_entity(tlist, self.sid, did=self.did, text=newtext, score=kwargs.get("score"),
                                       etype=etype, eid=eid, subtype=kwargs.get("subtype"),
                                       original_id=kwargs.get("original_id"), nextword=nextword)

                entity.normalize()
            self.entities.add_entity(entity, source)
            self.label_tokens(tlist, source, etype)
            #logging.debug("added {} to {}, now with {} entities".format(newtext, self.sid,
            #                                                                 len(self.entities.elist[source])))
            return eid
        else:
            logging.info("no tokens found:")
            logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text")))
            logging.info(str([(t.start, t.end, t.text) for t in self.tokens]))
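
Note that unlike example #6, this version returns None when the recovered token text disagrees with the text passed in, so callers should check for that. A hypothetical call, assuming a Sentence object populated as above (offsets, etype, and text are made up):

# illustrative usage of tag_entity
eid = sentence.tag_entity(start=10, end=17, etype="gene",
                          source="goldstandard", text="BRCA1", score=1)
if eid is None:
    logging.info("entity could not be aligned or its text did not match")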