def process_sentence(self, out, sid, results):
    """Decode BIOES-style tags for one sentence and register its entities.

    :param out: raw tagger output for this sentence
    :param sid: sentence identifier, formatted "<did>.<n>"
    :param results: results container with a .corpus and an .entities dict
    :return: results, with every decoded entity added to results.entities
    """
    # The document ID is the sentence ID minus its last dot-separated field.
    # Hoisted into locals: the original recomputed this lookup up to 4 times.
    did = '.'.join(sid.split('.')[:-1])
    document = results.corpus.documents[did]
    sentence = document.get_sentence(sid)
    if sentence is None:
        # Dump diagnostic context before aborting; sids should always resolve.
        print(sid)
        print("not found!")
        print(document)
        print([s.sid for s in document.sentences])
        sys.exit()
    tagged_tokens = self.tag_tokens(out, sentence)
    sentence.tagged = tagged_tokens
    new_entity = None
    for text, tag, token in tagged_tokens:
        if tag.startswith("S"):
            # Single-token entity: create and register immediately.
            single_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                          text=text, score=1, etype=self.etype)
            eid = sentence.tag_entity(start=token.start, end=token.end, etype=self.etype,
                                      entity=single_entity, source=self.path)
            single_entity.eid = eid
            results.entities[eid] = single_entity
        elif tag.startswith("B"):
            # Begin a multi-token entity.
            new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                       text=text, score=1, etype=self.etype)
        elif tag.startswith("I"):
            if not new_entity:
                # Malformed sequence: I without a preceding B — start a new entity here.
                logging.info("starting with inside...")
                new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                           text=text, score=1, etype=self.etype)
            else:
                new_entity.tokens += [token]
        elif tag.startswith("E"):
            if not new_entity:
                # Malformed sequence: E without a preceding B — treat as one-token entity.
                new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                           text=text, score=1, etype=self.etype)
                logging.debug("started from a end: {0}".format(new_entity))
            else:
                new_entity.tokens += [token]
                # Span complete: refresh surface text and end offsets from the tokens.
                new_entity.text = sentence.text[new_entity.tokens[0].start:new_entity.tokens[-1].end]
                new_entity.end = new_entity.start + len(new_entity.text)
                new_entity.dend = new_entity.dstart + len(new_entity.text)
            eid = sentence.tag_entity(start=new_entity.tokens[0].start, end=new_entity.tokens[-1].end,
                                      etype=self.etype, entity=new_entity, source=self.path)
            new_entity.eid = eid
            results.entities[eid] = new_entity
            new_entity = None
            logging.debug("completed entity:{}".format(results.entities[eid]))
    return results
def process_sentence(self, predicted, isent, results):
    """Decode per-token labels ("single"/"start"/"middle"/"end") for one sentence.

    Accumulates per-token scores from self.scores and averages them over the
    entity's tokens when the entity is closed.

    :param predicted: one label per token of the sentence
    :param isent: index of the sentence in self.sids / self.scores
    :param results: results container with a .corpus and an .entities dict
    :return: results, with every decoded entity added to results.entities
    """
    sentence = results.corpus.get_sentence(self.sids[isent])
    # BUG FIX: check for a missing sentence BEFORE dereferencing it — the
    # original read len(sentence.tokens) first, which raised AttributeError
    # on None instead of reaching this error path.
    if sentence is None:
        print(self.sids[isent])
        print("not found!")
        sys.exit()
    if len(predicted) != len(sentence.tokens):
        # Labels must align one-to-one with tokens; anything else is fatal.
        print("len(predicted) != len(sentence.tokens); {}!={}".format(len(predicted), len(sentence.tokens)))
        sys.exit()
    sentence.tagged = predicted
    new_entity = None
    for it, t in enumerate(predicted):
        token = sentence.tokens[it]
        if t == "single":
            # One-token entity: create and register immediately.
            single_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                          text=token.text, score=self.scores[isent][it],
                                          etype=self.etype)
            eid = sentence.tag_entity(start=token.start, end=token.end, etype=self.etype,
                                      entity=single_entity, source=self.path)
            single_entity.eid = eid
            results.entities[eid] = single_entity
        elif t == "start":
            new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                       text=token.text, score=self.scores[isent][it],
                                       etype=self.etype)
        elif t == "middle":
            if not new_entity:
                # Malformed sequence: "middle" without "start" — begin a new entity.
                logging.info("starting with inside...")
                new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                           text=token.text, score=self.scores[isent][it],
                                           etype=self.etype)
            else:
                new_entity.tokens.append(token)
                new_entity.score += self.scores[isent][it]
        elif t == "end":
            if not new_entity:
                # Malformed sequence: "end" without "start" — one-token entity.
                new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                           text=token.text, score=self.scores[isent][it],
                                           etype=self.etype)
                logging.debug("started from a end: {0}".format(new_entity))
            else:
                new_entity.tokens.append(token)
                # Span complete: refresh surface text/offsets and average the score.
                new_entity.text = sentence.text[new_entity.tokens[0].start:new_entity.tokens[-1].end]
                new_entity.end = new_entity.start + len(new_entity.text)
                new_entity.dend = new_entity.dstart + len(new_entity.text)
                new_entity.score += self.scores[isent][it]
                new_entity.score = new_entity.score/len(new_entity.tokens)
            eid = sentence.tag_entity(new_entity.tokens[0].start, new_entity.tokens[-1].end,
                                      self.etype, entity=new_entity, source=self.path)
            new_entity.eid = eid
            results.entities[eid] = new_entity
            new_entity = None
    return results
def process_sentence(self, out, sentence):
    """Decode BIOES-style tags for one sentence and collect its entities.

    :param out: raw tagger output for this sentence
    :param sentence: Sentence object to annotate in place
    :return: dict mapping entity ID -> entity created for this sentence
    """
    entities = {}
    did, sid = sentence.did, sentence.sid
    current = None  # multi-token entity under construction
    for text, tag, token in self.split_tag_tokens(out, sentence):
        if tag.startswith("S"):
            # One-token entity: register immediately.
            entity = create_entity(tokens=[token], sid=sid, did=did, text=text,
                                   score=1, etype=self.etype)
            eid = sentence.tag_entity(start=token.start, end=token.end, etype=self.etype,
                                      entity=entity, source=self.path)
            entity.eid = eid
            entities[eid] = entity
        elif tag.startswith("B"):
            # Open a multi-token entity.
            current = create_entity(tokens=[token], sid=sid, did=did, text=text,
                                    score=1, etype=self.etype)
        elif tag.startswith("I"):
            if current:
                current.tokens += [token]
            else:
                # I without a preceding B: start a fresh entity from this token.
                logging.info("starting with inside...")
                current = create_entity(tokens=[token], sid=sid, did=did, text=text,
                                        score=1, etype=self.etype)
        elif tag.startswith("E"):
            if current:
                current.tokens += [token]
                # Span is complete: refresh the surface text and end offsets.
                current.text = sentence.text[current.tokens[0].start:current.tokens[-1].end]
                current.end = current.start + len(current.text)
                current.dend = current.dstart + len(current.text)
            else:
                # E without a preceding B: treat as a one-token entity.
                current = create_entity(tokens=[token], sid=sid, did=did, text=text,
                                        score=1, etype=self.etype)
                logging.debug("started from a end: {0}".format(current))
            eid = sentence.tag_entity(start=current.tokens[0].start, end=current.tokens[-1].end,
                                      etype=self.etype, entity=current, source=self.path)
            current.eid = eid
            entities[eid] = current
            current = None
            try:
                logging.debug("completed entity:{}".format(entities[eid]))
            except UnicodeDecodeError:
                # Best-effort debug logging only; never fail on odd encodings.
                pass
    return entities
def process_entity(self, line, sentence):
    """
    Process one line of BANNER output.

    :param line: already-split line fields: [sid, genetype, start, end, etext]
    :param sentence: Sentence object associated with line
    :return: (sentence, new entity), or (sentence, None) when no tokens match the span
    """
    # FIX: replaced a stray py2-only `print line` debug statement with logging.
    logging.debug(line)
    sid, genetype, start, end, etext = line
    tokens = sentence.find_tokens_between(int(start), int(end), relativeto="sent")
    if not tokens:
        logging.info("No tokens found: {}-{}".format(start, end))
        logging.info(sentence.text)
        return sentence, None
    new_entity = create_entity(tokens=tokens, sid=sentence.sid, did=sentence.did,
                               text=etext, score=1, etype=self.etype)
    eid = sentence.tag_entity(start=new_entity.tokens[0].start, end=new_entity.tokens[-1].end,
                              etype=self.etype, entity=new_entity, source=self.path)
    new_entity.eid = eid
    return sentence, new_entity
def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None,
               text=None, **kwargs):
    """Find the tokens that match this entity and register it on this sentence.

    start and end are relative to the sentence; totalchars is the offset of the
    sentence on the document.

    :param start: entity start offset, relative to the sentence text
    :param end: entity end offset, relative to the sentence text
    :param etype: entity type label
    :param entity: pre-built entity to register; if None one is created here
    :param source: name of the entity list to add the entity to
    :param exclude: optional list of (start, end) spans to cut out of the text
    :param text: expected surface text, used for mismatch diagnostics
    :return: the new entity ID, or None when no tokens match the span
    """
    tlist = []
    nextword = ""
    for t in self.tokens:
        # Keep tokens fully inside the span; tokens that merely intersect the
        # entity are discarded for now, except the exact-boundary cases below.
        if t.start >= start and t.end <= end:
            tlist.append(t)
        elif (t.start == start and t.end > end) or (t.start < start and t.end == end):
            # Token shares one exact boundary but overshoots the other: take it and stop.
            tlist.append(t)
            break
        elif t.start == end + 1:
            nextword = t.text  # token immediately following the entity
    if exclude is not None:
        # Drop any matched token that falls inside an excluded span.
        exclude_list = []
        for t in tlist:
            for e in exclude:
                if t.start >= e[0] and t.end <= e[1] - 1:
                    exclude_list.append(t.tid)
        tlist = [t for t in tlist if t.tid not in exclude_list]
    if tlist:
        if exclude is not None:
            # Stitch the surface text together around the excluded spans,
            # inserting a space where a cut would glue two words together.
            newtext = self.text[tlist[0].start:exclude[0][0]]
            last_exclude = exclude[0]
            for e in exclude[1:]:
                if not self.text[e[1]].isspace() and not newtext[-1].isspace():
                    newtext += " "
                newtext += self.text[last_exclude[1]:e[0]]
                last_exclude = e
            if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace():
                newtext += " "
            newtext += self.text[exclude[-1][1]:tlist[-1].end]
        else:
            newtext = self.text[tlist[0].start:tlist[-1].end]
        if entity:
            entity.text = newtext
        # BUG FIX: the original tested `"text" in kwargs`, but the explicit
        # `text` parameter in the signature captures that keyword, so the
        # check could never fire and these diagnostics were dead code.
        if text is not None and newtext != text:
            logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(
                tlist[0].start, tlist[-1].end, newtext, text, start, end, self.sid, self.text))
            if newtext not in text and text not in newtext:
                # Not even a substring relationship: a genuine mismatch.
                logging.info("text does not match: {}=>{}".format(newtext, text))
        # Entity IDs are sequential within this sentence's source list.
        if self.entities.elist.get(source):
            eid = self.sid + ".e" + str(len(self.entities.elist[source]))
        else:
            eid = self.sid + ".e0"
        subtype = kwargs.get("subtype", "all")
        if entity is None:
            # Prefer the caller-supplied text over the recovered surface text.
            if text is not None:
                newtext = text
            entity = create_entity(tlist, self.sid, did=self.did, text=newtext,
                                   score=kwargs.get("score"), etype=etype, eid=eid,
                                   subtype=kwargs.get("subtype"),
                                   original_id=kwargs.get("original_id"), nextword=nextword)
        entity.normalize()
        self.entities.add_entity(entity, source)
        self.label_tokens(tlist, source, etype, subtype=subtype)
        return eid
    else:
        logging.info("no tokens found:")
        logging.info("{} {} {} {}".format(self.sid, start, end, text))
        logging.info(str([(t.start, t.end, t.text) for t in self.tokens]))
        return None
def process_sentence(self, predicted, sid, results):
    """Decode per-token labels ("single"/"start"/"middle"/"end") for one sentence.

    :param predicted: one label per token of the sentence
    :param sid: sentence identifier, formatted "<did>.<n>"
    :param results: results container with a .corpus and an .entities dict
    :return: results, with every decoded entity added to results.entities
    """
    # The document ID is the sentence ID minus its last dot-separated field.
    sentence = results.corpus.documents['.'.join(sid.split('.')[:-1])].get_sentence(sid)
    # BUG FIX: check for a missing sentence BEFORE dereferencing it — the
    # original read len(sentence.tokens) first, which raised AttributeError
    # on None instead of reaching this error path.
    if sentence is None:
        print(sid)
        print("not found!")
        sys.exit()
    if len(predicted) != len(sentence.tokens):
        # Labels must align one-to-one with tokens; anything else is fatal.
        print("len(predicted) != len(sentence.tokens); {}!={}".format(len(predicted), len(sentence.tokens)))
        sys.exit()
    sentence.tagged = predicted
    new_entity = None
    for it, t in enumerate(predicted):
        token = sentence.tokens[it]
        if t == "single":
            # One-token entity: create and register immediately.
            single_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                          text=token.text, score=1, etype=self.etype)
            eid = sentence.tag_entity(start=token.start, end=token.end, etype=self.etype,
                                      entity=single_entity, source=self.path)
            single_entity.eid = eid
            results.entities[eid] = single_entity
        elif t == "start":
            new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                       text=token.text, score=1, etype=self.etype)
        elif t == "middle":
            if not new_entity:
                # Malformed sequence: "middle" without "start" — begin a new entity.
                logging.info("starting with inside...")
                new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                           text=token.text, score=1, etype=self.etype)
            else:
                new_entity.tokens.append(token)
        elif t == "end":
            if not new_entity:
                # Malformed sequence: "end" without "start" — one-token entity.
                new_entity = create_entity(tokens=[token], sid=sentence.sid, did=sentence.did,
                                           text=token.text, score=1, etype=self.etype)
                logging.debug("started from a end: {0}".format(new_entity))
            else:
                new_entity.tokens.append(token)
                # Span complete: refresh surface text and end offsets from the tokens.
                new_entity.text = sentence.text[new_entity.tokens[0].start:new_entity.tokens[-1].end]
                new_entity.end = new_entity.start + len(new_entity.text)
                new_entity.dend = new_entity.dstart + len(new_entity.text)
            eid = sentence.tag_entity(new_entity.tokens[0].start, new_entity.tokens[-1].end,
                                      self.etype, entity=new_entity, source=self.path)
            new_entity.eid = eid
            results.entities[eid] = new_entity
            new_entity = None
    return results
def tag_entity(self, start, end, etype, entity=None, source="goldstandard", exclude=None, **kwargs):
    """Find the tokens that match this entity and register it on this sentence.

    start and end are relative to the sentence; totalchars is the offset of the
    sentence on the document.

    :param start: entity start offset, relative to the sentence text
    :param end: entity end offset, relative to the sentence text
    :param etype: entity type label
    :param entity: pre-built entity to register; if None one is created here
    :param source: name of the entity list to add the entity to
    :param exclude: optional list of (start, end) spans to cut out of the text
    :return: the new entity ID, or None when no tokens match or the matched
             text conflicts with kwargs["text"]
    """
    tlist = []
    nextword = ""
    for t in self.tokens:
        # Keep tokens fully inside the span; tokens that merely intersect the
        # entity are discarded for now, except the exact-boundary cases below.
        if t.start >= start and t.end <= end:
            tlist.append(t)
        elif (t.start == start and t.end > end) or (t.start < start and t.end == end):
            # Token shares one exact boundary but overshoots the other: take it and stop.
            tlist.append(t)
            break
        elif t.start == end + 1:
            nextword = t.text  # token immediately following the entity
    if exclude is not None:
        # Drop any matched token that falls inside an excluded span.
        exclude_list = []
        for t in tlist:
            for e in exclude:
                if t.start >= e[0] and t.end <= e[1] - 1:
                    exclude_list.append(t.tid)
        tlist = [t for t in tlist if t.tid not in exclude_list]
    if tlist:
        if exclude is not None:
            # Stitch the surface text together around the excluded spans,
            # inserting a space where a cut would glue two words together.
            newtext = self.text[tlist[0].start:exclude[0][0]]
            last_exclude = exclude[0]
            for e in exclude[1:]:
                if not self.text[e[1]].isspace() and not newtext[-1].isspace():
                    newtext += " "
                newtext += self.text[last_exclude[1]:e[0]]
                last_exclude = e
            if not self.text[exclude[-1][1]].isspace() and not newtext[-1].isspace():
                newtext += " "
            newtext += self.text[exclude[-1][1]:tlist[-1].end]
        else:
            newtext = self.text[tlist[0].start:tlist[-1].end]
        if entity:
            entity.text = newtext
        if "text" in kwargs and newtext != kwargs["text"]:
            # Matched text disagrees with the caller's expectation: log and reject.
            # FIX: this log call was duplicated verbatim in both branches below;
            # it is hoisted here with identical emitted messages.
            logging.info("diferent text:|system {} {} |{}|=>|{}| {} {} input|{} {}".format(
                tlist[0].start, tlist[-1].end, newtext, kwargs["text"], start, end, self.sid, self.text))
            if newtext not in kwargs["text"] and kwargs["text"] not in newtext:
                # Not even a substring relationship: a genuine mismatch.
                logging.info("text does not match: {}=>{}".format(newtext, kwargs["text"]))
            return None
        # Entity IDs are sequential within this sentence's source list.
        if self.entities.elist.get(source):
            eid = self.sid + ".e" + str(len(self.entities.elist[source]))
        else:
            eid = self.sid + ".e0"
        if entity is None:
            # Prefer the caller-supplied text over the recovered surface text.
            if "text" in kwargs:
                newtext = kwargs["text"]
            entity = create_entity(tlist, self.sid, did=self.did, text=newtext,
                                   score=kwargs.get("score"), etype=etype, eid=eid,
                                   subtype=kwargs.get("subtype"),
                                   original_id=kwargs.get("original_id"), nextword=nextword)
        entity.normalize()
        self.entities.add_entity(entity, source)
        self.label_tokens(tlist, source, etype)
        return eid
    else:
        logging.info("no tokens found:")
        logging.info("{} {} {} {}".format(self.sid, start, end, kwargs.get("text")))
        logging.info(str([(t.start, t.end, t.text) for t in self.tokens]))
        return None