def get_dic(self, source):
    """Build a JSON-serializable dict for this sentence.

    Contains the sentence id, the document-level offset of the first token,
    the sentence text, the entity dicts produced by *source* (or a
    non-overlapping merge of every source when source == "all"), and the
    pair annotations.
    """
    dic = {}
    dic["id"] = self.sid
    dic["offset"] = str(self.tokens[0].dstart)
    dic["text"] = self.text
    dic["entities"] = []

    def order_and_number(entity_dics):
        # Sort entity dicts by offset and hand out sentence-scoped ids.
        ordered = sorted(entity_dics, key=lambda k: k['offset'])
        for ei, e in enumerate(ordered):
            e["eid"] = self.sid + ".e{}".format(ei)
        return ordered

    if source in self.entities.elist:
        for entity in self.entities.elist[source]:
            dic["entities"].append(entity.get_dic())
        dic["entities"] = order_and_number(dic["entities"])
    elif source == "all":
        # Merge every source, keeping only spans the Offsets tracker accepts.
        tracker = Offsets()
        for esource in self.entities.elist:
            for entity in self.entities.elist[esource]:
                accepted, v, overlapping, to_exclude = tracker.add_offset(
                    Offset(entity.start, entity.end),
                    exclude_this_if=[1, -1, 2, -3],
                    exclude_others_if=[2])
                if accepted:
                    dic["entities"].append(entity.get_dic())
        dic["entities"] = order_and_number(dic["entities"])
    dic["pairs"] = self.pairs.get_dic()
    return dic
def get_entity_offsets(self, esource, ths, rules):
    """Return (dstart, dend, text, extra_info) spans for validated,
    non-overlapping entities from every source whose name starts with
    *esource*.
    """
    spans = []
    tracker = Offsets()
    for source_name in self.elist:
        if not source_name.startswith(esource):
            continue
        for entity in self.elist[source_name]:
            if not entity.validate(ths, rules):
                logging.info("excluded {}".format(entity.text))
                continue
            candidate = Offset(entity.dstart, entity.dend,
                               text=entity.text, sid=entity.sid)
            exclusion = [perfect_overlap]
            if "contained_by" in rules:
                exclusion.append(contained_by)
            accepted, v, overlapped, to_exclude = tracker.add_offset(
                candidate, exclude_this_if=exclusion, exclude_others_if=[])
            if not accepted:
                logging.debug("did not add {}".format(entity.text))
                continue
            extra_info = ['recognized_by:"' + "+".join(entity.recognized_by) + '"']
            spans.append((entity.dstart, entity.dend, entity.text, extra_info))
    return spans
def combine_results(modelname, results, resultsname, etype, models):
    """Merge entities of type *etype* from several NER results objects.

    :param modelname: source name under which merged entities are stored
    :param results: list of ResultsNER objects; the first supplies the reference corpus
    :param resultsname: name for the combined ResultsNER
    :param etype: entity type to keep
    :param models: collection of source names whose entities are eligible
    :return: a new ResultsNER containing the merged, non-overlapping entities
    """
    all_results = ResultsNER(resultsname)
    # first results are used as reference
    all_results.corpus = results[0].corpus
    for r in results:
        print(r.path)  # parenthesized so this also runs on Python 3
        for did in r.corpus.documents:
            for sentence in r.corpus.documents[did].sentences:
                ref_sentence = all_results.corpus.documents[did].get_sentence(
                    sentence.sid)
                if not sentence.entities:
                    continue
                # One overlap tracker per sentence: only the first entity
                # seen for each exact span is kept.
                offsets = Offsets()
                if modelname not in ref_sentence.entities.elist:
                    # ref_sentence is the same object get_sentence returns,
                    # so the redundant second lookup was dropped
                    ref_sentence.entities.elist[modelname] = []
                for s in sentence.entities.elist:
                    if s not in models:
                        continue
                    for e in sentence.entities.elist[s]:
                        if e.type != etype:
                            continue
                        eid_offset = Offset(e.dstart, e.dend,
                                            text=e.text, sid=e.sid)
                        toadd, v, overlapping, to_exclude = offsets.add_offset(
                            eid_offset, exclude_this_if=[perfect_overlap],
                            exclude_others_if=[])
                        if toadd:
                            ref_sentence.entities.elist[modelname].append(e)
    return all_results
def get_offsets(self, esource, ths, rules):
    """Return validated, non-overlapping (dstart, dend, text) spans for
    entities from every source whose name starts with *esource*.
    """
    tracker = Offsets()
    collected = []
    matching = (name for name in self.elist if name.startswith(esource))
    for source_name in matching:
        for entity in self.elist[source_name]:
            if not entity.validate(ths, rules):
                logging.info("excluded {}".format(entity.text))
                continue
            candidate = Offset(entity.dstart, entity.dend,
                               text=entity.text, sid=entity.sid)
            exclusion = [perfect_overlap]
            if "contained_by" in rules:
                exclusion.append(contained_by)
            accepted, v, overlapped, to_exclude = tracker.add_offset(
                candidate, exclude_this_if=exclusion, exclude_others_if=[])
            if accepted:
                collected.append((entity.dstart, entity.dend, entity.text))
    return collected
def get_offsets2(self, esource, ths, rules):
    """Collect (dstart, dend, text) spans for entities of sources starting
    with *esource*, including any extra entity produced by validation, then
    filter overlapping spans.

    :param esource: entity source name prefix
    :param ths: validation thresholds
    :param rules: validation/overlap rules ("contained_by" enables that filter)
    :return: list of (dstart, dend, text) tuples
    """
    spans = []
    offsets = Offsets()
    new_entities = []
    for s in self.elist:
        if not s.startswith(esource):
            continue
        for e in self.elist[s]:
            # the original entity is always kept as a candidate
            new_entities.append(e)
            validated_entity = self.validate(e, ths, rules)
            if validated_entity is not None:  # idiomatic None check
                # NOTE(review): assumes self.elist[esource] exists even when
                # only a prefixed source matched -- confirm, otherwise this
                # raises KeyError
                eid = self.sid + ".e" + str(len(self.elist[esource]))
                new_entity = Entity(validated_entity[1], e.sid,
                                    text=validated_entity[0], did=e.did,
                                    type=e.type, eid=eid)
                new_entity.type = e.type
                new_entities.append(new_entity)
    for new_e in new_entities:
        eid_offset = Offset(new_e.dstart, new_e.dend,
                            text=new_e.text, sid=new_e.sid)
        exclude = [perfect_overlap]
        if "contained_by" in rules:
            exclude.append(contained_by)
        toadd, v, overlapped, to_exclude = offsets.add_offset(
            eid_offset, exclude_this_if=exclude, exclude_others_if=[])
        if toadd:
            spans.append((new_e.dstart, new_e.dend, new_e.text))
    return spans
def tag_sentence(self, sentence, entity_type="entity", offsets=None):
    """Run every compiled pattern over the sentence text and tag match
    group 2 of each hit as an entity of *entity_type*, subject to overlap
    filtering; overlapped previous entities are excluded from the sentence.
    """
    drop_new_if = (partial_overlap_after, partial_overlap_before,
                   contained_by, perfect_overlap)
    drop_old_if = (contains,)
    if not offsets:
        offsets = Offsets()
    for pattern in self.p:
        for match in pattern.finditer(sentence.text):
            span = Offset(*match.span(2))
            logging.info(match.group(2))
            accepted, v, overlapping, to_exclude = offsets.add_offset(
                span, drop_new_if, drop_old_if)
            if not accepted:
                continue
            sentence.tag_entity(span.start, span.end,
                                etype=entity_type, source=self.path)
            for stale in to_exclude:
                sentence.exclude_entity(stale.start, stale.end, self.path)
def tag_sentence(self, sentence, entity_type="entity", offsets=None):
    """Run every compiled pattern over the sentence text and tag match
    group 2 of each hit using this tagger's own etype, subject to overlap
    filtering; overlapped previous entities are excluded from the sentence.

    NOTE(review): the *entity_type* parameter is accepted but not used --
    self.etype is applied instead; confirm this is intentional.
    """
    drop_new_if = (partial_overlap_after, partial_overlap_before,
                   contained_by, perfect_overlap)
    drop_old_if = (contains,)
    if not offsets:
        offsets = Offsets()
    for pattern in self.p:
        for match in pattern.finditer(sentence.text):
            span = Offset(*match.span(2))
            logging.info(match.group(2))
            accepted, v, overlapping, to_exclude = offsets.add_offset(
                span, drop_new_if, drop_old_if)
            if not accepted:
                continue
            sentence.tag_entity(span.start, span.end,
                                etype=self.etype, source=self.path)
            for stale in to_exclude:
                sentence.exclude_entity(stale.start, stale.end, self.path)
def write_chemdner_results(self, source, outfile, ths=None, rules=None, totalentities=0):
    """ Write results that can be evaluated with the BioCreative evaluation script
    :param source: Base model path
    :param outfile: Text Results path to be evaluated
    :param ths: Thresholds (defaults to {"ssm": 0.0})
    :param rules: Validation rules (defaults to [])
    :param totalentities: Number of entities already validated on this document (for ranking)
    :return: (lines, rank)
    """
    # avoid mutable default arguments shared across calls
    if ths is None:
        ths = {"ssm": 0.0}
    if rules is None:
        rules = []
    lines = []
    offsets = Offsets()
    rank = totalentities
    for s in self.elist:
        # use every source whose name starts with the base model path
        if not s.startswith(source):
            continue
        for e in self.elist[s]:
            if not e.validate(ths, rules):
                continue
            # Overlap rules
            eid_offset = Offset(e.dstart, e.dend, text=e.text, sid=e.sid)
            exclude = [perfect_overlap]
            if "contained_by" in rules:
                exclude.append(contained_by)
            # add_offset takes exclude_this_if/exclude_others_if and returns
            # a 4-tuple, matching every other call site in this file (the
            # old exclude_if keyword / 3-tuple unpack did not)
            toadd, v, overlapping, to_exclude = offsets.add_offset(
                eid_offset, exclude_this_if=exclude, exclude_others_if=[])
            if toadd:
                line = e.write_chemdner_line(outfile, rank)
                lines.append(line)
                rank += 1
    return lines, rank
def get_unique_entities(self, source, ths, rules):
    """Return a set of unique validated, non-overlapping entity texts
    (as 1-tuples) from every source whose name starts with *source*.

    :param source: entity source name prefix
    :param ths: validation thresholds
    :param rules: validation/overlap rules ("contained_by" enables that filter)
    :return: set of (text,) tuples
    """
    entities = set()
    offsets = Offsets()
    for s in self.elist:
        if not s.startswith(source):
            continue
        for e in self.elist[s]:
            if not e.validate(ths, rules):
                continue
            eid_offset = Offset(e.dstart, e.dend, text=e.text, sid=e.sid)
            exclude = [perfect_overlap]
            if "contained_by" in rules:
                exclude.append(contained_by)
            # add_offset takes exclude_this_if/exclude_others_if and returns
            # a 4-tuple, matching every other call site in this file (the
            # old exclude_if keyword / 3-tuple unpack did not)
            toadd, v, overlapping, to_exclude = offsets.add_offset(
                eid_offset, exclude_this_if=exclude, exclude_others_if=[])
            if toadd:
                entities.add((e.text,))
    return entities
def get_entity_offsets(self, esource, ths, rules, sentence_tokens):
    """Return (dstart, dend, text, extra_info) spans for validated,
    non-overlapping entities from every source starting with *esource*.

    The accepted entities first have their context attributes set (using the
    sentence tokens and the token orders covered by all accepted entities);
    extra_info then carries context, bitmap and type fields, plus modality,
    polarity and degree for "event" entities.
    """
    annotated_spans = []
    tracker = Offsets()
    covered_token_orders = set()
    accepted_entities = []
    for source_name in self.elist:
        if not source_name.startswith(esource):
            continue
        for entity in self.elist[source_name]:
            if not entity.validate(ths, rules):
                logging.info("excluded {}".format(entity.text))
                continue
            candidate = Offset(entity.dstart, entity.dend,
                               text=entity.text, sid=entity.sid)
            exclusion = [perfect_overlap]
            if "contained_by" in rules:
                exclusion.append(contained_by)
            accepted, v, overlapped, to_exclude = tracker.add_offset(
                candidate, exclude_this_if=exclusion, exclude_others_if=[])
            if accepted:
                accepted_entities.append(entity)
                covered_token_orders.update(t.order for t in entity.tokens)
    for entity in accepted_entities:
        entity.set_attributes(sentence_tokens, covered_token_orders)
        extra_info = [
            "left_context={}".format(entity.before_context),
            "right_context={}".format(entity.after_context),
            "left_bitmap={}".format(entity.before_events),
            "type={}".format(entity.subtype),
        ]
        if entity.type == "event":
            extra_info.append("modality={}".format(entity.modality))
            extra_info.append("polarity={}".format(entity.polarity))
            extra_info.append("degree={}".format(entity.degree))
        annotated_spans.append((entity.dstart, entity.dend, entity.text, extra_info))
    return annotated_spans
def get_unique_entities(self, source, ths, rules):
    """Return a dict mapping each accepted entity's normalized form to
    [text, start, end] (start/end as strings), for validated,
    non-overlapping entities from sources starting with *source*.
    """
    unique = {}
    tracker = Offsets()
    for source_name in self.elist:
        if not source_name.startswith(source):
            continue
        for entity in self.elist[source_name]:
            validated = entity.validate(ths, rules)
            if not validated:
                continue
            # validate returns a list of entities
            for candidate in validated:
                candidate_offset = Offset(candidate.dstart, candidate.dend,
                                          text=candidate.text, sid=candidate.sid)
                exclusion = []
                if "contained_by" in rules:
                    exclusion.append(contained_by)
                accepted, v, overlaping, to_exclude = tracker.add_offset(
                    candidate_offset, exclude_this_if=exclusion,
                    exclude_others_if=[])
                if accepted:
                    unique[candidate.normalized] = [candidate.text,
                                                    str(candidate.start),
                                                    str(candidate.end)]
    return unique
def write_chemdner_results(self, source, outfile, ths=None, rules=None, totalentities=0):
    """ Write results that can be evaluated with the BioCreative evaluation script
    :param source: Base model path
    :param outfile: Text Results path to be evaluated
    :param ths: Thresholds (defaults to {"ssm": 0.0})
    :param rules: Validation rules (defaults to [])
    :param totalentities: Number of entities already validated on this document (for ranking)
    :return: (lines, rank)
    """
    # avoid mutable default arguments shared across calls
    if ths is None:
        ths = {"ssm": 0.0}
    if rules is None:
        rules = []
    lines = []
    offsets = Offsets()
    rank = totalentities
    for s in self.elist:
        # use every source whose name starts with the base model path
        if not s.startswith(source):
            continue
        for e in self.elist[s]:
            if not e.validate(ths, rules):
                continue
            # Overlap rules
            eid_offset = Offset(e.dstart, e.dend, text=e.text, sid=e.sid)
            exclude = [perfect_overlap]
            if "contained_by" in rules:
                exclude.append(contained_by)
            # add_offset takes exclude_this_if/exclude_others_if and returns
            # a 4-tuple, matching every other call site in this file (the
            # old exclude_if keyword / 3-tuple unpack did not)
            toadd, v, overlapping, to_exclude = offsets.add_offset(
                eid_offset, exclude_this_if=exclude, exclude_others_if=[])
            if toadd:
                line = e.write_chemdner_line(outfile, rank)
                lines.append(line)
                rank += 1
    return lines, rank