def __nil_clustering(nom_dict_file, edl_file, dst_file):
    nom_names = load_nom_dict(nom_dict_file)
    all_mentions = Mention.load_edl_file(edl_file)
    nil_mentions = [m for m in all_mentions
                    if m.kbid.startswith('NIL') and m.name.lower() not in nom_names]
    kbid_mentions = __group_mentions_by_kbid(nil_mentions)
    new_kbids, new_mentions_kbids = list(), list()
    for kbid, mentions in kbid_mentions.iteritems():
        merged = False
        for nkbid, nmentions in izip(new_kbids, new_mentions_kbids):
            if __should_merge(mentions, nmentions):
                # move the whole group into the existing cluster
                for m in mentions:
                    m.kbid = nkbid
                    nmentions.append(m)
                merged = True
                break
        if not merged:
            new_kbids.append(kbid)
            new_mentions_kbids.append(mentions)
    Mention.save_as_edl_file(all_mentions, dst_file)
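# __should_merge is defined elsewhere in the module; the stand-in below is a
# hedged sketch (the name and the rule are illustrative assumptions, not the
# project's actual criterion): merge two NIL clusters when they share a
# lowercased surface name.
def __should_merge_sketch(mentions, nmentions):
    names = set(m.name.lower() for m in mentions)
    return any(nm.name.lower() in names for nm in nmentions)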
def prev_mentions_format_to_new(tab_file, xml_file, output_file):
    xml_text = __read_text_file(xml_file)
    miter = re.finditer(xml_mention_pattern_str, xml_text)
    mentions_dict = dict()
    beg_pos_dict = dict()
    for m in miter:
        cur_doc_id = m.group(3)
        mention = Mention(name=m.group(2), docid=cur_doc_id, mention_id=m.group(1))
        doc_beg = beg_pos_dict.get(cur_doc_id, 0)  # TODO
        mention.beg_pos = doc_beg
        mention.end_pos = doc_beg + len(mention.name.encode('utf-8')) - 1
        beg_pos_dict[cur_doc_id] = mention.end_pos + 1
        mentions_dict[mention.mention_id] = mention

    f = open(tab_file, 'r')
    for line in f:
        vals = line.strip().split('\t')
        if len(vals) < 3:
            continue
        m = mentions_dict.get(vals[0], None)
        if m:
            m.kbid = vals[1]
            m.entity_type = vals[2]
    f.close()
    Mention.save_as_edl_file(mentions_dict.values(), output_file)
def clean_ner_result(result_file):
    ord_mention_list = list()
    med_mention_list = list()
    fin = open(result_file, 'rb')
    for line in fin:
        line = line.strip()
        if len(line) == 0:
            continue
        vals = line.split('\t')  # TODO
        # Disease/Chemical spans use an exclusive end offset; make it inclusive
        if vals[3] == 'Disease' or vals[3] == 'Chemical':
            span = (int(vals[0]), int(vals[1]) - 1)
        else:
            span = (int(vals[0]), int(vals[1]))
        mention = Mention()
        mention.span = span
        mention.mtype = vals[3]
        if len(vals) == 4:
            ord_mention_list.append(mention)
        else:
            if vals[4].startswith('MESH'):
                mention.mesh_id = vals[4][5:]
            elif vals[4].startswith('CHEBI'):
                mention.chebi_id = int(vals[4][6:])
            med_mention_list.append(mention)
    fin.close()

    merged_mention_list = list()
    Mention.merge_mention_list(med_mention_list, merged_mention_list)
    Mention.merge_mention_list(ord_mention_list, merged_mention_list)
    return merged_mention_list
def main():
    # dataset = 'LDC2015E75'
    dataset = 'LDC2015E103'
    # dataset = 'LDC2016E63'
    mentions_tag = '0'
    run_id = 4
    # datadir = '/home/dhl/data/EDL/'
    datadir = 'e:/data/edl'
    doc_list_file = os.path.join(datadir, dataset, 'data/eng-docs-list-win.txt')
    mid_type_file = os.path.join(datadir, 'res/freebase/mid-entity-type.txt')
    cur_edl_file = os.path.join(datadir, dataset, 'output/sys-link-sm-%s.tab' % mentions_tag)
    miss_match_mentions_file = os.path.join(datadir, dataset,
                                            'output/miss-match-mentions-%s.txt' % mentions_tag)
    new_edl_file = os.path.join(datadir, dataset, 'output/sys-link-sm-pp-ft-%d.tab' % run_id)

    # __nil_clustering(nom_dict_file, edl_file, dst_file)
    mentions = Mention.load_edl_file(cur_edl_file)
    # __link_nom(doc_mentions_dict, max_nil_id)
    __nil_author_clustering(mentions)
    __fix_special_types(mentions)
    __fix_entity_types_by_mid(mid_type_file, mentions)
    # __fix_type_diff_of_same_kbid(mentions)
    __validate_mentions(doc_list_file, mentions, miss_match_mentions_file)
    __fix_pos_error(mentions)
    Mention.save_as_edl_file(mentions, new_edl_file, runid='WednesdayGo%d' % run_id)
def __expand_location_names(mentions, tokenized_text_file, entity_candidates_dict_file):
    doc_mentions_dict = Mention.group_mentions_by_docid(mentions)
    expansion_candidates = []
    f = open(tokenized_text_file, 'r')
    for line in f:
        vals = line.strip().split('\t')
        docid = vals[0]
        num_lines = int(vals[1])
        doc_mentions = doc_mentions_dict[docid]
        for i in xrange(num_lines):
            line = f.next().decode('utf-8')
            words = line.strip().split(' ')
            expansion_candidates += __find_expansion_candidates_in_location_mentions(
                doc_mentions, words)
    f.close()

    expansion_dict = __filter_expansion_candidates(
        expansion_candidates, entity_candidates_dict_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    for qid, mention in qid_mentions.iteritems():
        exp_name = expansion_dict.get(qid, '')
        if not exp_name:
            continue
        print '%s\t%s\t%s' % (qid, mention.name, exp_name)
        mention.name = exp_name
def __apply_coref(edl_file, linking_info_file, dst_edl_file):
    coref_dict = dict()
    f = open(linking_info_file, 'rb')
    while True:
        docid = ioutils.read_str_with_byte_len(f)
        if not docid:
            break
        num_mentions = np.fromfile(f, '>i4', 1)[0]
        is_nested = np.fromfile(f, 'b', num_mentions)
        corefs = np.fromfile(f, '>i4', num_mentions)
        qids = list()
        for i in xrange(num_mentions):
            qid = __read_mention_from_linking_info_file(f)
            qids.append(qid)
        for coref_id, qid in izip(corefs, qids):
            if coref_id > 0:
                coref_dict[qid] = qids[coref_id]
    f.close()

    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    __assgin_different_id_to_all_nils(mentions)
    print qid_mentions['EDL14_ENG_0052'].kbid
    for m in mentions:
        if not m.kbid.startswith('NIL'):
            continue
        coref_qid = coref_dict.get(m.mention_id, '')
        if coref_qid:
            print m.mention_id, coref_qid, m.name, qid_mentions[coref_qid].kbid
            m.kbid = qid_mentions[coref_qid].kbid
    Mention.save_as_edl_file(mentions, dst_edl_file)
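# For reference, the binary layout consumed by __apply_coref, per document:
# a byte-length-prefixed docid, a big-endian int32 mention count, one int8
# is_nested flag per mention, one big-endian int32 coref index per mention,
# then the mention records themselves. The writer below is a hedged sketch
# mirroring those reads; ioutils.write_str_with_byte_len and
# __write_mention_to_linking_info_file are assumed counterparts and may not
# exist under these names.
def __write_linking_info_sketch(f, docid, is_nested, corefs, mentions):
    ioutils.write_str_with_byte_len(f, docid)
    np.asarray([len(mentions)], '>i4').tofile(f)
    np.asarray(is_nested, 'b').tofile(f)
    np.asarray(corefs, '>i4').tofile(f)
    for m in mentions:
        __write_mention_to_linking_info_file(f, m)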
def link_text(self, text, mention_detection_result):
    result_dict = dict()
    mesh_mention_list = self.__find_mesh_mentions(text)
    merged_mention_list = list()
    Mention.merge_mention_list(mention_detection_result, merged_mention_list)
    Mention.merge_mention_list(mesh_mention_list, merged_mention_list)
    self.__link_mention_to_wiki(text, merged_mention_list)

    mesh_idx_dict, wiki_idx_dict, chebi_idx_dict, idx_list = MedLink.__asign_indices(
        merged_mention_list)
    result_dict['entities'] = entities_dict = dict()
    self.__add_wiki_mention_info(wiki_idx_dict, entities_dict)
    self.__add_mesh_mention_info(mesh_idx_dict, entities_dict)
    self.__add_chebi_mention_info(chebi_idx_dict, entities_dict)

    result_span_list = list()
    mention_type_list = list()
    for mention in merged_mention_list:
        result_span_list.append(mention.span)
        mention_type_list.append(mention.mtype)
    result_dict['spans'] = result_span_list
    result_dict['idx'] = idx_list
    result_dict['type'] = mention_type_list
    return json.dumps(result_dict, indent=2)
def __list_errors():
    gold_edl_file = 'e:/data/el/LDC2015E20/data/eval/data/mentions-raw.tab'
    sys_edl_file = 'e:/data/el/LDC2015E20/data/eval/output/emadr-result-coref.tab'
    eid_wid_file = 'e:/data/el/res/eid_wid_ord_eid.txt'
    eid_wid_dict = load_eid_wid_file(eid_wid_file)

    gold_mentions = Mention.load_edl_file(gold_edl_file)
    gold_qid_mentions = Mention.group_mentions_by_qid(gold_mentions)
    sys_mentions = Mention.load_edl_file(sys_edl_file)
    sys_qid_mentions = Mention.group_mentions_by_qid(sys_mentions)
    for qid, mention in gold_qid_mentions.iteritems():
        sys_mention = sys_qid_mentions[qid]
        if sys_mention.kbid == mention.kbid:
            continue
        if sys_mention.kbid.startswith('NIL') and mention.kbid.startswith('NIL'):
            continue
        if mention.kbid.startswith('NIL'):
            continue
        wid_gold = eid_wid_dict.get(mention.kbid, -1)
        wid_sys = eid_wid_dict.get(sys_mention.kbid, -1)
        print '%s\t%s\t%s\t%s\t%d\t%d\t%s' % (
            qid, mention.kbid, sys_mention.kbid, mention.docid,
            mention.beg_pos, mention.end_pos, mention.name)
        print wid_gold, wid_sys
def __save_link_result(edl_file, result_triv, qids, kbids_list, y_pred, max_scores,
                       dst_file, use_nil_thres):
    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        m.kbid = 'NODEF'
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    for qid, kbid in result_triv.iteritems():
        qid_mentions[qid].kbid = kbid
    for qid, kbids, y, max_score in izip(qids, kbids_list, y_pred, max_scores):
        if y >= len(kbids):
            print y, len(kbids)
        if qid_mentions[qid].kbid == 'NODEF':
            if use_nil_thres and max_score < 0.5:
                qid_mentions[qid].kbid = 'NIL'
            else:
                qid_mentions[qid].kbid = kbids[y]
    for m in mentions:
        if m.kbid.startswith('m.') or m.kbid.startswith('NIL'):
            m.kbid = 'NIL0001'
    Mention.save_as_edl_file(mentions, dst_file)
def __remove_leading_the(mentions_file, dst_mentions_edl_file):
    mentions = Mention.load_edl_file(mentions_file)
    for m in mentions:
        if m.name.startswith('the '):
            m.name = m.name[4:]
            m.beg_pos += 4
    Mention.save_as_edl_file(mentions, dst_mentions_edl_file)
def __evaluate_edl(gold_edl_file, sys_edl_file, require_type_match,
                   link_error_file, type_error_file):
    gold_mentions = Mention.load_edl_file(gold_edl_file, arrange_by_docid=True)
    sys_mentions = Mention.load_edl_file(sys_edl_file, arrange_by_docid=True)

    link_errors = list()
    type_errors = list()
    sys_cnt, gold_cnt, hit_cnt = 0, 0, 0
    for docid, sys_mentions_doc in sys_mentions.iteritems():
        gold_mentions_doc = gold_mentions.get(docid, list())
        for gm in gold_mentions_doc:
            if not gm.kbid.startswith('NIL'):
                gold_cnt += 1
        hit_list = [False for _ in xrange(len(gold_mentions_doc))]
        for sm in sys_mentions_doc:
            for i, gm in enumerate(gold_mentions_doc):
                if sm.beg_pos == gm.beg_pos and sm.end_pos == gm.end_pos:
                    hit_list[i] = True
                    break
            if sm.kbid.startswith('NIL'):
                continue
            sys_cnt += 1
            for i, gm in enumerate(gold_mentions_doc):
                if sm.beg_pos != gm.beg_pos or sm.end_pos != gm.end_pos:
                    continue
                if gm.mention_type == 'NOM':
                    sys_cnt -= 1
                    break
                if sm.entity_type != gm.entity_type:
                    type_errors.append((gm, sm))
                if sm.kbid == gm.kbid and ((not require_type_match) or
                                           sm.entity_type == gm.entity_type):
                    hit_cnt += 1
                if sm.kbid != gm.kbid:
                    link_errors.append((gm, sm))

    link_errors.sort(key=lambda x: x[0].name)
    __write_link_errors(link_errors, link_error_file)
    type_errors.sort(key=lambda x: x[0].name)
    __write_type_errors(type_errors, type_error_file)

    print '#hit: %d, #sys: %d, #gold: %d' % (hit_cnt, sys_cnt, gold_cnt)
    hit_cnt = float(hit_cnt)
    prec = hit_cnt / sys_cnt
    recall = hit_cnt / gold_cnt
    f1 = 2 * prec * recall / (prec + recall)
    print 'prec: %f, recall: %f, f1: %f' % (prec, recall, f1)
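# Sanity check for the metric above, with illustrative counts: 50 hits,
# 80 system mentions and 100 gold mentions give prec = 50/80 = 0.625,
# recall = 50/100 = 0.5, and f1 = 2 * 0.625 * 0.5 / (0.625 + 0.5) ~= 0.5556.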
def __find_mesh_mentions(self, text):
    mesh_spans, mesh_ids = self.mesh_match.find_all_terms(text)
    mention_list = list()
    for mesh_span, mesh_id in izip(mesh_spans, mesh_ids):
        mention = Mention()
        mention.span = mesh_span
        mention.mtype = 'MISC'
        mention.mesh_id = mesh_id
        mention_list.append(mention)
    return mention_list
def __find_type_errors(gold_edl_file, sys_edl_file):
    gold_mentions_docs = Mention.load_edl_file(gold_edl_file, arrange_by_docid=True)
    sys_mentions_docs = Mention.load_edl_file(sys_edl_file, arrange_by_docid=True)
    all_errors = list()
    for docid, sys_mentions in sys_mentions_docs.iteritems():
        gold_mentions = gold_mentions_docs[docid]
        all_errors += __find_type_errors_of_docs(docid, gold_mentions, sys_mentions)
    all_errors.sort(key=lambda x: x[0].name.lower())
    for v in all_errors:
        print '%s\t%s\t%s\t%s' % (v[0].name, v[0].entity_type, v[1].entity_type, v[0].docid)
def __extract_nom_mentions(nom_dict_file, doc_list_file, words_pos_file, dst_nom_mentions_file):
    noms = load_nom_dict(nom_dict_file)
    nom_name_list = [n for n in noms]
    nom_name_list.sort(key=lambda x: -len(x))
    nom_name_list = [n.split(' ') for n in nom_name_list]

    doc_path_dict = __load_doc_paths_as_dict(doc_list_file)
    mentions = list()
    f_wp = open(words_pos_file, 'r')
    for i, line in enumerate(f_wp):
        vals = line.rstrip().split('\t')
        docid = vals[0]
        if (i + 1) % 10 == 0:
            print i + 1, docid
        doc_path = doc_path_dict[docid]
        doc_text = read_text(doc_path).decode('utf-8')
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]
        num_sentences = int(vals[1])
        for j in xrange(num_sentences):
            sentence = __next_sentence_in_words_pos_file(f_wp)
            words = [tup[0].lower() for tup in sentence]
            hit_spans, hit_indices = find_phrases_in_words(nom_name_list, words, False)
            for hit_span, hit_idx in izip(hit_spans, hit_indices):
                beg_pos = sentence[hit_span[0]][3]
                end_pos = sentence[hit_span[1] - 1][4]
                tags = [tup[2] for tup in sentence[hit_span[0]:hit_span[1]]]
                # if 'NN' not in tags and 'NNP' not in tags:
                #     continue
                if 'NN' not in tags:
                    continue
                name = doc_text[beg_pos:end_pos + 1].replace('\n', ' ')
                if '<' in name or 'http:' in name or '>' in name:
                    continue
                m = Mention(name=name, beg_pos=beg_pos, end_pos=end_pos, docid=docid,
                            mention_type='NOM', entity_type='PER', kbid='NIL00000')
                mentions.append(m)
    f_wp.close()
    Mention.save_as_edl_file(mentions, dst_nom_mentions_file)
def __gen_ttl_dict():
    edl_file = '/home/dhl/data/EDL/LDC2015E103/data/gold-eng-mentions.tab'
    dst_file = '/home/dhl/data/EDL/LDC2015E75/data/ttl-dict.txt'  # not written yet
    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        if m.entity_type == 'TTL':
            print m.name, m.entity_type, m.mention_type, m.docid
def _decode(mention_json):
    """
    Decode a json string of a sentence.

    e.g., {"senid":40,
           "mentions":[{"start":0,"end":2,"labels":["/person"]},
                       {"start":6,"end":8,"labels":["/location/city","/location"]}],
           "tokens":["Raymond","Jung",",","51",",","of","Federal","Way",";",
                     "accused","of","leasing","apartments","where","the","women",
                     "were","housed","."],
           "fileid":""}

    :param mention_json: string
    :return: a Sentence instance with all mentions appearing in this sentence
    """
    if mention_json == '':
        return None
    decoded = json.loads(mention_json)
    sentence = Sentence(decoded['fileid'], decoded['senid'], decoded['tokens'])
    for m in decoded['mentions']:
        sentence.add_mention(Mention(int(m['start']), int(m['end']), m['labels'],
                                     " ".join(decoded['tokens'][m['start']:m['end']])))
    if 'pos' in decoded:
        sentence.pos = decoded['pos']
    if 'dep' in decoded:
        for dep in decoded['dep']:
            sentence.dep.append((dep['type'], dep['gov'], dep['dep']))
    return sentence
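def _decode_example():
    # Hedged usage sketch for _decode, built from the JSON shape documented in
    # its docstring; the helper name is illustrative.
    mention_json = json.dumps({
        'senid': 40, 'fileid': '',
        'tokens': ['Raymond', 'Jung', ',', '51'],
        'mentions': [{'start': 0, 'end': 2, 'labels': ['/person']}],
    })
    # returns a Sentence with one Mention covering "Raymond Jung"
    return _decode(mention_json)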
def from_coref(cls, coref):
    check_type(coref, document.Coreference)
    mentions = [Mention.from_mention(mention) for mention in coref.mentions]
    return cls(mentions)
def __type_eval():
    tac_edl_file = 'e:/el/LDC2015E75/data/tac_kbp_2015_tedl_training_gold_fixed.tab'
    mid_type_file = 'e:/el/res/freebase/mid-entity-type.txt'

    mid_type_dict = dict()
    f = open(mid_type_file, 'r')
    for line in f:
        vals = line[:-1].split('\t')
        mid_type_dict[vals[0]] = vals[1]
    f.close()

    hitcnt, cnt = 0, 0
    mentions = Mention.load_edl_file(tac_edl_file)
    for m in mentions:
        if not m.mid.startswith('m.'):
            continue
        cnt += 1
        sys_type = mid_type_dict.get(m.mid[2:], 'ORG')
        if sys_type == m.entity_type:
            hitcnt += 1
        else:
            print m.mid, m.entity_type, sys_type
    print hitcnt, cnt
    print float(hitcnt) / cnt
def __validate_mentions(doc_list_file, mentions, dst_miss_match_file):
    print 'checking for position mismatches'
    doc_mentions = Mention.arrange_mentions_by_docid(mentions)
    doc_paths = load_doc_paths(doc_list_file)
    doc_head = '<?xml version="1.0" encoding="utf-8"?>\n'
    miss_match_cnt = 0
    fout = open(dst_miss_match_file, 'wb')
    for doc_path in doc_paths:
        docid = doc_id_from_path(doc_path)
        cur_doc_mentions = doc_mentions.get(docid, list())
        if not cur_doc_mentions:
            continue
        doc_text = read_text(doc_path, True)
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]
        for m in cur_doc_mentions:
            name_in_doc = doc_text[m.beg_pos:m.end_pos + 1]
            if name_in_doc != m.name:
                miss_match_cnt += 1
                fout.write('%s\t%s\t%d\t%d\t%s\n' % (docid, m.name.encode('utf-8'),
                                                     m.beg_pos, m.end_pos,
                                                     name_in_doc.encode('utf-8')))
    fout.close()
    print miss_match_cnt, 'mismatches'
def collect_mentions(self):
    mention_ids = defaultdict(list)

    def get_start_ids(cr):
        # ids of mentions that open at this token, e.g. '(42|(7' -> [42, 7]
        return [int(x.replace(')', '').replace('(', ''))
                for x in cr.split('|') if x.startswith('(')]

    def get_end_ids(cr):
        # ids of mentions that close at this token, e.g. '7)' -> [7]
        return [int(x.replace(')', '').replace('(', ''))
                for x in cr.split('|') if x.endswith(')')]

    starts = [(i, t) for (i, t) in enumerate(self.tokens) if t.coref.find('(') > -1]
    starts.reverse()
    ends = [(i, t) for (i, t) in enumerate(self.tokens) if t.coref.find(')') > -1]
    for s in starts:
        ids = get_start_ids(s[1].coref)
        for i in ids:
            mention_ids[i].append(s)
    for e in ends:
        ids = get_end_ids(e[1].coref)
        for i in ids:
            s = mention_ids[i].pop()
            self.mentions.append(
                Mention(self.tokens[s[0]:e[0] + 1], self.sentenceID, (s[0], e[0]), i))
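# The coref column handled above follows CoNLL-2012-style bracket notation
# (the cell values here are illustrative):
#   '(42'    -> mention 42 opens at this token
#   '7)'     -> mention 7 closes at this token
#   '(7)'    -> a single-token mention 7
#   '(42|(7' -> mentions 42 and 7 both open here
# so get_start_ids('(42|(7') yields [42, 7] and get_end_ids('7)') yields [7].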
def __check_mention_fb_types():
    tac_edl_file = 'e:/el/LDC2015E75/data/tac_kbp_2015_tedl_training_gold_fixed.tab'
    fb_type_file = 'e:/el/res/freebase/mid-fb-type.gz'
    result_file = 'e:/el/LDC2015E75/data/mention-fb-types.txt'

    mentions = Mention.load_edl_file(tac_edl_file)
    mid_mentions = dict()
    for m in mentions:
        if m.mid.startswith('NIL'):
            continue
        mid_mentions[m.mid[2:]] = m

    f = gzip.open(fb_type_file, 'r')
    fout = open(result_file, 'wb')
    for i, line in enumerate(f):
        vals = line[:-1].split('\t')
        m = mid_mentions.get(vals[0], None)
        if m:
            fout.write('%s\t%s\t%s\t%s\n' % (m.name.encode('utf-8'), vals[0],
                                             m.entity_type, vals[1]))
        if (i + 1) % 1000000 == 0:
            print i + 1
    f.close()
    fout.close()
def __gold_mention_insight():
    edl_gold_file = 'e:/el/LDC2015E103/data/tac_kbp_2015_tedl_evaluation_gold_standard_entity_mentions.tab'
    mentions = Mention.load_edl_file(edl_gold_file)
    doc_mention_dict = dict()
    for m in mentions:
        if m.docid.startswith('ENG'):
            mlist = doc_mention_dict.get(m.docid, list())
            if not mlist:
                doc_mention_dict[m.docid] = mlist
            mlist.append(m)

    cnt, fncnt = 0, 0
    for docid, doc_mentions in doc_mention_dict.iteritems():
        print docid
        for m0 in doc_mentions:
            if m0.entity_type == 'PER' and ' ' in m0.name:
                fncnt += 1
            for m1 in doc_mentions:
                if m0 == m1:
                    continue
                if m0.beg_pos <= m1.beg_pos and m0.end_pos >= m1.end_pos and m0.entity_type == 'PER':
                    print '\t%s\t%d\t%d' % (m0.name, m0.beg_pos, m0.end_pos)
                    print '\t%s\t%d\t%d' % (m1.name, m1.beg_pos, m1.end_pos)
                    cnt += 1
    print cnt, fncnt
def __evaluate_ed(gold_edl_file, sys_edl_file, fn_file, fp_file, require_type_match=True):
    gold_mentions = Mention.load_edl_file(gold_edl_file, arrange_by_docid=True)
    sys_mentions = Mention.load_edl_file(sys_edl_file, arrange_by_docid=True)

    fout_fp = open(fp_file, 'wb')
    sys_cnt, gold_cnt, hit_cnt = 0, 0, 0
    fn_mentions = list()
    for docid, sys_mentions_doc in sys_mentions.iteritems():
        sys_cnt += len(sys_mentions_doc)
        all_gold_mentions_in_doc = gold_mentions.get(docid, list())
        # nam_gold_mentions = [m for m in all_gold_mentions_in_doc if m.mention_type == 'NAM']
        nam_gold_mentions = all_gold_mentions_in_doc
        gold_hit_tags = [False] * len(nam_gold_mentions)
        gold_cnt += len(nam_gold_mentions)
        for sm in sys_mentions_doc:
            hit = False
            for i, gm in enumerate(nam_gold_mentions):
                type_hit = sm.entity_type.startswith(gm.entity_type) if require_type_match else True
                if sm.beg_pos == gm.beg_pos and sm.end_pos == gm.end_pos and type_hit:
                    hit = True
                    hit_cnt += 1
                    gold_hit_tags[i] = True
                    break
            if not hit:
                fout_fp.write('%s\t%s\t%d\t%d\n' % (sm.name.encode('utf-8'), docid,
                                                    sm.beg_pos, sm.end_pos))
        for gm, hit in izip(nam_gold_mentions, gold_hit_tags):
            if not hit:
                fn_mentions.append(gm)
    fout_fp.close()

    fn_mentions.sort(key=lambda x: x.name)
    Mention.write_mentions(fn_mentions, fn_file)

    print '#hit: %d, #sys: %d, #gold: %d' % (hit_cnt, sys_cnt, gold_cnt)
    hit_cnt = float(hit_cnt)
    prec = hit_cnt / sys_cnt
    recall = hit_cnt / gold_cnt
    f1 = 2 * prec * recall / (prec + recall)
    print 'prec: %f, recall: %f, f1: %f' % (prec, recall, f1)
def link():
    print 'beg init'
    med_link = init_model()
    curtext = '“That\'s a growth rate of 6,000 times over three years,” touts Turner.'
    m = Mention(span=(0, 4), mtype='PER')
    mentions = [m]
    lr = med_link.link_mentions(mentions, curtext)
    print __mentions_to_dict_list(lr)
def end(self, tag):
    self.tag = ''
    if tag == 'sentences':
        if self.parse_sent:
            self.parse_sent = False
    elif tag == 'sentence':
        if self.parse_sent:
            if self.sent is not None:
                self.sents.append(deepcopy(self.sent))
                self.sent = None
    elif tag == 'token':
        # map CoreNLP ner tags to coarse-grained ner tags
        token = Token(self.word, self.lemma, self.pos, ner=convert_corenlp_ner_tag(self.ner))
        self.sent.add_token(deepcopy(token))
        self.word = ''
        self.lemma = ''
        self.pos = ''
        self.ner = ''
    elif tag == 'dependencies':
        if self.parse_dep:
            self.parse_dep = False
    elif tag == 'dep':
        if self.parse_dep:
            if not self.copied_dep:
                if self.dep_label != 'root':
                    dep = Dependency(self.dep_label, self.gov_idx, self.dep_idx, self.extra)
                    self.sent.add_dep(deepcopy(dep))
            else:
                self.copied_dep = False
            self.dep_label = ''
            self.gov_idx = -1
            self.dep_idx = -1
            self.extra = False
    elif tag == 'coreference':
        # CoreNLP XML nests chain-level <coreference> elements inside a
        # section-level <coreference>; a chain end has self.coref set,
        # the section end does not.
        if self.parse_coref:
            if self.coref is not None:
                self.corefs.append(deepcopy(self.coref))
                self.coref = None
            else:
                self.parse_coref = False
    elif tag == 'mention':
        mention = Mention(self.sent_idx, self.start_token_idx, self.end_token_idx,
                          head_token_idx=self.head_token_idx, rep=self.rep,
                          text=self.text.encode('ascii', 'ignore'))
        self.coref.add_mention(deepcopy(mention))
        self.sent_idx = -1
        self.start_token_idx = -1
        self.end_token_idx = -1
        self.head_token_idx = -1
        self.rep = False
        self.text = ''
def get_cand_mentions(corpus, limit=5, check=False):
    """
    :param corpus: 1D: n_doc, 2D: n_sents, 3D: n_words; elem=(doc_id, part_id, word, tag, syn, ne, coref_id)
    :return: cand: 1D: n_doc, 2D: n_sents, 3D: n_mentions; elem=Mention
    """
    cand_ments = []
    count = 0.
    max_span_len = -1
    total_span_len = 0.

    for doc_i, doc in enumerate(corpus):
        doc_ments = []
        for sent_i, sent in enumerate(doc):
            mention_spans = []

            """ Extracting NP, Pro-Nom, NE mentions """
            mention_spans.extend(get_np(sent))
            mention_spans.extend(get_pronominals(sent))
            mention_spans.extend(get_ne(sent))

            """ Removing duplicates, and sorting """
            mention_spans = list(set(mention_spans))
            mention_spans.sort()

            tmp_ments = []
            for span in mention_spans:
                span_len = span[1] - span[0] + 1
                if span_len <= limit:
                    tmp_ments.append(Mention(doc_i, sent_i, span))
                    if span_len > max_span_len:
                        max_span_len = span_len
                    total_span_len += span_len
            doc_ments.append(tmp_ments)
            count += len(tmp_ments)
        cand_ments.append(doc_ments)

    print 'Cand Mentions: %d Max Span Length: %d Avg. Span Length: %f' % (
        count, max_span_len, total_span_len / count)

    if check:
        with open('cand_mentions.txt', 'w') as f:
            for doc, doc_ments in zip(corpus, cand_ments):
                for sent, sent_ments in zip(doc, doc_ments):
                    for ment in sent_ments:
                        print >> f, '%s' % str(ment.span)
                    print >> f
                    for sent_i, w in enumerate(sent):
                        print >> f, '%d\t%s\t%s' % (sent_i, w[2].encode('utf-8'),
                                                    w[-1].encode('utf-8'))
                    print >> f

    return cand_ments
def __compare_mentions():
    datadir = 'e:/data/edl'
    edl_file0 = '%s/LDC2016E63/output/ner-mentions-0.tab' % datadir
    edl_file1 = '%s/LDC2016E63/output/ner-mentions-1.tab' % datadir
    mentions0 = Mention.load_edl_file(edl_file0, True)
    mentions1 = Mention.load_edl_file(edl_file1, True)
    for docid, doc_mentions1 in mentions1.iteritems():
        print docid
        doc_mentions0 = mentions0.get(docid, list())
        for m1 in doc_mentions1:
            found = False
            for m0 in doc_mentions0:
                if m0.beg_pos == m1.beg_pos and m0.end_pos == m1.end_pos:
                    found = True
                    break
            if not found:
                print '\t%s\t%d\t%d' % (m1.name, m1.beg_pos, m1.end_pos)
def __build_training_data(qid_x_list, edl_file):
    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    train_x = list()
    train_y = list()
    for tup in qid_x_list:
        qid, kbid, first_candidate, commonness, dist = tup
        m = qid_mentions[qid]
        if (not m.kbid.startswith('NIL')) and m.kbid != kbid:
            continue
        y = 0 if m.kbid.startswith('NIL') else 1
        # train_x.append([first_candidate, commonness, dist])
        train_x.append([first_candidate, commonness])
        # train_x.append([first_candidate])
        train_y.append(y)
    return train_x, train_y
def read_mentions(self):
    if not os.path.isfile(self.mentions_path):
        return
    with open(self.mentions_path) as f:
        for line in f:
            m = Mention.from_string(line)
            if not m.private:
                self.mentions[m.target].append(m)
            else:
                self.private_mentions[m.target].append(m)
def __gen_training_data(edl_file):
    mentions = Mention.load_edl_file(edl_file)
    nil_mentions = __get_nil_mentions(mentions)
    kbid_mentions = Mention.group_mentions_by_kbid(nil_mentions)
    pos_samples = __gen_positive_samples(kbid_mentions)
    neg_samples = __gen_neg_samples(kbid_mentions, len(pos_samples))

    data_x = list()
    data_y = list()
    all_samples = __merge_samples(pos_samples, neg_samples)
    for sample, y in all_samples:
        sample_x = __get_features(sample)
        data_x.append(sample_x)
        data_y.append(y)
    for x, y in izip(data_x, data_y):
        print x, y
    return data_x, data_y
def __get_mid_types_in_dataset():
    datadir = 'e:/data/edl'
    edl_file = os.path.join(datadir, 'LDC2015E75/data/gold-eng-mentions.tab')
    mid_types_file = os.path.join(datadir, 'res/freebase/mid-fb-type.gz')
    dst_file = os.path.join(datadir, 'LDC2015E75/output/fb-types.txt')

    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        if m.kbid.startswith('m.'):
            m.kbid = m.kbid[2:]
    kbid_mentions = Mention.group_mentions_by_kbid(mentions)

    f = gzip.open(mid_types_file, 'r')
    fout = open(dst_file, 'wb')
    hit = False
    prev_kbid = ''
    for i, line in enumerate(f):
        tab_pos = line.find('\t')
        kbid = line[:tab_pos]
        if hit and prev_kbid == kbid:
            fout.write('\t%s' % line)
        elif prev_kbid != kbid:
            if kbid in kbid_mentions:
                cur_mentions = kbid_mentions[kbid]
                for m in cur_mentions:
                    fout.write('%s\t' % m.name.encode('utf-8'))
                fout.write('\n')
                for m in cur_mentions:
                    fout.write('%s\t' % m.entity_type)
                fout.write('\n\t%s' % line)
                hit = True
            else:
                hit = False
        prev_kbid = kbid
        if (i + 1) % 10000000 == 0:
            print i + 1
    f.close()
    fout.close()
def link_mentions_info(self, text, mention_detection_result, find_mesh_mentions_by_dict=False):
    # always keep the detected mentions; add dictionary-matched MeSH mentions
    # only when requested
    merged_mention_list = list()
    Mention.merge_mention_list(mention_detection_result, merged_mention_list)
    if find_mesh_mentions_by_dict:
        mesh_mention_list = self.__find_mesh_mentions(text)
        Mention.merge_mention_list(mesh_mention_list, merged_mention_list)
    linked_mentions = self.link_mentions(merged_mention_list, text)
    if find_mesh_mentions_by_dict:
        for mention in merged_mention_list:
            if mention.mesh_id or mention.chebi_id > -1:
                for mention1 in merged_mention_list:
                    if mention.name.lower() == mention1.name.lower():
                        mention1.mesh_id = mention.mesh_id
                        mention1.chebi_id = mention.chebi_id

    mesh_idx_dict, wiki_idx_dict, chebi_idx_dict, idx_list = MedLink.__asign_indices(
        linked_mentions)
    result_dict = dict()
    result_dict['entities'] = entities_dict = dict()
    self.__add_wiki_mention_info(wiki_idx_dict, entities_dict)
    self.__add_mesh_mention_info(mesh_idx_dict, entities_dict)
    self.__add_chebi_mention_info(chebi_idx_dict, entities_dict)

    result_span_list = list()
    mention_type_list = list()
    for mention in linked_mentions:
        result_span_list.append(mention.span)
        mention_type_list.append(mention.mtype)
    self.__fix_types(mesh_idx_dict, idx_list, mention_type_list)
    result_dict['spans'] = result_span_list
    result_dict['idx'] = idx_list
    result_dict['type'] = mention_type_list
    return json.dumps(result_dict, indent=2)
def get_gold_ments(doc_i, sent_i, sent):
    """
    :param sent: 1D: n_words; elem=(doc_id, part_id, word, tag, syn, ne, coref)
    :return: ments: 1D: n_mentions: elem=Mention
    """
    ments = []
    prev = []
    for i, w in enumerate(sent):
        mentions = w[6].split('|')
        for mention in mentions:
            if mention.startswith('('):
                if mention.endswith(')'):
                    span = (i, i)
                    coref_id = int(mention[1:-1])
                    ments.append(Mention(doc_i, sent_i, span, coref_id))
                else:
                    coref_id = int(mention[1:])
                    prev.append(((i, i), coref_id))
            else:
                if mention.endswith(')'):
                    coref_id = int(mention[:-1])
                    for j, p in enumerate(prev):
                        if coref_id == p[1]:
                            span = (p[0][0], i)
                            ments.append(Mention(doc_i, sent_i, span, coref_id))
                            prev.pop(j)
                            break
                    else:
                        print 'Error at get_gold_ments(): %s' % str(sent)
                        exit()
    assert len(prev) == 0
    return ments
def __missing_docs_in_edl_file():
    datadir = 'e:/data/edl'
    edl_file = '%s/LDC2016E63/output/all-mentions.tab' % datadir
    doc_list_file = '%s/LDC2016E63/data/eng-docs-list-win.txt' % datadir

    mentions = Mention.load_edl_file(edl_file)
    docids = set()
    for m in mentions:
        docids.add(m.docid)

    f = open(doc_list_file, 'r')
    for line in f:
        doc_path = line.rstrip()
        docid = doc_id_from_path(doc_path)
        if docid not in docids:
            print docid
    f.close()
def get_review_for_user(username, user_rev_idx):
    if user_rev_idx < 1:
        user_rev_idx = 1
    received = __query_review_dispatcher(username, user_rev_idx)
    rev_idx = received['review_idx']
    rev_id = received['review_id']
    if rev_id == 'NULL':
        return None
    res = es.get(index=index_name, doc_type=rev_doc_type, id=rev_id)
    mention_dicts = received['mentions']
    mentions = [Mention.from_dict(mdict) for mdict in mention_dicts]
    mentions.sort(key=lambda m: m.begpos)
    return rev_idx, res['_source'], mentions
def edl_api():
    doc_text = ''
    if 'text' in request.values:
        doc_text = request.values['text']
    else:
        abort(400)

    json_result = '[]'
    try:
        mentions_list = list()
        mentions_dict = mention_extraction_web(doc_text)
        for result_type, mentions in mentions_dict.items():
            entity_type = 'MISC'
            if result_type == 'results_Disease':
                entity_type = 'Disease'
            elif result_type == 'results_Chemical':
                entity_type = 'Chemical'
            for dict_mention in mentions:
                beg_pos = dict_mention['startChar']
                end_pos = dict_mention['endChar']
                meshid = None
                specified_type = dict_mention.get('label', None)
                if specified_type:
                    entity_type = specified_type
                m = Mention(span=(beg_pos, end_pos), mtype=entity_type, mesh_id=meshid)
                mentions_list.append(m)
        # linked_mentions = med_link.link_mentions(mentions_list, doc_text.decode('utf-8'))
        linked_mentions = med_link.link_mentions(mentions_list, doc_text)
        json_result = json.dumps(__mentions_to_dict_list(linked_mentions))
    except Exception:
        print 'except'
    print json_result + '\n'
    return json_result
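# Hedged usage sketch: assuming this Flask handler is routed at /edl (the
# route decorator and port are not shown above and are assumptions):
#   curl -d 'text=Aspirin is used to treat pain.' http://localhost:5000/edl
# which returns a JSON list of the linked mentions.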
def __merge_mentions(mention_file_list, dst_result_file):
    mention_spans_docs = dict()
    fout = open(dst_result_file, 'wb')
    mention_id = 1
    for mention_file in mention_file_list:
        mentions = Mention.load_edl_file(mention_file)
        for m in mentions:
            mention_span = (m.beg_pos, m.end_pos)
            mention_spans = mention_spans_docs.get(m.docid, set())
            if not mention_spans:
                mention_spans_docs[m.docid] = mention_spans
            if mention_span in mention_spans:
                continue
            mention_spans.add(mention_span)
            m.mention_id = 'EDL_%07d' % mention_id
            # if m.entity_type.startswith('PER'):
            #     m.entity_type = 'PER'
            m.to_edl_file(fout)
            mention_id += 1
    fout.close()
def all_to_all(edl_file, dst_edl_file):
    mentions = Mention.load_edl_file(edl_file)
    __assgin_different_id_to_all_nils(mentions)
    Mention.save_as_edl_file(mentions, dst_edl_file)
def from_text(cls, text):
    mentions = [Mention.from_text(mention_text.strip())
                for mention_text in text.split(' :: ')]
    return cls(mentions)
def produce_mention(self, serif_doc, serif_mention):
    mention = Mention(serif_mention.entity_type, serif_mention.mention_type.value,
                      serif_mention.text, serif_mention.head.text, serif_doc.docid,
                      serif_mention.syn_node.start_char, serif_mention.syn_node.end_char,
                      serif_mention.head.start_char, serif_mention.head.end_char,
                      serif_mention.sent_no)
    return mention
def __name_expansion(edl_mentions_file, doc_ner_file, tokenized_text_file,
                     entity_candidates_dict_file, dst_file):
    mentions = Mention.load_edl_file(edl_mentions_file)
    __expand_name_with_ner_result(mentions, doc_ner_file)
    # __expand_location_names(mentions, tokenized_text_file, entity_candidates_dict_file)
    Mention.save_as_edl_file(mentions, dst_file)
def __el_stat():
    data_file = 'e:/data/emadr/el/tac/2009/eval/el-2009-eval-expansion-nloc-3.bin'
    gold_file = 'e:/data/el/LDC2015E19/data/2009/eval/data/mentions-raw.tab'
    # data_file = 'e:/data/emadr/el/tac/2011/eval/el-2011-eval-expansion-all-3.bin'
    # gold_file = 'e:/data/el/LDC2015E19/data/2011/eval/data/mentions-expansion-all.tab'
    # data_file = 'e:/data/emadr/el/tac/2014/eval/el-2014-eval-raw-%d.bin' % 3
    # gold_file = 'e:/data/el/LDC2015E20/data/eval/data/mentions-raw.tab'
    eid_wid_file = 'e:/data/el/res/eid_wid_ord_eid.txt'

    keep_nil = True
    only_show_not_in_candidate = False

    eid_wid_dict = load_eid_wid_file(eid_wid_file)
    mentions = Mention.load_edl_file(gold_file)
    qid_mention_dict = Mention.group_mentions_by_qid(mentions)
    docs_info, dim = load_docs_info(data_file)

    error_list = list()
    num_mentions, nil_mentions = 0, 0
    nil_hit_cnt, id_hit_cnt = 0, 0
    for doc in docs_info:
        docid, docvec, mentions = doc
        for mention in mentions:
            (qid, kbids, commonnesses, vecs) = mention
            gold_mention = qid_mention_dict[qid]
            gold_id = gold_mention.kbid
            gold_id_is_nil = gold_id.startswith('NIL')
            if gold_id_is_nil:
                nil_mentions += 1
            if not keep_nil and gold_id_is_nil:
                continue
            num_mentions += 1

            indices, legal_kbids = __get_legal_kbids(kbids, keep_nil)
            if gold_id_is_nil and (len(legal_kbids) == 0 or legal_kbids[0].startswith('m.')):
                nil_hit_cnt += 1
                continue
            first_kbid = legal_kbids[0] if legal_kbids else 'NIL'
            if first_kbid == gold_id:
                id_hit_cnt += 1
                continue
            error_list.append((qid, docid, gold_mention.name, gold_id, legal_kbids))

    error_list.sort(key=lambda x: x[2])
    for e in error_list:
        qid, docid, name, gold_id, legal_kbids = e
        gold_wid = eid_wid_dict.get(gold_id, -1)
        in_candidates = gold_id in legal_kbids
        if only_show_not_in_candidate and in_candidates:
            continue
        print '%s\t%s\t%s\t%s_%d' % (qid, docid, name, gold_id, gold_wid)

    print id_hit_cnt, num_mentions
    print 'INKB: %f' % (float(id_hit_cnt) / (num_mentions - nil_mentions))
    print 'TOTAL: %f' % (float(id_hit_cnt + nil_hit_cnt) / num_mentions)