def __list_errors(): gold_edl_file = 'e:/data/el/LDC2015E20/data/eval/data/mentions-raw.tab' sys_edl_file = 'e:/data/el/LDC2015E20/data/eval/output/emadr-result-coref.tab' eid_wid_file = 'e:/data/el/res/eid_wid_ord_eid.txt' eid_wid_dict = load_eid_wid_file(eid_wid_file) gold_mentions = Mention.load_edl_file(gold_edl_file) gold_qid_mentions = Mention.group_mentions_by_qid(gold_mentions) sys_mentions = Mention.load_edl_file(sys_edl_file) sys_qid_mentions = Mention.group_mentions_by_qid(sys_mentions) for qid, mention in gold_qid_mentions.iteritems(): sys_mention = sys_qid_mentions[qid] if sys_mention.kbid == mention.kbid: continue if sys_mention.kbid.startswith('NIL') and mention.kbid.startswith( 'NIL'): continue if mention.kbid.startswith('NIL'): continue wid_gold = eid_wid_dict.get(mention.kbid, -1) wid_sys = eid_wid_dict.get(sys_mention.kbid, -1) print '%s\t%s\t%s\t%s\t%d\t%d\t%s' % ( qid, mention.kbid, sys_mention.kbid, mention.docid, mention.beg_pos, mention.end_pos, mention.name) print wid_gold, wid_sys
def __expand_location_names(mentions, tokenized_text_file, entity_candidates_dict_file): doc_mentions_dict = Mention.group_mentions_by_docid(mentions) expansion_candidates = [] f = open(tokenized_text_file, 'r') for line in f: vals = line.strip().split('\t') docid = vals[0] # print docid num_lines = int(vals[1]) doc_mentions = doc_mentions_dict[docid] # print len(mentions) for i in xrange(num_lines): line = f.next().decode('utf-8') words = line.strip().split(' ') expansion_candidates += __find_expansion_candidates_in_location_mentions( doc_mentions, words) # break # break f.close() expansion_dict = __filter_expansion_candidates( expansion_candidates, entity_candidates_dict_file) qid_mentions = Mention.group_mentions_by_qid(mentions) for qid, mention in qid_mentions.iteritems(): exp_name = expansion_dict.get(qid, '') if not exp_name: continue print '%s\t%s\t%s' % (qid, mention.name, exp_name) mention.name = exp_name
def __apply_coref(edl_file, linking_info_file, dst_edl_file):
    """Resolve NIL mentions via coreference and write a new EDL file.

    Reads coreference links from the binary linking-info file, then for
    every NIL mention in ``edl_file`` that corefers with another mention,
    copies that mention's kbid onto it. Result is saved to
    ``dst_edl_file``.
    """
    # qid of a mention -> qid of the mention it corefers with.
    coref_dict = dict()
    f = open(linking_info_file, 'rb')
    # Binary record per document: length-prefixed docid, big-endian int32
    # mention count, then per-mention byte flags, big-endian int32 coref
    # indices, and finally the mention records themselves. The read order
    # below must match the writer exactly.
    while True:
        docid = ioutils.read_str_with_byte_len(f)
        if not docid:
            # End of file: the docid read comes back empty/falsy.
            break
        num_mentions = np.fromfile(f, '>i4', 1)
        # Read but unused here; presumably marks nested mentions — must
        # still be consumed to keep the stream aligned.
        is_nested = np.fromfile(f, 'b', num_mentions)
        corefs = np.fromfile(f, '>i4', num_mentions)
        qids = list()
        for i in xrange(num_mentions):
            qid = __read_mention_from_linking_info_file(f)
            qids.append(qid)
        # coref index > 0 points at the position (within this document) of
        # the mention it corefers with; 0/negative means no coreference.
        for coref_id, qid in izip(corefs, qids):
            if coref_id > 0:
                coref_dict[qid] = qids[coref_id]
    f.close()

    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    __assgin_different_id_to_all_nils(mentions)
    # NOTE(review): leftover debug print for one hard-coded query id.
    print qid_mentions['EDL14_ENG_0052'].kbid
    for m in mentions:
        # Only NIL mentions get rewritten; linked mentions keep their kbid.
        if not m.kbid.startswith('NIL'):
            continue
        coref_qid = coref_dict.get(m.mention_id, '')
        if coref_qid:
            print m.mention_id, coref_qid, m.name, qid_mentions[coref_qid].kbid
            # Inherit the kbid of the coreferent mention (which may itself
            # be a NIL id assigned above).
            m.kbid = qid_mentions[coref_qid].kbid
    Mention.save_as_edl_file(mentions, dst_edl_file)
def __save_link_result(edl_file, result_triv, qids, kbids_list, y_pred, max_scores, dst_file, use_nil_thres): mentions = Mention.load_edl_file(edl_file) for m in mentions: m.kbid = 'NODEF' qid_mentions = Mention.group_mentions_by_qid(mentions) for qid, kbid in result_triv.iteritems(): qid_mentions[qid].kbid = kbid # print qid, kbid for qid, kbids, y, max_score in izip(qids, kbids_list, y_pred, max_scores): if y >= len(kbids): print y, len(kbids) if qid_mentions[qid].kbid == 'NODEF': if use_nil_thres and max_score < 0.5: qid_mentions[qid].kbid = 'NIL' else: qid_mentions[qid].kbid = kbids[y] # print qid, kbids[y] for m in mentions: if m.kbid.startswith('m.') or m.kbid.startswith('NIL'): m.kbid = 'NIL0001' Mention.save_as_edl_file(mentions, dst_file)
def __build_training_data(qid_x_list, edl_file):
    """Build (features, labels) for the NIL-vs-link classifier.

    ``qid_x_list`` holds (qid, kbid, first_candidate, commonness, dist)
    tuples; the gold kbid for each qid comes from ``edl_file``. A tuple
    contributes a sample only when its kbid matches the gold kbid or the
    gold kbid is NIL; the label is 1 for a linked gold mention, 0 for NIL.

    Returns (train_x, train_y) where each feature row is
    [first_candidate, commonness].
    """
    gold_by_qid = Mention.group_mentions_by_qid(
        Mention.load_edl_file(edl_file))

    train_x, train_y = [], []
    # NOTE: dist is currently excluded from the feature vector; earlier
    # experiments used [first_candidate, commonness, dist].
    for qid, kbid, first_candidate, commonness, dist in qid_x_list:
        gold_kbid = gold_by_qid[qid].kbid
        gold_is_nil = gold_kbid.startswith('NIL')
        # Skip candidates for linked mentions whose kbid is wrong.
        if not gold_is_nil and gold_kbid != kbid:
            continue
        train_x.append([first_candidate, commonness])
        train_y.append(0 if gold_is_nil else 1)
    return train_x, train_y
def __el_stat():
    """Score entity-linking output against gold and list the errors.

    Loads gold mentions and the binary candidate/vector file, counts
    in-KB hits (top legal candidate equals the gold kbid) and NIL hits
    (gold is NIL and no real candidate was produced), prints one line per
    error sorted by mention name, then prints INKB and TOTAL accuracy.
    """
    data_file = 'e:/data/emadr/el/tac/2009/eval/el-2009-eval-expansion-nloc-3.bin'
    gold_file = 'e:/data/el/LDC2015E19/data/2009/eval/data/mentions-raw.tab'
    # Alternative eval sets (2011 / 2014), kept for quick switching:
    # data_file = 'e:/data/emadr/el/tac/2011/eval/el-2011-eval-expansion-all-3.bin'
    # gold_file = 'e:/data/el/LDC2015E19/data/2011/eval/data/mentions-expansion-all.tab'
    # data_file = 'e:/data/emadr/el/tac/2014/eval/el-2014-eval-raw-%d.bin' % 3
    # gold_file = 'e:/data/el/LDC2015E20/data/eval/data/mentions-raw.tab'
    eid_wid_file = 'e:/data/el/res/eid_wid_ord_eid.txt'
    # keep_nil: include gold-NIL mentions in the evaluation.
    keep_nil = True
    # only_show_not_in_candidate: restrict the error listing to mentions
    # whose gold id is absent from the candidate list.
    only_show_not_in_candidate = False

    eid_wid_dict = load_eid_wid_file(eid_wid_file)
    # gold_el_result = load_gold_el(gold_file)
    mentions = Mention.load_edl_file(gold_file)
    qid_mention_dict = Mention.group_mentions_by_qid(mentions)
    docs_info, dim = load_docs_info(data_file)

    error_list = list()
    num_mentions, nil_mentions = 0, 0
    nil_hit_cnt, id_hit_cnt = 0, 0
    for doc in docs_info:
        # NOTE: rebinds the outer `mentions` to this doc's mention tuples.
        docid, docvec, mentions = doc
        for mention in mentions:
            (qid, kbids, commonnesses, vecs) = mention
            gold_mention = qid_mention_dict[qid]
            gold_id = gold_mention.kbid
            gold_id_is_nil = gold_id.startswith('NIL')
            if gold_id_is_nil:
                nil_mentions += 1
            if not keep_nil and gold_id_is_nil:
                continue
            num_mentions += 1
            indices, legal_kbids = __get_legal_kbids(kbids, keep_nil)
            # Gold NIL is counted as a hit when there is no usable
            # candidate or the top candidate is a Freebase ('m.') id.
            if gold_id_is_nil and (len(legal_kbids) == 0 or
                                   legal_kbids[0].startswith('m.')):
                nil_hit_cnt += 1
                continue
            # System answer = top-ranked legal candidate, NIL if none.
            first_kbid = legal_kbids[0] if legal_kbids else 'NIL'
            if first_kbid == gold_id:
                id_hit_cnt += 1
                continue
            error_list.append(
                (qid, docid, gold_mention.name, gold_id, legal_kbids))

    # Sort errors by mention surface name for easier eyeballing.
    error_list.sort(key=lambda x: x[2])
    for e in error_list:
        qid, docid, name, gold_id, legal_kbids = e
        gold_wid = eid_wid_dict.get(gold_id, -1)
        in_candidates = gold_id in legal_kbids
        if only_show_not_in_candidate and in_candidates:
            continue
        # if not in_candidates:
        #     print 'not found'
        print '%s\t%s\t%s\t%s_%d' % (qid, docid, name, gold_id, gold_wid)
        # for eid in legal_kbids:
        #     wid = eid_wid_dict.get(eid, -1)
        #     print '\t%s_%d' % (eid, wid),
        # print
    print id_hit_cnt, num_mentions
    # INKB counts only non-NIL gold mentions in the denominator.
    print 'INKB: %f' % (float(id_hit_cnt) / (num_mentions - nil_mentions))
    print 'TOTAL: %f' % (float(id_hit_cnt + nil_hit_cnt) / num_mentions)