def __load_mention_info(fin, vecdim):
    qid = ioutils.read_str_with_byte_len(fin)
    num_candidates = np.fromfile(fin, '>i4', 1)
    eids = [ioutils.read_str_with_byte_len(fin) for _ in xrange(num_candidates)]
    commonnesses = np.fromfile(fin, '>f4', num_candidates)
    vecs = [np.fromfile(fin, '>f4', vecdim) for _ in xrange(num_candidates)]
    return qid, eids, commonnesses, vecs
def __read_mention_data(fin, vec_dim):
    qid = ioutils.read_str_with_byte_len(fin)
    gold_id = ioutils.read_str_with_byte_len(fin)
    num_candidates = np.fromfile(fin, np.int32, 1)
    eids = [ioutils.read_str_with_byte_len(fin) for _ in xrange(num_candidates)]
    commonnesses = np.fromfile(fin, np.float32, num_candidates)
    vecs = [np.fromfile(fin, np.float32, vec_dim) for _ in xrange(num_candidates)]
    return qid, gold_id, eids, commonnesses, vecs
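# The readers above depend on ioutils.read_str_with_byte_len, which is defined
# elsewhere in this repo. As a rough, hedged sketch only: the name suggests a
# length-prefixed string (the prefix could be a single byte or, matching the
# '>i4' fields used elsewhere, a big-endian int32). The hypothetical helper
# below illustrates the second assumption; the real ioutils code may differ.
def _read_str_with_byte_len_sketch(fin):
    # assumed layout: 4-byte big-endian length, then that many bytes of text;
    # returns '' at end of file, which callers use as a stop condition
    length = np.fromfile(fin, '>i4', 1)
    if len(length) == 0:
        return ''
    return fin.read(int(length[0]))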
def __apply_coref(edl_file, linking_info_file, dst_edl_file):
    # Propagate KB ids to NIL mentions through the coreference links stored
    # in the linking info file.
    coref_dict = dict()
    f = open(linking_info_file, 'rb')
    while True:
        docid = ioutils.read_str_with_byte_len(f)
        if not docid:
            break
        num_mentions = np.fromfile(f, '>i4', 1)
        is_nested = np.fromfile(f, 'b', num_mentions)
        corefs = np.fromfile(f, '>i4', num_mentions)
        qids = list()
        for i in xrange(num_mentions):
            qid = __read_mention_from_linking_info_file(f)
            qids.append(qid)
        for coref_id, qid in izip(corefs, qids):
            if coref_id > 0:
                coref_dict[qid] = qids[coref_id]
    f.close()

    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    __assgin_different_id_to_all_nils(mentions)
    print qid_mentions['EDL14_ENG_0052'].kbid  # debug
    for m in mentions:
        if not m.kbid.startswith('NIL'):
            continue
        coref_qid = coref_dict.get(m.mention_id, '')
        if coref_qid:
            print m.mention_id, coref_qid, m.name, qid_mentions[coref_qid].kbid
            m.kbid = qid_mentions[coref_qid].kbid
    Mention.save_as_edl_file(mentions, dst_edl_file)
def __read_mention_from_linking_info_file(fin):
    qid = ioutils.read_str_with_byte_len(fin)
    num_candidates = np.fromfile(fin, '>i4', 1)
    # print num_candidates
    for i in xrange(num_candidates):
        np.fromfile(fin, 'b', 8)
        np.fromfile(fin, '>f4', 1)
        np.fromfile(fin, '>f8', 1)
        np.fromfile(fin, '>f4', 1)
    return qid
def __filter_expansion_candidates(expansion_candidates, entity_candidates_dict_file):
    name_qids_dict = dict()
    for tup in expansion_candidates:
        qids = name_qids_dict.get(tup[1].lower(), list())
        if not qids:
            name_qids_dict[tup[1].lower()] = qids
        if tup[0] not in qids:
            qids.append(tup[0])

    expansion_dict = dict()
    # for tup in expansion_candidates:
    #     expansion_dict[tup[0]] = tup[1]
    # print len(expansion_dict)
    f = open(entity_candidates_dict_file, 'rb')
    num_names, total_num_cands = np.fromfile(f, '>i4', 2)
    print num_names
    for i in xrange(num_names):
        name = ioutils.read_str_with_byte_len(f)
        # print name
        num_cands = np.fromfile(f, '>i2', 1)
        if num_cands == 0:
            continue
        qids = name_qids_dict.get(name, [])
        for qid in qids:
            expansion_dict[qid] = name
        # print num_cands
        for _ in xrange(num_cands):
            ioutils.read_str_with_byte_len(f)
            np.fromfile(f, '>f4', 1)
        if i % 1000000 == 0:
            print i
    f.close()

    for qid, name in expansion_dict.iteritems():
        print qid, name
    print len(expansion_dict)
    return expansion_dict
def __load_mention_info(fin, vec_dim):
    qid = ioutils.read_str_with_byte_len(fin)
    # print qid
    # if qid == '':
    #     print doc_id, j, num_mentions
    # gold_label = 'NIL'
    candidates = list()
    num_candidates = np.fromfile(fin, '>i4', 1)
    for k in xrange(num_candidates):
        mid = ioutils.read_str_with_fixed_len(fin, 8)
        commonness = np.fromfile(fin, '>f4', 1)
        vec = np.fromfile(fin, '>f4', vec_dim)
        candidates.append((mid, commonness, vec))
    return qid, candidates
def __make_labeled_data(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    # Build the labeled training file: document vectors followed, per mention,
    # by the gold id and candidate data.
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    gold_id_dict = load_gold_id_file(gold_label_file)

    vec_dim = 100
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)
    print num_docs, 'documents'
    fout = open(dst_file, 'wb')
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    # tmp_fout = open('e:/data/emadr/el/tmp_result.txt', 'wb')
    total_num_mentions = 0
    candidates_list = list()
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)
        # if i < 5:
        #     print doc_vec
        doc_vec.astype(np.float32).tofile(fout)

        num_mentions = np.fromfile(fin, '>i4', 1)
        np.asarray([num_mentions], np.int32).tofile(fout)
        total_num_mentions += num_mentions
        for j in xrange(num_mentions):
            qid, candidates = __load_mention_info(fin, vec_dim)
            gold_id = gold_id_dict[qid]
            print qid, gold_id
            __write_mention_data(qid, gold_id, candidates, mid_eid_dict, fout)
            for candidate in candidates:
                mid, commonness, vec = candidate
                eid = mid_eid_dict.get(mid, '')
                print '\t%s\t%f\t%s' % (mid, commonness, eid)
    fin.close()
    fout.close()
    # tmp_fout.close()
    print total_num_mentions
def load_docs_info(xdatafile):
    f = open(xdatafile, 'rb')
    num_docs, vecdim = np.fromfile(f, '>i4', 2)
    print '%d documents, vec dimension: %d' % (num_docs, vecdim)
    docs = list()
    for i in xrange(num_docs):
        docid = ioutils.read_str_with_byte_len(f)
        # print docid
        docvec = np.fromfile(f, '>f4', vecdim)
        num_mentions = np.fromfile(f, '>i4', 1)
        # print num_mentions
        mentions = list()
        for j in xrange(num_mentions):
            qid, kbids, commonnesses, vecs = __load_mention_info(f, vecdim)
            # print vecs
            # print qid, kbids
            # print commonnesses
            # print kbids[2]
            mentions.append((qid, kbids, commonnesses, vecs))
        docs.append((docid, docvec, mentions))
        # if i == 5:
        #     break
    f.close()
    return docs, vecdim
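# A minimal usage sketch for load_docs_info. The file path argument and the
# helper name are hypothetical, and picking the highest-commonness candidate
# per mention is only an illustrative baseline, not part of the original
# pipeline.
def _print_top_commonness_candidates(xdatafile):
    docs, vecdim = load_docs_info(xdatafile)
    for docid, docvec, mentions in docs:
        for qid, kbids, commonnesses, vecs in mentions:
            if len(kbids) == 0:
                continue
            # candidate with the largest prior (commonness) score
            top_idx = int(np.argmax(commonnesses))
            print qid, kbids[top_idx], commonnesses[top_idx]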
def add_gold_label(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    label_dict = load_gold_id_file(gold_label_file)

    vec_dim = 100
    # counters: NIL queries, queries whose gold id is not among the candidates,
    # queries whose first non-NIL candidate is the gold id, and NIL queries
    # where every candidate maps to NIL
    nil_cnt = 0
    miss_cnt = 0
    fh_cnt = 0
    nil_hit_cnt = 0
    tmp_fout = open('e:/data/emadr/el/tmp_result.txt', 'wb')
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)
    print num_docs
    fout = open(dst_file, 'wb')
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    candidates_list = list()
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)
        # if i < 5:
        #     print doc_vec
        doc_vec.astype(np.float32).tofile(fout)

        mention_infos = list()
        num_mentions = np.fromfile(fin, '>i4', 1)
        for j in xrange(num_mentions):
            qid = ioutils.read_str_with_byte_len(fin)
            # print qid
            # if qid == '':
            #     print doc_id, j, num_mentions
            # gold_label = 'NIL'
            num_candidates = np.fromfile(fin, '>i4', 1)
            gold_label = label_dict[qid]

            cur_candidates = list()
            cur_candidates_tup = (qid, cur_candidates)
            candidates_list.append(cur_candidates_tup)

            hit_idx = -1
            commonness = list()
            candidate_vecs = list()
            eids = list()
            all_nil = True
            non_nil_cnt = -1
            for k in xrange(num_candidates):
                mid = ioutils.read_str_with_fixed_len(fin, 8)
                eid = mid_eid_dict.get(mid, 'NILL')
                if eid != 'NILL':
                    all_nil = False
                    non_nil_cnt += 1
                cur_candidates.append(eid)
                if k == 0 and eid == 'NILL':
                    tmp_fout.write(qid + '\t' + eid + '\n')
                if eid == gold_label:
                    # hit_idx = k
                    hit_idx = non_nil_cnt
                cur_com = np.fromfile(fin, '>f4', 1)
                # print cur_com
                vec = np.fromfile(fin, '>f4', vec_dim)
                if eid != 'NILL':
                    commonness.append(cur_com)
                    candidate_vecs.append(vec.astype(np.float32))
                    eids.append(eid)
            if hit_idx == -1:
                miss_cnt += 1
            else:
                # mention_infos.append((qid, hit_idx, candidate_vecs, eids))
                mention_infos.append((hit_idx, commonness, candidate_vecs))
                # print commonness
                # print
                if hit_idx == 0:
                    fh_cnt += 1
            if gold_label.startswith('NIL'):
                nil_cnt += 1
                if all_nil:
                    nil_hit_cnt += 1

        # print len(mention_infos)
        np.asarray([len(mention_infos)], np.int32).tofile(fout)
        for mention_info in mention_infos:
            # io_utils.write_str_with_byte_len(mention_info[0], fout)
            np.asarray([len(mention_info[1])], np.int32).tofile(fout)
            np.asarray([mention_info[0]], np.int32).tofile(fout)
            np.asarray(mention_info[1], np.float32).tofile(fout)
            for vec in mention_info[2]:
                vec.tofile(fout)
            # for eid in mention_info[3]:
            #     io_utils.write_str_with_byte_len(eid, fout)
        # break
    fin.close()
    fout.close()
    tmp_fout.close()

    candidates_list.sort(key=lambda x: x[0])
    __print_candidates(candidates_list)

    num_queries = len(label_dict)
    num_non_nil_queries = num_queries - nil_cnt
    print 'nil_cnt\tmiss_cnt\tfh_cnt\tnum_queries\tnum_non_nil_queries'
    print nil_cnt, miss_cnt, fh_cnt, num_queries, num_non_nil_queries, nil_hit_cnt
    print float(fh_cnt) / num_non_nil_queries
    print 1 - float(miss_cnt - nil_cnt) / num_non_nil_queries
    print float(num_queries - miss_cnt + nil_hit_cnt) / num_queries
    print 1 - float(miss_cnt - nil_cnt) / num_queries
def add_gold_label(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    label_dict = load_gold_label_file(gold_label_file)

    vec_dim = 100
    nil_cnt = 0
    miss_cnt = 0
    fh_cnt = 0
    tmp_fout = open('e:/dc/el/tmp_result.txt', 'wb')
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)
    print num_docs
    fout = open(dst_file, 'wb')
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)
        doc_vec.astype(np.float32).tofile(fout)

        mention_infos = list()
        num_mentions = np.fromfile(fin, '>i4', 1)
        for j in xrange(num_mentions):
            qid = ioutils.read_str_with_byte_len(fin)
            # print qid
            # if qid == '':
            #     print doc_id, j, num_mentions
            gold_label = label_dict[qid]
            # gold_label = 'NIL'
            if gold_label.startswith('NIL'):
                nil_cnt += 1

            num_candidates = np.fromfile(fin, '>i4', 1)
            hit_idx = -1
            commonness = list()
            candidate_vecs = list()
            eids = list()
            for k in xrange(num_candidates):
                mid = ioutils.read_str_with_fixed_len(fin, 8)
                eid = mid_eid_dict.get(mid, 'NILL')
                if k == 0 and eid != 'NILL':
                    tmp_fout.write(qid + '\t' + eid + '\n')
                if eid == gold_label:
                    hit_idx = k
                cur_com = np.fromfile(fin, '>f4', 1)
                # print cur_com
                commonness.append(cur_com)
                vec = np.fromfile(fin, '>f4', vec_dim)
                candidate_vecs.append(vec.astype(np.float32))
                eids.append(eid)
            if hit_idx == -1:
                miss_cnt += 1
            else:
                # mention_infos.append((qid, hit_idx, candidate_vecs, eids))
                mention_infos.append((hit_idx, commonness, candidate_vecs))
                if hit_idx == 0:
                    fh_cnt += 1

        # print len(mention_infos)
        np.asarray([len(mention_infos)], np.int32).tofile(fout)
        for mention_info in mention_infos:
            # io_utils.write_str_with_byte_len(mention_info[0], fout)
            np.asarray([len(mention_info[1])], np.int32).tofile(fout)
            np.asarray([mention_info[0]], np.int32).tofile(fout)
            np.asarray(mention_info[1], np.float32).tofile(fout)
            for vec in mention_info[2]:
                vec.tofile(fout)
            # for eid in mention_info[3]:
            #     io_utils.write_str_with_byte_len(eid, fout)
        # break
    fin.close()
    fout.close()
    tmp_fout.close()

    num_queries = len(label_dict)
    num_non_nil_queries = num_queries - nil_cnt
    print 'nil_cnt\tmiss_cnt\tfh_cnt\tnum_queries\tnum_non_nil_queries'
    print nil_cnt, miss_cnt, fh_cnt, num_queries, num_non_nil_queries
    print float(fh_cnt) / num_non_nil_queries
    print 1 - float(miss_cnt - nil_cnt) / num_non_nil_queries
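# A hedged sketch of a reader for the labeled file written by add_gold_label
# above, derived from its tofile calls: an int32 header (num_docs, vec_dim),
# then per document a float32 doc vector and an int32 count of kept mentions,
# and per mention an int32 candidate count, an int32 gold index, a float32
# commonness array and the candidate vectors. Note the output is written in
# native byte order, unlike the big-endian '>i4'/'>f4' input files; the
# function name here is hypothetical.
def _read_labeled_file_sketch(filename):
    f = open(filename, 'rb')
    num_docs, vec_dim = np.fromfile(f, np.int32, 2)
    docs = list()
    for i in xrange(num_docs):
        doc_vec = np.fromfile(f, np.float32, vec_dim)
        num_mentions = np.fromfile(f, np.int32, 1)[0]
        mentions = list()
        for j in xrange(num_mentions):
            num_candidates = np.fromfile(f, np.int32, 1)[0]
            gold_idx = np.fromfile(f, np.int32, 1)[0]
            commonnesses = np.fromfile(f, np.float32, num_candidates)
            vecs = [np.fromfile(f, np.float32, vec_dim) for _ in xrange(num_candidates)]
            mentions.append((gold_idx, commonnesses, vecs))
        docs.append((doc_vec, mentions))
    f.close()
    return docs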