Example #1
0
def __load_mention_info(fin, vecdim):
    """Read one mention record (query id plus its candidates) from ``fin``.

    The fields are consumed in this order: the query id (via
    ``ioutils.read_str_with_byte_len``), a big-endian int32 candidate count,
    one id string per candidate, one big-endian float32 commonness value per
    candidate, and one ``vecdim``-element big-endian float32 vector per
    candidate.

    Args:
        fin: Binary file object positioned at the start of a mention record.
        vecdim: Number of float32 components in each candidate vector.

    Returns:
        Tuple ``(qid, eids, commonnesses, vecs)``: the query id, the list of
        candidate ids, a 1-D array of commonness values and a list of 1-D
        candidate vectors.

    Raises:
        IndexError: If the candidate count cannot be read (e.g. end of file).
    """
    qid = ioutils.read_str_with_byte_len(fin)
    # np.fromfile returns a length-1 array; unwrap it so the count is a plain
    # int when used as a loop bound and as a read size below.
    num_candidates = int(np.fromfile(fin, '>i4', 1)[0])
    eids = [ioutils.read_str_with_byte_len(fin)
            for _ in xrange(num_candidates)]
    commonnesses = np.fromfile(fin, '>f4', num_candidates)
    vecs = [np.fromfile(fin, '>f4', vecdim) for _ in xrange(num_candidates)]
    return qid, eids, commonnesses, vecs
Example #2
0
def __read_mention_data(fin, vec_dim):
    """Read one labeled mention record from ``fin``.

    The fields are consumed in this order: the query id and the gold id (both
    via ``ioutils.read_str_with_byte_len``), a native-endian int32 candidate
    count, one id string per candidate, one float32 commonness value per
    candidate, and one ``vec_dim``-element float32 vector per candidate.

    Args:
        fin: Binary file object positioned at the start of a mention record.
        vec_dim: Number of float32 components in each candidate vector.

    Returns:
        Tuple ``(qid, gold_id, eids, commonnesses, vecs)``.

    Raises:
        IndexError: If the candidate count cannot be read (e.g. end of file).
    """
    qid = ioutils.read_str_with_byte_len(fin)
    gold_id = ioutils.read_str_with_byte_len(fin)
    # np.fromfile returns a length-1 array; unwrap it so the count is a plain
    # int when used as a loop bound and as a read size below.
    num_candidates = int(np.fromfile(fin, np.int32, 1)[0])
    eids = [
        ioutils.read_str_with_byte_len(fin) for _ in xrange(num_candidates)
    ]
    commonnesses = np.fromfile(fin, np.float32, num_candidates)
    vecs = [
        np.fromfile(fin, np.float32, vec_dim) for _ in xrange(num_candidates)
    ]
    return qid, gold_id, eids, commonnesses, vecs
Example #3
0
def __apply_coref(edl_file, linking_info_file, dst_edl_file):
    coref_dict = dict()
    f = open(linking_info_file, 'rb')
    while True:
        docid = ioutils.read_str_with_byte_len(f)
        if not docid:
            break
        num_mentions = np.fromfile(f, '>i4', 1)
        is_nested = np.fromfile(f, 'b', num_mentions)
        corefs = np.fromfile(f, '>i4', num_mentions)
        qids = list()
        for i in xrange(num_mentions):
            qid = __read_mention_from_linking_info_file(f)
            qids.append(qid)
        for coref_id, qid in izip(corefs, qids):
            if coref_id > 0:
                coref_dict[qid] = qids[coref_id]
    f.close()

    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    __assgin_different_id_to_all_nils(mentions)
    print qid_mentions['EDL14_ENG_0052'].kbid
    for m in mentions:
        if not m.kbid.startswith('NIL'):
            continue
        coref_qid = coref_dict.get(m.mention_id, '')
        if coref_qid:
            print m.mention_id, coref_qid, m.name, qid_mentions[coref_qid].kbid
            m.kbid = qid_mentions[coref_qid].kbid

    Mention.save_as_edl_file(mentions, dst_edl_file)
Example #4
0
def __read_mention_from_linking_info_file(fin):
    """Read one mention's query id and skip over its candidate records.

    A mention record is the query id (via ``ioutils.read_str_with_byte_len``),
    a big-endian int32 candidate count, and then for every candidate 8 raw
    bytes, a float32, a float64 and a float32. The candidate data is consumed
    only to advance the stream to the next record.

    Args:
        fin: Binary file object positioned at the start of a mention record.

    Returns:
        The query id of the mention.

    Raises:
        IndexError: If the candidate count cannot be read (e.g. end of file).
    """
    qid = ioutils.read_str_with_byte_len(fin)
    # Unwrap the length-1 array so the count is a plain int.
    num_candidates = int(np.fromfile(fin, '>i4', 1)[0])
    # Each candidate occupies 8 + 4 + 8 + 4 bytes; skip them all in one read
    # rather than issuing four tiny reads per candidate.
    bytes_per_candidate = 8 + 4 + 8 + 4
    np.fromfile(fin, 'b', bytes_per_candidate * num_candidates)
    return qid
Example #5
0
def __filter_expansion_candidates(expansion_candidates,
                                  entity_candidates_dict_file):
    name_qids_dict = dict()
    for tup in expansion_candidates:
        qids = name_qids_dict.get(tup[1].lower(), list())
        if not qids:
            name_qids_dict[tup[1].lower()] = qids
        if tup[0] not in qids:
            qids.append(tup[0])
    expansion_dict = dict()
    # for tup in expansion_candidates:
    #     expansion_dict[tup[0]] = tup[1]
    # print len(expansion_dict)

    f = open(entity_candidates_dict_file, 'rb')
    num_names, total_num_cands = np.fromfile(f, '>i4', 2)
    print num_names
    for i in xrange(num_names):
        name = ioutils.read_str_with_byte_len(f)
        # print name
        num_cands = np.fromfile(f, '>i2', 1)
        if num_cands == 0:
            continue

        qids = name_qids_dict.get(name, [])
        for qid in qids:
            expansion_dict[qid] = name

        # print num_cands
        for _ in xrange(num_cands):
            ioutils.read_str_with_byte_len(f)
            np.fromfile(f, '>f4', 1)

        if i % 1000000 == 0:
            print i
    f.close()

    for qid, name in expansion_dict.iteritems():
        print qid, name
    print len(expansion_dict)
    return expansion_dict
Example #6
0
def __load_mention_info(fin, vec_dim):
    """Read one mention record and return its candidates as tuples.

    The record is the query id (via ``ioutils.read_str_with_byte_len``), a
    big-endian int32 candidate count and, per candidate, an 8-byte id (via
    ``ioutils.read_str_with_fixed_len``), a big-endian float32 commonness
    value and a ``vec_dim``-element big-endian float32 vector.

    Args:
        fin: Binary file object positioned at the start of a mention record.
        vec_dim: Number of float32 components in each candidate vector.

    Returns:
        Tuple ``(qid, candidates)`` where ``candidates`` is a list of
        ``(mid, commonness, vec)`` tuples; ``commonness`` is the length-1
        array returned by ``np.fromfile`` and ``vec`` is a 1-D array.

    Raises:
        IndexError: If the candidate count cannot be read (e.g. end of file).
    """
    qid = ioutils.read_str_with_byte_len(fin)
    # Unwrap the length-1 array so the loop bound is a plain int.
    num_candidates = int(np.fromfile(fin, '>i4', 1)[0])

    candidates = list()
    for _ in xrange(num_candidates):
        mid = ioutils.read_str_with_fixed_len(fin, 8)
        commonness = np.fromfile(fin, '>f4', 1)
        vec = np.fromfile(fin, '>f4', vec_dim)
        candidates.append((mid, commonness, vec))
    return qid, candidates
Example #7
0
def __make_labeled_data(vec_train_file, gold_label_file, mid_eid_file,
                        dst_file):
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    gold_id_dict = load_gold_id_file(gold_label_file)

    vec_dim = 100

    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)
    print num_docs, 'documents'

    fout = open(dst_file, 'wb')
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)

    # tmp_fout = open('e:/data/emadr/el/tmp_result.txt', 'wb')
    total_num_mentions = 0
    candidates_list = list()
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)
        # if i < 5:
        #     print doc_vec

        doc_vec.astype(np.float32).tofile(fout)

        num_mentions = np.fromfile(fin, '>i4', 1)
        np.asarray([num_mentions], np.int32).tofile(fout)

        total_num_mentions += num_mentions
        for j in xrange(num_mentions):
            qid, candidates = __load_mention_info(fin, vec_dim)
            gold_id = gold_id_dict[qid]
            print qid, gold_id
            __write_mention_data(qid, gold_id, candidates, mid_eid_dict, fout)
            for candidate in candidates:
                mid, commonness, vec = candidate
                eid = mid_eid_dict.get(mid, '')
                print '\t%s\t%f\t%s' % (mid, commonness, eid)
    # tmp_fout.close()
    print total_num_mentions
Example #8
0
def load_docs_info(xdatafile):
    f = open(xdatafile, 'rb')
    num_docs, vecdim = np.fromfile(f, '>i4', 2)
    print '%d documents, vec dimention: %d' % (num_docs, vecdim)
    docs = list()
    for i in xrange(num_docs):
        docid = ioutils.read_str_with_byte_len(f)
        # print docid
        docvec = np.fromfile(f, '>f4', vecdim)
        num_mentions = np.fromfile(f, '>i4', 1)
        # print num_mentions
        mentions = list()
        for j in xrange(num_mentions):
            qid, kbids, commonnesses, vecs = __load_mention_info(f, vecdim)
            # print vecs
            # print qid, kbids
            # print commonnesses
            # print kbids[2]
            mentions.append((qid, kbids, commonnesses, vecs))
        docs.append((docid, docvec, mentions))
        # if i == 5:
        #     break
    f.close()
    return docs, vecdim
Example #9
0
def add_gold_label(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    label_dict = load_gold_id_file(gold_label_file)

    vec_dim = 100

    nil_cnt = 0
    miss_cnt = 0
    fh_cnt = 0
    nil_hit_cnt = 0
    tmp_fout = open('e:/data/emadr/el/tmp_result.txt', 'wb')
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)
    print num_docs
    fout = open(dst_file, 'wb')
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    candidates_list = list()
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)
        # if i < 5:
        #     print doc_vec

        doc_vec.astype(np.float32).tofile(fout)

        mention_infos = list()
        num_mentions = np.fromfile(fin, '>i4', 1)
        for j in xrange(num_mentions):
            qid = ioutils.read_str_with_byte_len(fin)
            # print qid
            # if qid == '':
            #     print doc_id, j, num_mentions
            # gold_label = 'NIL'

            num_candidates = np.fromfile(fin, '>i4', 1)

            gold_label = label_dict[qid]

            cur_candidates = list()
            cur_candidates_tup = (qid, cur_candidates)
            candidates_list.append(cur_candidates_tup)

            hit_idx = -1
            commonness = list()
            candidate_vecs = list()
            eids = list()
            all_nil = True
            non_nil_cnt = -1
            for k in xrange(num_candidates):
                mid = ioutils.read_str_with_fixed_len(fin, 8)
                eid = mid_eid_dict.get(mid, 'NILL')
                if eid != 'NILL':
                    all_nil = False
                    non_nil_cnt += 1

                cur_candidates.append(eid)

                if k == 0 and eid == 'NILL':
                    tmp_fout.write(qid + '\t' + eid + '\n')
                if eid == gold_label:
                    # hit_idx = k
                    hit_idx = non_nil_cnt

                cur_com = np.fromfile(fin, '>f4', 1)
                # print cur_com
                vec = np.fromfile(fin, '>f4', vec_dim)

                if eid != 'NILL':
                    commonness.append(cur_com)
                    candidate_vecs.append(vec.astype(np.float32))
                eids.append(eid)

            if hit_idx == -1:
                miss_cnt += 1
            else:
                # mention_infos.append((qid, hit_idx, candidate_vecs, eids))
                mention_infos.append((hit_idx, commonness, candidate_vecs))
                # print commonness
                # print
                if hit_idx == 0:
                    fh_cnt += 1

            if gold_label.startswith('NIL'):
                nil_cnt += 1
                if all_nil:
                    nil_hit_cnt += 1

        # print len(mention_infos)
        np.asarray([len(mention_infos)], np.int32).tofile(fout)
        for mention_info in mention_infos:
            # io_utils.write_str_with_byte_len(mention_info[0], fout)
            np.asarray([len(mention_info[1])], np.int32).tofile(fout)
            np.asarray([mention_info[0]], np.int32).tofile(fout)
            np.asarray(mention_info[1], np.float32).tofile(fout)
            for vec in mention_info[2]:
                vec.tofile(fout)
            # for eid in mention_info[3]:
            #     io_utils.write_str_with_byte_len(eid, fout)
        # break
    fin.close()
    fout.close()
    tmp_fout.close()

    candidates_list.sort(key=lambda x: x[0])
    __print_candidates(candidates_list)

    num_queries = len(label_dict)
    num_non_nil_queries = num_queries - nil_cnt
    print 'nil_cnt\tmiss_cnt\tfh_cnt\tnum_queries\tnum_non_nil_queries'
    print nil_cnt, miss_cnt, fh_cnt, num_queries, num_non_nil_queries, nil_hit_cnt
    print float(fh_cnt) / num_non_nil_queries
    print 1 - float(miss_cnt - nil_cnt) / num_non_nil_queries
    print float(num_queries - miss_cnt + nil_hit_cnt) / num_queries
    print 1 - float(miss_cnt - nil_cnt) / num_queries
Example #10
0
def add_gold_label(vec_train_file, gold_label_file, mid_eid_file, dst_file):
    mid_eid_dict = load_mid_eid_file(mid_eid_file)
    label_dict = load_gold_label_file(gold_label_file)

    vec_dim = 100

    nil_cnt = 0
    miss_cnt = 0
    fh_cnt = 0
    tmp_fout = open('e:/dc/el/tmp_result.txt', 'wb')
    fin = open(vec_train_file, 'rb')
    num_docs = np.fromfile(fin, '>i4', 1)
    print num_docs
    fout = open(dst_file, 'wb')
    np.asarray([num_docs, vec_dim], np.int32).tofile(fout)
    for i in xrange(num_docs):
        doc_id = ioutils.read_str_with_byte_len(fin)
        doc_vec = np.fromfile(fin, '>f4', vec_dim)

        doc_vec.astype(np.float32).tofile(fout)

        mention_infos = list()
        num_mentions = np.fromfile(fin, '>i4', 1)
        for j in xrange(num_mentions):
            qid = ioutils.read_str_with_byte_len(fin)
            # print qid
            # if qid == '':
            #     print doc_id, j, num_mentions
            gold_label = label_dict[qid]
            # gold_label = 'NIL'
            if gold_label.startswith('NIL'):
                nil_cnt += 1

            num_candidates = np.fromfile(fin, '>i4', 1)
            hit_idx = -1
            commonness = list()
            candidate_vecs = list()
            eids = list()
            for k in xrange(num_candidates):
                mid = ioutils.read_str_with_fixed_len(fin, 8)
                eid = mid_eid_dict.get(mid, 'NILL')

                if k == 0 and eid != 'NILL':
                    tmp_fout.write(qid + '\t' + eid + '\n')
                if eid == gold_label:
                    hit_idx = k

                cur_com = np.fromfile(fin, '>f4', 1)
                # print cur_com
                commonness.append(cur_com)
                vec = np.fromfile(fin, '>f4', vec_dim)
                candidate_vecs.append(vec.astype(np.float32))
                eids.append(eid)

            if hit_idx == -1:
                miss_cnt += 1
            else:
                # mention_infos.append((qid, hit_idx, candidate_vecs, eids))
                mention_infos.append((hit_idx, commonness, candidate_vecs))
                if hit_idx == 0:
                    fh_cnt += 1

        # print len(mention_infos)
        np.asarray([len(mention_infos)], np.int32).tofile(fout)
        for mention_info in mention_infos:
            # io_utils.write_str_with_byte_len(mention_info[0], fout)
            np.asarray([len(mention_info[1])], np.int32).tofile(fout)
            np.asarray([mention_info[0]], np.int32).tofile(fout)
            np.asarray(mention_info[1], np.float32).tofile(fout)
            for vec in mention_info[2]:
                vec.tofile(fout)
            # for eid in mention_info[3]:
            #     io_utils.write_str_with_byte_len(eid, fout)
        # break
    fin.close()
    fout.close()
    tmp_fout.close()

    num_queries = len(label_dict)
    num_non_nil_queries = num_queries - nil_cnt
    print 'nil_cnt\tmiss_cnt\tfh_cnt\tnum_queries\tnum_non_nil_queries'
    print nil_cnt, miss_cnt, fh_cnt, num_queries, num_non_nil_queries
    print float(fh_cnt) / num_non_nil_queries
    print 1 - float(miss_cnt - nil_cnt) / num_non_nil_queries