Beispiel #1
0
def __list_errors():
    gold_edl_file = 'e:/data/el/LDC2015E20/data/eval/data/mentions-raw.tab'
    sys_edl_file = 'e:/data/el/LDC2015E20/data/eval/output/emadr-result-coref.tab'
    eid_wid_file = 'e:/data/el/res/eid_wid_ord_eid.txt'

    eid_wid_dict = load_eid_wid_file(eid_wid_file)
    gold_mentions = Mention.load_edl_file(gold_edl_file)
    gold_qid_mentions = Mention.group_mentions_by_qid(gold_mentions)
    sys_mentions = Mention.load_edl_file(sys_edl_file)
    sys_qid_mentions = Mention.group_mentions_by_qid(sys_mentions)

    for qid, mention in gold_qid_mentions.iteritems():
        sys_mention = sys_qid_mentions[qid]
        if sys_mention.kbid == mention.kbid:
            continue
        if sys_mention.kbid.startswith('NIL') and mention.kbid.startswith(
                'NIL'):
            continue
        if mention.kbid.startswith('NIL'):
            continue
        wid_gold = eid_wid_dict.get(mention.kbid, -1)
        wid_sys = eid_wid_dict.get(sys_mention.kbid, -1)
        print '%s\t%s\t%s\t%s\t%d\t%d\t%s' % (
            qid, mention.kbid, sys_mention.kbid, mention.docid,
            mention.beg_pos, mention.end_pos, mention.name)
        print wid_gold, wid_sys
Beispiel #2
0
def __expand_location_names(mentions, tokenized_text_file,
                            entity_candidates_dict_file):
    doc_mentions_dict = Mention.group_mentions_by_docid(mentions)

    expansion_candidates = []
    f = open(tokenized_text_file, 'r')
    for line in f:
        vals = line.strip().split('\t')
        docid = vals[0]
        # print docid
        num_lines = int(vals[1])
        doc_mentions = doc_mentions_dict[docid]
        # print len(mentions)
        for i in xrange(num_lines):
            line = f.next().decode('utf-8')
            words = line.strip().split(' ')
            expansion_candidates += __find_expansion_candidates_in_location_mentions(
                doc_mentions, words)
            # break
        # break
    f.close()

    expansion_dict = __filter_expansion_candidates(
        expansion_candidates, entity_candidates_dict_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    for qid, mention in qid_mentions.iteritems():
        exp_name = expansion_dict.get(qid, '')
        if not exp_name:
            continue
        print '%s\t%s\t%s' % (qid, mention.name, exp_name)
        mention.name = exp_name
Beispiel #3
0
def __apply_coref(edl_file, linking_info_file, dst_edl_file):
    coref_dict = dict()
    f = open(linking_info_file, 'rb')
    while True:
        docid = ioutils.read_str_with_byte_len(f)
        if not docid:
            break
        num_mentions = np.fromfile(f, '>i4', 1)
        is_nested = np.fromfile(f, 'b', num_mentions)
        corefs = np.fromfile(f, '>i4', num_mentions)
        qids = list()
        for i in xrange(num_mentions):
            qid = __read_mention_from_linking_info_file(f)
            qids.append(qid)
        for coref_id, qid in izip(corefs, qids):
            if coref_id > 0:
                coref_dict[qid] = qids[coref_id]
    f.close()

    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    __assgin_different_id_to_all_nils(mentions)
    print qid_mentions['EDL14_ENG_0052'].kbid
    for m in mentions:
        if not m.kbid.startswith('NIL'):
            continue
        coref_qid = coref_dict.get(m.mention_id, '')
        if coref_qid:
            print m.mention_id, coref_qid, m.name, qid_mentions[coref_qid].kbid
            m.kbid = qid_mentions[coref_qid].kbid

    Mention.save_as_edl_file(mentions, dst_edl_file)
Beispiel #4
0
def __save_link_result(edl_file, result_triv, qids, kbids_list, y_pred,
                       max_scores, dst_file, use_nil_thres):
    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        m.kbid = 'NODEF'

    qid_mentions = Mention.group_mentions_by_qid(mentions)
    for qid, kbid in result_triv.iteritems():
        qid_mentions[qid].kbid = kbid
        # print qid, kbid

    for qid, kbids, y, max_score in izip(qids, kbids_list, y_pred, max_scores):
        if y >= len(kbids):
            print y, len(kbids)
        if qid_mentions[qid].kbid == 'NODEF':
            if use_nil_thres and max_score < 0.5:
                qid_mentions[qid].kbid = 'NIL'
            else:
                qid_mentions[qid].kbid = kbids[y]
            # print qid, kbids[y]

    for m in mentions:
        if m.kbid.startswith('m.') or m.kbid.startswith('NIL'):
            m.kbid = 'NIL0001'

    Mention.save_as_edl_file(mentions, dst_file)
Beispiel #5
0
def __build_training_data(qid_x_list, edl_file):
    """Build (features, labels) for NIL-detection training.

    Keeps tuples whose gold mention is NIL (label 0) or whose gold KB id
    matches the candidate KB id (label 1); the feature vector is
    [first_candidate, commonness] (dist is read but not used).

    :return: (train_x, train_y) parallel lists
    """
    qid_mentions = Mention.group_mentions_by_qid(
        Mention.load_edl_file(edl_file))
    train_x, train_y = [], []
    for qid, kbid, first_candidate, commonness, dist in qid_x_list:
        mention = qid_mentions[qid]
        gold_is_nil = mention.kbid.startswith('NIL')
        # a linkable mention paired with the wrong candidate is dropped
        if not gold_is_nil and mention.kbid != kbid:
            continue
        train_x.append([first_candidate, commonness])
        train_y.append(0 if gold_is_nil else 1)
    return train_x, train_y
Beispiel #6
0
def __el_stat():
    """Print entity-linking accuracy stats and dump every linking error.

    Debug/analysis routine with hard-coded input paths: for each mention it
    compares the first legal candidate against the gold EDL annotation,
    then reports in-KB accuracy, total accuracy, and one line per error.
    """
    data_file = 'e:/data/emadr/el/tac/2009/eval/el-2009-eval-expansion-nloc-3.bin'
    gold_file = 'e:/data/el/LDC2015E19/data/2009/eval/data/mentions-raw.tab'
    # data_file = 'e:/data/emadr/el/tac/2011/eval/el-2011-eval-expansion-all-3.bin'
    # gold_file = 'e:/data/el/LDC2015E19/data/2011/eval/data/mentions-expansion-all.tab'
    # data_file = 'e:/data/emadr/el/tac/2014/eval/el-2014-eval-raw-%d.bin' % 3
    # gold_file = 'e:/data/el/LDC2015E20/data/eval/data/mentions-raw.tab'
    eid_wid_file = 'e:/data/el/res/eid_wid_ord_eid.txt'
    # when False, mentions whose gold id is NIL are excluded from the counts
    keep_nil = True
    # when True, only errors whose gold id is absent from the candidate
    # list are printed
    only_show_not_in_candidate = False

    eid_wid_dict = load_eid_wid_file(eid_wid_file)

    # gold_el_result = load_gold_el(gold_file)
    mentions = Mention.load_edl_file(gold_file)
    qid_mention_dict = Mention.group_mentions_by_qid(mentions)
    docs_info, dim = load_docs_info(data_file)

    error_list = list()
    # num_mentions counts evaluated mentions; nil_mentions counts gold NILs
    num_mentions, nil_mentions = 0, 0
    # nil_hit_cnt: correct NIL predictions; id_hit_cnt: correct KB-id links
    nil_hit_cnt, id_hit_cnt = 0, 0
    for doc in docs_info:
        docid, docvec, mentions = doc
        for mention in mentions:
            (qid, kbids, commonnesses, vecs) = mention

            gold_mention = qid_mention_dict[qid]
            gold_id = gold_mention.kbid
            gold_id_is_nil = gold_id.startswith('NIL')
            if gold_id_is_nil:
                nil_mentions += 1
            if not keep_nil and gold_id_is_nil:
                continue
            num_mentions += 1

            indices, legal_kbids = __get_legal_kbids(kbids, keep_nil)

            # NIL hit: no legal candidate, or the top candidate is a Freebase
            # mid ('m.') — presumably mids count as NIL predictions here,
            # matching the NIL0001 collapsing elsewhere in this file; verify
            if gold_id_is_nil and (len(legal_kbids) == 0
                                   or legal_kbids[0].startswith('m.')):
                nil_hit_cnt += 1
                continue

            # the system's prediction is the first legal candidate
            first_kbid = legal_kbids[0] if legal_kbids else 'NIL'

            if first_kbid == gold_id:
                id_hit_cnt += 1
                continue

            error_list.append(
                (qid, docid, gold_mention.name, gold_id, legal_kbids))

    # sort errors by mention name for easier side-by-side inspection
    error_list.sort(key=lambda x: x[2])
    for e in error_list:
        qid, docid, name, gold_id, legal_kbids = e
        gold_wid = eid_wid_dict.get(gold_id, -1)
        in_candidates = gold_id in legal_kbids

        if only_show_not_in_candidate and in_candidates:
            continue

        # if not in_candidates:
        #     print 'not found'
        print '%s\t%s\t%s\t%s_%d' % (qid, docid, name, gold_id, gold_wid)

        # for eid in legal_kbids:
        #     wid = eid_wid_dict.get(eid, -1)
        #     print '\t%s_%d' % (eid, wid),
        # print

    print id_hit_cnt, num_mentions
    # NOTE(review): divides by zero if every evaluated mention is gold-NIL
    print 'INKB: %f' % (float(id_hit_cnt) / (num_mentions - nil_mentions))
    print 'TOTAL: %f' % (float(id_hit_cnt + nil_hit_cnt) / num_mentions)