Code example #1
File: processdataset.py Project: hldai/tac-edl
def prev_mentions_format_to_new(tab_file, xml_file, output_file):
    xml_text = __read_text_file(xml_file)
    miter = re.finditer(xml_mention_pattern_str, xml_text)
    mentions_dict = dict()
    beg_pos_dict = dict()
    for m in miter:
        cur_doc_id = m.group(3)
        mention = Mention(name=m.group(2), docid=cur_doc_id, mention_id=m.group(1))
        # Offsets are not taken from the XML here; they are assigned sequentially per
        # document from the UTF-8 byte length of each mention name.
        doc_beg = beg_pos_dict.get(cur_doc_id, 0)  # TODO
        mention.beg_pos = doc_beg
        mention.end_pos = doc_beg + len(mention.name.encode('utf-8')) - 1
        beg_pos_dict[cur_doc_id] = mention.end_pos + 1
        mentions_dict[mention.mention_id] = mention

    f = open(tab_file, 'r')
    for line in f:
        vals = line.strip().split('\t')
        if len(vals) < 3:
            continue
        m = mentions_dict.get(vals[0], None)
        if m:
            m.kbid = vals[1]
            m.entity_type = vals[2]
    f.close()

    Mention.save_as_edl_file(mentions_dict.values(), output_file)
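
The regex xml_mention_pattern_str used above is defined elsewhere in the project. Judging only from the three groups it must capture (mention id, name, doc id), a hypothetical sketch for a TAC-style query XML could look like the following; the project's actual pattern may differ:

# Hypothetical pattern only: group 1 = query/mention id, group 2 = name, group 3 = doc id.
xml_mention_pattern_str = (r'<query id="(.*?)">\s*'
                           r'<name>(.*?)</name>\s*'
                           r'<docid>(.*?)</docid>')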
Code example #2
def __apply_coref(edl_file, linking_info_file, dst_edl_file):
    coref_dict = dict()
    f = open(linking_info_file, 'rb')
    while True:
        docid = ioutils.read_str_with_byte_len(f)
        if not docid:
            break
        num_mentions = int(np.fromfile(f, '>i4', 1)[0])  # big-endian int32 mention count
        is_nested = np.fromfile(f, 'b', num_mentions)    # one byte per mention: nested flag
        corefs = np.fromfile(f, '>i4', num_mentions)     # coreference index per mention (0 = no link)
        qids = list()
        for i in xrange(num_mentions):
            qid = __read_mention_from_linking_info_file(f)
            qids.append(qid)
        for coref_id, qid in izip(corefs, qids):
            if coref_id > 0:
                coref_dict[qid] = qids[coref_id]
    f.close()

    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    __assgin_different_id_to_all_nils(mentions)
    print qid_mentions['EDL14_ENG_0052'].kbid  # debug: inspect one query's link
    for m in mentions:
        if not m.kbid.startswith('NIL'):
            continue
        coref_qid = coref_dict.get(m.mention_id, '')
        if coref_qid:
            print m.mention_id, coref_qid, m.name, qid_mentions[coref_qid].kbid
            m.kbid = qid_mentions[coref_qid].kbid

    Mention.save_as_edl_file(mentions, dst_edl_file)
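
The reads above imply one block per document in the linking-info file: a length-prefixed docid string, a big-endian int32 mention count, one byte per mention for the nested flag, big-endian int32 coreference indices, and then one record per mention (read by __read_mention_from_linking_info_file, not shown). A minimal writer sketch under those assumptions; the one-byte length prefix mirroring ioutils.read_str_with_byte_len is only a guess:

import struct
import numpy as np

def write_linking_info_block(f, docid, is_nested, corefs):
    data = docid.encode('utf-8')
    f.write(struct.pack('B', len(data)))              # guessed: one-byte length prefix for the docid
    f.write(data)
    np.asarray([len(corefs)], dtype='>i4').tofile(f)  # mention count, big-endian int32
    np.asarray(is_nested, dtype='b').tofile(f)        # nested flags, one byte per mention
    np.asarray(corefs, dtype='>i4').tofile(f)         # coreference indices, big-endian int32
    # The per-mention records (qids etc.) would follow here; their layout is not shown in the snippet.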
Code example #3
File: elvectrain.py Project: hldai/emadr-exp
def __save_link_result(edl_file, result_triv, qids, kbids_list, y_pred,
                       max_scores, dst_file, use_nil_thres):
    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        m.kbid = 'NODEF'

    qid_mentions = Mention.group_mentions_by_qid(mentions)
    for qid, kbid in result_triv.iteritems():
        qid_mentions[qid].kbid = kbid
        # print qid, kbid

    for qid, kbids, y, max_score in izip(qids, kbids_list, y_pred, max_scores):
        if y >= len(kbids):
            print y, len(kbids)  # sanity check: predicted index is out of range
        if qid_mentions[qid].kbid == 'NODEF':
            if use_nil_thres and max_score < 0.5:
                qid_mentions[qid].kbid = 'NIL'  # low-confidence prediction falls back to NIL
            else:
                qid_mentions[qid].kbid = kbids[y]
            # print qid, kbids[y]

    for m in mentions:
        if m.kbid.startswith('m.') or m.kbid.startswith('NIL'):
            m.kbid = 'NIL0001'  # collapse remaining Freebase mids and NIL labels into one NIL id

    Mention.save_as_edl_file(mentions, dst_file)
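
The loop over izip(qids, kbids_list, y_pred, max_scores) expects parallel sequences: each query id comes with its candidate KB ids, the index of the predicted candidate, and the top prediction score; with use_nil_thres set, scores below 0.5 fall back to 'NIL'. Made-up values just to illustrate the shapes:

qids = ['EDL14_ENG_0001', 'EDL14_ENG_0002']
kbids_list = [['m.01abc', 'm.02def'], ['m.03ghi']]
y_pred = [1, 0]            # index of the chosen candidate for each query
max_scores = [0.91, 0.34]  # the second query would be mapped to 'NIL' when thresholding is on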
Code example #4
File: postprocess.py Project: hldai/tac-edl
def __nil_clustering(nom_dict_file, edl_file, dst_file):
    nom_names = load_nom_dict(nom_dict_file)
    all_mentions = Mention.load_edl_file(edl_file)
    nil_mentions = [m for m in all_mentions if m.kbid.startswith('NIL') and m.name.lower() not in nom_names]
    kbid_mentions = __group_mentions_by_kbid(nil_mentions)

    new_kbids, new_mentions_kbids = list(), list()
    for kbid, mentions in kbid_mentions.iteritems():
        merged = False
        # Greedily merge this NIL cluster into the first compatible existing cluster.
        for nkbid, nmentions in izip(new_kbids, new_mentions_kbids):
            if __should_merge(mentions, nmentions):
                # for m in mentions:
                #     print '%s\t' % m.name,
                # print
                # for m in nmentions:
                #     print '%s\t' % m.name,
                # print '\n'

                for m in mentions:
                    m.kbid = nkbid
                    nmentions.append(m)
                merged = True
                break

        if not merged:
            new_kbids.append(kbid)
            new_mentions_kbids.append(mentions)

    Mention.save_as_edl_file(all_mentions, dst_file)
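
__should_merge decides whether two NIL clusters refer to the same entity; its implementation is not part of this snippet. A hypothetical placeholder that merges clusters sharing a lower-cased surface form, only to make the greedy loop above concrete:

def __should_merge(mentions, nmentions):
    # Hypothetical heuristic, not the project's actual merging rule.
    names = set(m.name.lower() for m in mentions)
    return any(m.name.lower() in names for m in nmentions)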
Code example #5
File: postprocess.py Project: hldai/tac-edl
def main():
    # dataset = 'LDC2015E75'
    dataset = 'LDC2015E103'
    # dataset = 'LDC2016E63'
    mentions_tag = '0'
    run_id = 4

    # datadir = '/home/dhl/data/EDL/'
    datadir = 'e:/data/edl'

    doc_list_file = os.path.join(datadir, dataset, 'data/eng-docs-list-win.txt')
    mid_type_file = os.path.join(datadir, 'res/freebase/mid-entity-type.txt')
    cur_edl_file = os.path.join(datadir, dataset, 'output/sys-link-sm-%s.tab' % mentions_tag)
    miss_match_mentions_file = os.path.join(datadir, dataset, 'output/miss-match-mentions-%s.txt' % mentions_tag)
    new_edl_file = os.path.join(datadir, dataset, 'output/sys-link-sm-pp-ft-%d.tab' % run_id)
    # __nil_clustering(nom_dict_file, edl_file, dst_file)
    mentions = Mention.load_edl_file(cur_edl_file)

    # __link_nom(doc_mentions_dict, max_nil_id)

    __nil_author_clustering(mentions)
    __fix_special_types(mentions)
    __fix_entity_types_by_mid(mid_type_file, mentions)
    # __fix_type_diff_of_same_kbid(mentions)
    __validate_mentions(doc_list_file, mentions, miss_match_mentions_file)
    __fix_pos_error(mentions)
    Mention.save_as_edl_file(mentions, new_edl_file, runid='WednesdayGo%d' % run_id)
Code example #6
File: arrangeneroutput.py Project: hldai/tac-edl
def __remove_leading_the(mentions_file, dst_mentions_edl_file):
    mentions = Mention.load_edl_file(mentions_file)
    for m in mentions:
        if m.name.startswith('the '):
            m.name = m.name[4:]
            m.beg_pos += 4  # dropping 'the ' shifts the start offset by four characters

    Mention.save_as_edl_file(mentions, dst_mentions_edl_file)
Code example #7
File: nomdiscover.py Project: hldai/tac-edl
def __extract_nom_mentions(nom_dict_file, doc_list_file, words_pos_file, dst_nom_mentions_file):
    noms = load_nom_dict(nom_dict_file)
    nom_name_list = [n for n in noms]
    nom_name_list.sort(key=lambda x: -len(x))  # try longer nominal names first
    nom_name_list = [n.split(' ') for n in nom_name_list]  # word lists for phrase matching

    doc_path_dict = __load_doc_paths_as_dict(doc_list_file)

    mentions = list()
    f_wp = open(words_pos_file, 'r')
    for i, line in enumerate(f_wp):
        vals = line.rstrip().split('\t')
        docid = vals[0]

        if (i + 1) % 10 == 0:
            print i + 1, docid

        doc_path = doc_path_dict[docid]
        doc_text = read_text(doc_path).decode('utf-8')
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]

        num_sentences = int(vals[1])
        for j in xrange(num_sentences):
            sentence = __next_sentence_in_words_pos_file(f_wp)
            words = [tup[0].lower() for tup in sentence]
            # print words
            hit_spans, hit_indices = find_phrases_in_words(nom_name_list, words, False)
            for hit_span, hit_idx in izip(hit_spans, hit_indices):
                beg_pos = sentence[hit_span[0]][3]      # character offset of the first word in the hit
                end_pos = sentence[hit_span[1] - 1][4]  # character offset of the last word in the hit

                tags = [tup[2] for tup in sentence[hit_span[0]:hit_span[1]]]
                # print tags
                # if 'NN' not in tags and 'NNP' not in tags:
                #     continue
                if 'NN' not in tags:
                    continue

                name = doc_text[beg_pos:end_pos + 1].replace('\n', ' ')
                if '&lt;' in name or 'http:' in name or '&gt;' in name:
                    continue  # skip hits that fall inside markup or URLs
                m = Mention(name=name, beg_pos=beg_pos, end_pos=end_pos, docid=docid, mention_type='NOM',
                            entity_type='PER', kbid='NIL00000')
                mentions.append(m)
                # print sentence[hit_span[0]], sentence[hit_span[1]]
                # print nom_name_list[hit_idx], name
        # break
    f_wp.close()

    Mention.save_as_edl_file(mentions, dst_nom_mentions_file)
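
Each sentence returned by __next_sentence_in_words_pos_file is indexed above as tup[0] for the word, tup[2] for the POS tag, and tup[3]/tup[4] for character offsets; the remaining field is not used here. A made-up token tuple just to illustrate that indexing:

# (word, unused_field, pos_tag, char_beg, char_end) -- field 1 is a guess and is never read here.
example_token = ('president', None, 'NN', 102, 110)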
Code example #8
File: tactext.py Project: hldai/emadr-exp
def __name_expansion(edl_mentions_file, doc_ner_file, tokenized_text_file,
                     entity_candidates_dict_file, dst_file):
    mentions = Mention.load_edl_file(edl_mentions_file)
    __expand_name_with_ner_result(mentions, doc_ner_file)
    # __expand_location_names(mentions, tokenized_text_file, entity_candidates_dict_file)
    Mention.save_as_edl_file(mentions, dst_file)
Code example #9
def all_to_all(edl_file, dst_edl_file):
    mentions = Mention.load_edl_file(edl_file)
    __assgin_different_id_to_all_nils(mentions)
    Mention.save_as_edl_file(mentions, dst_edl_file)