Exemple #1
0
def create_fasttext_sup_train_data(index, train_data_file, kb_dict_file, kb_alia_file, stopword_file, out_file,
                                   mode=fasttextConfig.create_data_word):
    """Generate labelled entity/mention pair lines for supervised fastText training.

    For every mention of every training line, one positive line is written
    (built from the knowledge-base entity referenced by the mention's
    ``kb_id``) followed by up to three negative lines built from other
    entities whose alias equals the mention text.

    Args:
        index: Identifier used only in the log message and progress bar.
        train_data_file: Path to a UTF-8 file holding one JSON object per
            line with the keys ``text``, ``text_id`` and ``mention_data``.
        kb_dict_file: Path to a pickle that maps entity ids to entity
            records; each record is read through its ``text`` key.
        kb_alia_file: Path to a CSV with ``subject`` and ``subject_id``
            columns.
        stopword_file: Path handed to ``data_utils.get_stopword_list``.
        out_file: Path of the UTF-8 training file to create or overwrite.
        mode: Forwarded unchanged to
            ``com_utils.get_entity_mention_pair_text``.
    """
    print("create {} sup train data".format(index))
    kb_alias_df = pd.read_csv(kb_alia_file)
    stopwords = data_utils.get_stopword_list(stopword_file)
    # Read all lines up front so tqdm knows the total, but release the handle
    # immediately afterwards.
    with open(train_data_file, 'r', encoding='utf-8') as train_file:
        train_datas = train_file.readlines()
    kb_dict = com_utils.pickle_load(kb_dict_file)
    text_ids = {}
    max_extend_countd = 3
    # The context manager guarantees the output is flushed and closed even
    # when a malformed line raises part-way through.
    with open(out_file, 'w', encoding='utf-8') as train_out_file:
        for line in tqdm(train_datas, desc='deal {} train file'.format(index)):
            jstr = ujson.loads(line)
            text = jstr['text']
            text_id = jstr['text_id']
            # Skip a text once its counter reached the limit (the counter is
            # maintained by com_utils.dict_add -- presumably an occurrence
            # count; confirm against that helper).
            if text_ids.get(text_id) == max_extend_countd:
                continue
            for mention in jstr['mention_data']:
                mention_id = mention['kb_id']
                mention_text = mention['mention']
                neighbor_text = com_utils.get_neighbor_sentence(text, mention_text)
                # Positive sample: the entity the mention is linked to.
                kb_entity = kb_dict.get(mention_id)
                if kb_entity is not None:
                    out_str = com_utils.get_entity_mention_pair_text(kb_entity['text'], neighbor_text, stopwords,
                                                                     cut_client,
                                                                     fasttextConfig.label_true, mode)
                    train_out_file.write(out_str)
                # Negative samples: other entities sharing the same alias,
                # capped at max_extend_countd ids (counted before the
                # knowledge-base lookup, as ids missing from kb_dict still
                # consume a slot).
                alia_ids = []
                alias_df = kb_alias_df[kb_alias_df['subject'] == com_utils.cht_to_chs(mention_text)]
                for _, item in alias_df.iterrows():
                    a_id = str(item['subject_id'])
                    if a_id != mention_id:
                        alia_ids.append(a_id)
                        if len(alia_ids) == max_extend_countd:
                            break
                for alia_id in alia_ids:
                    alia_entity = kb_dict.get(alia_id)
                    if alia_entity is not None:
                        out_str = com_utils.get_entity_mention_pair_text(alia_entity['text'], neighbor_text,
                                                                         stopwords,
                                                                         cut_client,
                                                                         fasttextConfig.label_false, mode)
                        train_out_file.write(out_str)
            # Register this text as processed.
            text_ids = com_utils.dict_add(text_ids, text_id)
Exemple #2
0
def analysis_test_ner_result():
    """Write a text report of NER test predictions and the mentions derived from them.

    For each pickled test sample the function collects mentions from three
    sources and tags each with its origin under the ``'T'`` key:

    * ``'NER'``     -- spans decoded from the predicted tag sequence;
    * ``'JIEBA'``   -- words returned by ``get_jieba_mention`` that are
      present in the jieba knowledge-base dictionary and not filtered out;
    * ``'bracket'`` -- mentions from
      ``data_utils.get_mention_inner_brackets``.

    The report contains, per sample, the character/tag pairs and the JSON
    dump of the collected mentions. Nothing is returned.
    """
    print("start analysis test ner result...")
    ner_test_datas = com_utils.pickle_load(
        fileConfig.dir_ner + fileConfig.file_ner_test_predict_tag)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                       fileConfig.file_jieba_kb)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(
        fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    report_path = (fileConfig.dir_result +
                   fileConfig.file_ner_test_result_analysis)
    # The context manager makes sure the report is flushed and closed; the
    # handle used to be left open.
    with open(report_path, 'w', encoding='utf-8') as out_file:
        for text_id, data in enumerate(tqdm(ner_test_datas, 'find entity'),
                                       1):
            text = ''.join(data['text'])
            tag_list = data['tag']
            start_index = 0
            mention_length = 0
            is_find = False
            mentions = []
            type_dict = {}
            # --- mentions decoded from the tag sequence ---
            for i, tag in enumerate(tag_list):
                has_inside = nerConfig.I_seg in tag
                if nerConfig.B_seg in tag or (has_inside and not is_find):
                    # A span starts here. The length restarts at 1 so that a
                    # previous span that never saw its end tag cannot inflate
                    # the new one.
                    type_dict = com_utils.dict_add(type_dict,
                                                   tag.split('_')[1])
                    start_index = i
                    mention_length = 1
                    is_find = True
                elif has_inside:
                    # Inside tag while a span is open: extend it.
                    type_dict = com_utils.dict_add(type_dict,
                                                   tag.split('_')[1])
                    mention_length += 1
                elif nerConfig.E_seg in tag:
                    # End tag: close the open span, or emit a one-character
                    # span when no span is open.
                    type_dict = com_utils.dict_add(type_dict,
                                                   tag.split('_')[1])
                    if not is_find:
                        start_index = i
                    mention_length += 1
                    mentions.append({
                        'T': 'NER',
                        'mention':
                        text[start_index:start_index + mention_length],
                        'offset': str(start_index),
                        # Most frequent type seen over the span.
                        'type': Counter(type_dict).most_common()[0][0]
                    })
                    is_find = False
                    mention_length = 0
                    type_dict = {}
                elif tag == nerConfig.O_seg:
                    is_find = False
                    mention_length = 0
                    type_dict = {}
            # --- mentions found through jieba segmentation ---
            jieba_entities = cut_client.cut_text(text)
            for i, tag in enumerate(tag_list):
                jieba_char = text[i]
                jieba_text = get_jieba_mention(jieba_entities, jieba_char)
                if (jieba_text is None or jieba_text in ('_', '-')
                        or len(jieba_text) == 1
                        or stopwords.get(jieba_text) is not None
                        or gen_more_words.get(jieba_text) is not None):
                    continue
                # Shift back to the position where the whole word starts.
                jieba_offset = i - jieba_text.find(jieba_char)
                if len(jieba_text) <= comConfig.max_jieba_cut_len and (
                        jieba_dict.get(jieba_text) is not None):
                    type_str = tag.split('_')[1] if '_' in tag else 'O'
                    if not is_already_find_mention(mentions, jieba_text,
                                                   jieba_offset):
                        mentions.append({
                            'T': 'JIEBA',
                            'mention': jieba_text,
                            'offset': str(jieba_offset),
                            'type': type_str
                        })
            # --- mentions found inside brackets ---
            bracket_mentions = data_utils.get_mention_inner_brackets(
                text, tag_list)
            for mention in bracket_mentions:
                mention['T'] = 'bracket'
            mentions.extend(bracket_mentions)
            # --- report ---
            result_str = ''.join(text[i] + '-' + tag_list[i] + ' '
                                 for i in range(len(text)))
            out_file.write('\n')
            out_file.write(' text_id:{}, text:{} '.format(
                text_id, result_str))
            out_file.write('\n')
            out_file.write(' gen_mentions:{} '.format(
                ujson.dumps(mentions, ensure_ascii=False)))
            out_file.write('\n')
Exemple #3
0
def create_dev_mention_data(mode, ner_datas, out_file):
    """Build mention candidates for each NER-tagged text and pickle the result.

    Mentions are collected from the predicted tag sequence, from jieba words
    found in the jieba knowledge-base dictionary, and from
    ``data_utils.get_mention_inner_brackets``. Overlapping, duplicate and
    punctuation-only mentions are then dropped, the rest are sorted with
    ``get_offset`` and their text is passed through
    ``get_optim_mention_text``.

    Args:
        mode: ``1`` or ``3`` stores ``text_id``, ``text`` and
            ``mention_data`` per sample; ``2`` additionally copies the
            sample's ``mention_data_original``. Any other value stores
            nothing for the sample.
        ner_datas: Path of the pickled list of samples, each with ``text``
            and ``tag`` entries.
        out_file: Path the resulting list is pickled to.
    """
    ner_datas = com_utils.pickle_load(ner_datas)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                       fileConfig.file_jieba_kb)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    dev_mention_data = []
    for text_id, data in enumerate(tqdm(ner_datas, 'find entity'), 1):
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # --- mentions decoded from the tag sequence ---
        for i, tag in enumerate(tag_list):
            has_inside = nerConfig.I_seg in tag
            if nerConfig.B_seg in tag or (has_inside and not is_find):
                # A span starts here.
                type_dict = com_utils.dict_add(type_dict, tag.split('_')[1])
                start_index = i
                mention_length = 1
                is_find = True
            elif has_inside:
                # Inside tag while a span is open: extend it.
                type_dict = com_utils.dict_add(type_dict, tag.split('_')[1])
                mention_length += 1
            elif nerConfig.E_seg in tag:
                # End tag: close the open span, or emit a one-character span
                # when no span is open.
                type_dict = com_utils.dict_add(type_dict, tag.split('_')[1])
                if not is_find:
                    start_index = i
                mention_length += 1
                mentions.append({
                    'mention':
                    data_utils.strip_punctuation(
                        text[start_index:start_index + mention_length]),
                    'offset':
                    str(start_index),
                    # Most frequent type seen over the span.
                    'type':
                    Counter(type_dict).most_common()[0][0]
                })
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # --- mentions found through jieba, only at entity-tagged positions ---
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            if not (nerConfig.B_seg in tag or nerConfig.I_seg in tag
                    or nerConfig.E_seg in tag):
                continue
            jieba_char = text[i]
            jieba_text = get_jieba_mention(jieba_entities, jieba_char, i)
            if (jieba_text is None or jieba_text in ('_', '-')
                    or data_utils.is_punctuation(jieba_text)
                    or len(jieba_text) == 1
                    or stopwords.get(jieba_text) is not None):
                continue
            # Shift back to the position where the whole word starts.
            jieba_offset = i - jieba_text.find(jieba_char)
            if len(jieba_text) <= comConfig.max_jieba_cut_len and (
                    jieba_dict.get(jieba_text) is not None):
                type_str = tag.split('_')[1] if '_' in tag else 'O'
                if not is_already_find_mention(mentions, jieba_text,
                                               jieba_offset):
                    mentions.append({
                        'mention': jieba_text,
                        'offset': str(jieba_offset),
                        'type': type_str
                    })
        # --- mentions found inside brackets ---
        mentions.extend(
            data_utils.get_mention_inner_brackets(text, tag_list))
        # --- drop mentions covered by another mention ---
        delete_mentions = []
        mentions.sort(key=get_mention_len)
        for mention in mentions:
            mention_offset = int(mention['offset'])
            mention_len = len(mention['mention'])
            for sub_mention in mentions:
                sub_offset = int(sub_mention['offset'])
                # Covered: starts strictly inside the other mention, or
                # starts at the same offset but is shorter.
                starts_inside = (mention_offset < sub_offset <
                                 mention_offset + mention_len)
                shorter_same_start = (
                    sub_offset == mention_offset
                    and mention_len > len(sub_mention['mention']))
                if (starts_inside or shorter_same_start
                    ) and not data_utils.is_mention_already_in_list(
                        delete_mentions, sub_mention):
                    delete_mentions.append(sub_mention)
        if delete_mentions:
            mentions = [
                mention for mention in mentions
                if not data_utils.is_mention_already_in_list(
                    delete_mentions, mention)
            ]
        # --- drop duplicates and punctuation-only mentions ---
        change_mentions = []
        for mention in mentions:
            if not data_utils.is_mention_already_in_list(
                    change_mentions, mention
            ) and mention['mention'] not in comConfig.punctuation:
                change_mentions.append(mention)
        mentions = change_mentions
        mentions.sort(key=get_offset)
        # --- normalise the mention text ---
        mentions_optim = [{
            'mention':
            get_optim_mention_text(jieba_entities, mention['mention']),
            'offset':
            mention['offset'],
            'type':
            mention['type']
        } for mention in mentions]
        if mode in (1, 2, 3):
            record = {
                'text_id': str(text_id),
                'text': text,
                'mention_data': mentions_optim
            }
            if mode == 2:
                record['mention_data_original'] = data[
                    'mention_data_original']
            dev_mention_data.append(record)
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create dev data with mentions, mode:{}".format(mode))
def eval_sup(mode=fasttextConfig.create_data_word):
    """Disambiguate the eval candidates with the supervised + unsupervised models.

    For every mention, candidates are first filtered by the supervised
    fastText classifier (falling back to all candidates when none is
    accepted), then scored against the whole text with ``gensim_get_sim``.
    The best candidate at or above
    ``fasttextConfig.min_entity_similarity_threshold`` is kept. Mentions that
    start inside another mention, duplicates and punctuation-only mentions
    are removed before one JSON line per text is written to the result file.

    Args:
        mode: Forwarded to ``com_utils.get_entity_mention_pair_text``.
    """
    print("start use the fasttext/supervised model to predict eval data")
    os.makedirs(fileConfig.dir_result, exist_ok=True)
    unsup_model = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    sup_model = fastText.load_model(fileConfig.dir_fasttext +
                                    fileConfig.file_fasttext_sup_word_model)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                    fileConfig.file_kb_dict)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    in_path = fileConfig.dir_ner + fileConfig.file_ner_eval_cands_data
    out_path = fileConfig.dir_result + fileConfig.file_result_eval_data
    # Both handles are closed (and the output flushed) on normal exit and on
    # error.
    with open(in_path, 'r', encoding='utf-8') as dev_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        for line in tqdm(dev_file, 'entity diambiguation'):
            if not line.strip('\n'):
                continue
            jstr = ujson.loads(line)
            # Matching is done on the lower-cased, simplified-Chinese text;
            # the original text is what gets written out.
            text = com_utils.cht_to_chs(jstr['text'].lower())
            dev_entity = {'text_id': jstr['text_id'], 'text': jstr['text']}
            mentions = []
            for mention in jstr['mention_data']:
                mention_text = mention['mention']
                if mention_text is None:
                    continue
                cands = mention['cands']
                if len(cands) == 0:
                    continue
                # --- supervised filter ---
                # The neighbour sentence depends only on the mention, so it is
                # computed once rather than once per candidate.
                neighbor_text = com_utils.get_neighbor_sentence(
                    text, com_utils.cht_to_chs(mention_text.lower()))
                supervise_cands = []
                for cand in cands:
                    cand_entity = kb_dict.get(cand['cand_id'])
                    if cand_entity is None:
                        continue
                    out_str = com_utils.get_entity_mention_pair_text(
                        com_utils.cht_to_chs(cand_entity['text'].lower()),
                        neighbor_text,
                        stopwords,
                        cut_client,
                        mode=mode)
                    result = sup_model.predict(out_str.strip('\n'))[0][0]
                    if result == fasttextConfig.label_true:
                        supervise_cands.append(cand)
                if not supervise_cands:
                    supervise_cands = cands
                # --- unsupervised ranking against the whole text ---
                score_list = []
                for cand in supervise_cands:
                    score = gensim_get_sim(
                        unsup_model, text,
                        com_utils.cht_to_chs(cand['cand_text'].lower()),
                        stopwords)
                    if score < fasttextConfig.min_entity_similarity_threshold:
                        continue
                    score_list.append({
                        'cand_id': cand['cand_id'],
                        'cand_score': score,
                        'cand_type': cand['cand_type']
                    })
                score_list.sort(key=get_socre_key, reverse=True)
                if score_list:
                    mentions.append({
                        'kb_id': score_list[0]['cand_id'],
                        'mention': mention['mention'],
                        'offset': mention['offset']
                    })
            # --- drop mentions that start inside another mention ---
            delete_mentions = []
            mentions.sort(key=get_mention_len)
            for optim_mention in mentions:
                mention_offset = int(optim_mention['offset'])
                mention_len = len(optim_mention['mention'])
                for sub_mention in mentions:
                    sub_offset = int(sub_mention['offset'])
                    if (mention_offset < sub_offset <
                            mention_offset + mention_len
                        ) and not data_utils.is_mention_already_in_list(
                            delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
            if delete_mentions:
                mentions = [
                    optim_mention for optim_mention in mentions
                    if not data_utils.is_mention_already_in_list(
                        delete_mentions, optim_mention)
                ]
            # --- drop duplicates and punctuation-only mentions ---
            change_mentions = []
            for optim_mention in mentions:
                if not data_utils.is_mention_already_in_list(
                        change_mentions, optim_mention
                ) and optim_mention['mention'] not in comConfig.punctuation:
                    change_mentions.append(optim_mention)
            mentions = change_mentions
            mentions.sort(key=get_mention_offset)
            dev_entity['mention_data'] = mentions
            out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
            out_file.write('\n')
    print("success create supervised eval result")
def predict():
    """Link dev-set mentions to knowledge-base ids with the fastText similarity model.

    A mention with a single candidate is linked directly. Otherwise every
    candidate is scored against the whole text with ``fasttext_get_sim`` and
    the best one is kept when its score reaches
    ``fasttextConfig.min_entity_similarity_threshold``. When several linked
    mentions share an offset only the longest survives. One JSON line per
    text is written to the prediction file.
    """
    print("start use the fasttext model to predict dev data")
    os.makedirs(fileConfig.dir_result, exist_ok=True)
    model = fastText.load_model(
        fileConfig.dir_fasttext +
        fileConfig.file_fasttext_model.format(fasttextConfig.choose_model))
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    in_path = fileConfig.dir_ner + fileConfig.file_ner_dev_cands_data
    out_path = fileConfig.dir_result + fileConfig.file_result_fasttext_predict
    # Both handles are closed (and the output flushed) on normal exit and on
    # error.
    with open(in_path, 'r', encoding='utf-8') as dev_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        for line in tqdm(dev_file, 'entity diambiguation'):
            jstr = ujson.loads(line)
            text = jstr['text']
            dev_entity = {'text_id': jstr['text_id'], 'text': jstr['text']}
            mentions = []
            for mention in jstr['mention_data']:
                cands = mention['cands']
                if not cands:
                    continue
                if len(cands) == 1:
                    # Nothing to disambiguate.
                    mentions.append({
                        'kb_id': str(cands[0]['cand_id']),
                        'mention': mention['mention'],
                        'offset': str(mention['offset'])
                    })
                    continue
                max_index = 0
                max_score = 0.0
                for i, cand in enumerate(cands):
                    score = fasttext_get_sim(model, text, cand['cand_text'],
                                             stopwords)
                    if score > max_score:
                        max_score = score
                        max_index = i
                if max_score < fasttextConfig.min_entity_similarity_threshold:
                    continue
                mentions.append({
                    'kb_id': cands[max_index]['cand_id'],
                    'mention': mention['mention'],
                    'offset': mention['offset']
                })
            # Keep only the longest mention per offset. The dict maps the
            # offset to the index of the winner, which avoids encoding the
            # mention into a '-'-separated string (that broke on mentions
            # containing '-'). Offsets are normalised to str because the two
            # append paths above store them in different forms.
            best_by_offset = {}
            for i, mention in enumerate(mentions):
                offset_key = str(mention['offset'])
                kept = best_by_offset.get(offset_key)
                if kept is None or len(mentions[kept]['mention']) < len(
                        mention['mention']):
                    best_by_offset[offset_key] = i
            dev_entity['mention_data'] = [
                mentions[i] for i in best_by_offset.values()
            ]
            out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
            out_file.write('\n')
    print("success create predict result")
def test_sup(mode=fasttextConfig.create_data_word):
    """Disambiguate the test candidates and report precision, recall and F1.

    For every mention, candidates are first filtered by the supervised
    fastText classifier (falling back to all candidates when none is
    accepted), then scored against the whole text with ``gensim_get_sim``.
    The best candidate at or above
    ``fasttextConfig.min_entity_similarity_threshold`` is kept. Covered,
    duplicate and punctuation-only mentions are removed, the result is
    written as one JSON line per text, and the linked ids are compared with
    ``mention_data_original`` through ``is_find_correct_entity``.

    Args:
        mode: Forwarded to ``com_utils.get_entity_mention_pair_text``.
    """
    print("start use the fasttext model/supervise model to predict test data")
    os.makedirs(fileConfig.dir_result, exist_ok=True)
    unsup_model_gensim = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    sup_model = fastText.load_model(fileConfig.dir_fasttext +
                                    fileConfig.file_fasttext_sup_word_model)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                    fileConfig.file_kb_dict)
    in_path = fileConfig.dir_ner + fileConfig.file_ner_test_cands_data
    out_path = fileConfig.dir_result + fileConfig.file_result_fasttext_test
    # Counters for the final precision / recall / F1.
    gen_mention_count = 0
    original_mention_count = 0
    correct_mention_count = 0
    # Both handles are closed (and the output flushed) on normal exit and on
    # error.
    with open(in_path, 'r', encoding='utf-8') as dev_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        for line in tqdm(dev_file, 'entity diambiguation'):
            jstr = ujson.loads(line)
            # Matching is done on the lower-cased, simplified-Chinese text;
            # the original text is what gets written out.
            text = com_utils.cht_to_chs(jstr['text'].lower())
            dev_entity = {'text_id': jstr['text_id'], 'text': jstr['text']}
            original_mention_data = jstr['mention_data_original']
            mentions = []
            for mention in jstr['mention_data']:
                mention_text = mention['mention']
                if mention_text is None:
                    continue
                cands = mention['cands']
                if len(cands) == 0:
                    continue
                # --- supervised filter ---
                # The neighbour sentence depends only on the mention, so it is
                # computed once rather than once per candidate.
                neighbor_text = com_utils.get_neighbor_sentence(
                    text, com_utils.cht_to_chs(mention_text.lower()))
                supervise_cands = []
                for cand in cands:
                    cand_entity = kb_dict.get(cand['cand_id'])
                    if cand_entity is None:
                        continue
                    out_str = com_utils.get_entity_mention_pair_text(
                        com_utils.cht_to_chs(cand_entity['text'].lower()),
                        neighbor_text,
                        stopwords,
                        cut_client,
                        mode=mode)
                    result = sup_model.predict(out_str.replace('\n',
                                                               ' '))[0][0]
                    if result == fasttextConfig.label_true:
                        supervise_cands.append(cand)
                if not supervise_cands:
                    supervise_cands = cands
                # --- unsupervised ranking against the whole text ---
                score_list = []
                for cand in supervise_cands:
                    score = gensim_get_sim(
                        unsup_model_gensim, text,
                        com_utils.cht_to_chs(cand['cand_text'].lower()),
                        stopwords)
                    if score < fasttextConfig.min_entity_similarity_threshold:
                        continue
                    score_list.append({
                        'cand_id': cand['cand_id'],
                        'cand_score': score,
                        'cand_type': cand['cand_type']
                    })
                score_list.sort(key=get_socre_key, reverse=True)
                if score_list:
                    mentions.append({
                        'kb_id': score_list[0]['cand_id'],
                        'mention': mention['mention'],
                        'offset': mention['offset']
                    })
            # --- drop mentions covered by another mention ---
            delete_mentions = []
            mentions.sort(key=get_mention_len)
            for mention in mentions:
                mention_offset = int(mention['offset'])
                mention_len = len(mention['mention'])
                for sub_mention in mentions:
                    sub_offset = int(sub_mention['offset'])
                    # Covered: starts strictly inside the other mention, or
                    # starts at the same offset but is shorter.
                    starts_inside = (mention_offset < sub_offset <
                                     mention_offset + mention_len)
                    shorter_same_start = (
                        sub_offset == mention_offset
                        and mention_len > len(sub_mention['mention']))
                    if (starts_inside or shorter_same_start
                        ) and not data_utils.is_mention_already_in_list(
                            delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
            if delete_mentions:
                mentions = [
                    mention for mention in mentions
                    if not data_utils.is_mention_already_in_list(
                        delete_mentions, mention)
                ]
            # --- drop duplicates and punctuation-only mentions ---
            change_mentions = []
            for mention in mentions:
                if not data_utils.is_mention_already_in_list(
                        change_mentions, mention
                ) and mention['mention'] not in comConfig.punctuation:
                    change_mentions.append(mention)
            mentions = change_mentions
            mentions.sort(key=get_mention_offset)
            # --- accumulate the metric counters ---
            for mention in mentions:
                if is_find_correct_entity(mention['kb_id'],
                                          original_mention_data):
                    correct_mention_count += 1
            gen_mention_count += len(mentions)
            for orginal_mention in original_mention_data:
                if orginal_mention['kb_id'] != 'NIL':
                    original_mention_count += 1
            # --- write the result line ---
            dev_entity['mention_data'] = mentions
            dev_entity['mention_data_original'] = original_mention_data
            out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
            out_file.write('\n')
    # Guard every division: an empty input file, or a run that links nothing,
    # must report zeros instead of raising ZeroDivisionError.
    precision = (correct_mention_count /
                 gen_mention_count if gen_mention_count else 0.0)
    recall = (correct_mention_count /
              original_mention_count if original_mention_count else 0.0)
    f1 = (2 * precision * recall /
          (precision + recall) if precision + recall else 0.0)
    print("success create test result, p:{:.4f} r:{:.4f} f1:{:.4f}".format(
        precision, recall, f1))
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _select_candidate(score_list, mention):
    """Pick one entry of ``score_list`` for ``mention``; None if the list is empty.

    ``score_list`` must already be sorted by descending ``cand_score``.
    When at least one candidate has the same type as the mention, the last
    candidate (in list order) whose score exceeds
    ``fasttextConfig.choose_entity_similarity_threshold`` is taken; if that
    yields nothing, the first (highest-scoring) candidate is used.
    """
    if not score_list:
        return None
    chosen = None
    if any(item['cand_type'] == mention['type'] for item in score_list):
        # NOTE(review): this keeps the LAST item above the threshold (the
        # lowest-scoring one, given the descending sort) and never checks that
        # the kept item itself has the matching type. Behaviour is preserved
        # as-is; confirm whether "first type-matching item above threshold"
        # was the intent.
        for item in score_list:
            if item['cand_score'] > fasttextConfig.choose_entity_similarity_threshold:
                chosen = item
    return chosen if chosen is not None else score_list[0]


def _write_mention_section(out_file, header, label, mentions, kb_dict):
    """Write one report section: a header line, then one starred record per mention.

    Each record shows the mention dict (prefixed by ``label``) and the JSON
    dump of its ``kb_dict`` entry; the kb line is left blank for 'NIL' ids.
    """
    star_line = '*' * 20 + '\n'
    out_file.write(header + '\n')
    for mention in mentions:
        kb_mention = ''
        if mention['kb_id'] != 'NIL':
            kb_mention = ujson.dumps(kb_dict[mention['kb_id']],
                                     ensure_ascii=False)
        out_file.write(star_line)
        out_file.write('{}: {}\n'.format(label, mention))
        out_file.write('kb: {}\n'.format(kb_mention))
        out_file.write(star_line)


def test():
    """Disambiguate the NER test candidates with the fastText model and report P/R/F1.

    For every line of the candidate file, each mention's candidates are scored
    with ``fasttext_get_sim`` against the full text; candidates scoring below
    ``fasttextConfig.min_entity_similarity_threshold`` are dropped and one of
    the rest is chosen by ``_select_candidate``. A human-readable report of
    the chosen and the original mentions is written to the result file, and
    precision / recall / F1 are printed at the end. Ratios with a zero
    denominator are reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    print("start use the fasttext model to predict test data")
    os.makedirs(fileConfig.dir_result, exist_ok=True)
    model = fastText.load_model(
        fileConfig.dir_fasttext +
        fileConfig.file_fasttext_model.format(fasttextConfig.choose_model))
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                    fileConfig.file_kb_dict)
    dev_path = fileConfig.dir_ner + fileConfig.file_ner_test_cands_data
    out_path = fileConfig.dir_result + fileConfig.file_result_fasttext_test
    # f1 params
    gen_mention_count = 0
    original_mention_count = 0
    correct_mention_count = 0
    dash_line = '-' * 20 + '\n'
    # Context managers guarantee the report is flushed and both handles are
    # released even if scoring raises part-way through.
    with open(dev_path, 'r', encoding='utf-8') as dev_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        for line in tqdm(dev_file, 'entity diambiguation'):
            jstr = ujson.loads(line)
            text = jstr['text']
            text_id = jstr['text_id']
            original_mention_data = jstr['mention_data_original']
            mentions = []
            for mention in jstr['mention_data']:
                cands = mention['cands']
                if len(cands) == 0:
                    continue
                # Score every candidate against the whole text and keep only
                # those at or above the minimum similarity threshold.
                score_list = []
                for cand in cands:
                    score = fasttext_get_sim(model, text, cand['cand_text'],
                                             stopwords)
                    if score < fasttextConfig.min_entity_similarity_threshold:
                        continue
                    score_list.append({
                        'cand_id': cand['cand_id'],
                        'cand_score': score,
                        'cand_type': cand['cand_type']
                    })
                score_list.sort(key=get_socre_key, reverse=True)
                max_cand = _select_candidate(score_list, mention)
                if max_cand is None:
                    continue
                if is_find_correct_entity(max_cand['cand_id'],
                                          original_mention_data):
                    correct_mention_count += 1
                mentions.append({
                    'kb_id': max_cand['cand_id'],
                    'mention': mention['mention'],
                    'offset': mention['offset']
                })
            # calc f1 params
            gen_mention_count += len(mentions)
            original_mention_count += len(original_mention_data)

            # report for this text
            out_file.write(dash_line)
            out_file.write("text_id:{}--text:{}".format(text_id, text))
            out_file.write('\n')
            _write_mention_section(out_file, "mention_data:",
                                   'mention_original', mentions, kb_dict)
            _write_mention_section(out_file, "kb_data:", 'kb_original',
                                   original_mention_data, kb_dict)
            out_file.write(dash_line)
    precision = _safe_ratio(correct_mention_count, gen_mention_count)
    recall = _safe_ratio(correct_mention_count, original_mention_count)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)
    print("success create test result, p:{:.4f} r:{:.4f} f1:{:.4f}".format(
        precision, recall, f1))
Exemple #8
0
def _cut_to_spaced_text(segment):
    """Cut ``segment`` with ``cut_client`` and join the tokens into one string.

    Single-space tokens are dropped; every other token is passed through
    ``com_utils.cht_to_chs`` and followed by a space unless the token is all
    digits.
    """
    pieces = []
    for token in cut_client.cut_text(segment):
        if token == ' ':
            continue
        pieces.append(com_utils.cht_to_chs(token))
        if not token.isdigit():
            pieces.append(' ')
    return ''.join(pieces)


def _build_train_sentence(text, mention_datas):
    """Build a training sentence from ``text`` keeping each mention as one token.

    The text between consecutive mentions is cut with ``_cut_to_spaced_text``;
    mention strings are inserted verbatim so they are not split by the cutter.
    """
    pieces = []
    cursor = 0
    for mention in mention_datas:
        mention_offset = int(mention['offset'])
        mention_text = mention['mention']
        gap = text[cursor:mention_offset]
        pieces.append(_cut_to_spaced_text(gap))
        # Extra separator after a non-empty, non-numeric gap (kept from the
        # original output format).
        if gap and not gap.isdigit():
            pieces.append(' ')
        pieces.append(mention_text)
        if mention_text and not mention_text.isdigit():
            pieces.append(' ')
        # Resume right after this mention, even if it starts before the
        # previous cursor position.
        cursor = mention_offset + len(mention_text)
    if cursor < len(text):
        pieces.append(_cut_to_spaced_text(text[cursor:]))
    return ''.join(pieces)


def create_fasttext_unsup_train_data():
    """Create the unsupervised fastText training corpus from kb, train and dev data.

    One sentence is produced per input line: kb lines go through
    ``data_utils.get_kb_text``, train lines through ``_build_train_sentence``
    and dev lines are lower-cased and cut. The sentences are written to the
    fastText training file separated by newlines, with no trailing newline.
    All file handles are closed on exit, including when an error occurs.
    """
    print("start create unsup fasttext data...")
    os.makedirs(fileConfig.dir_fasttext, exist_ok=True)
    kb_path = fileConfig.dir_data + fileConfig.file_kb_data
    train_path = fileConfig.dir_data + fileConfig.file_train_data
    dev_path = fileConfig.dir_data + fileConfig.file_dev_data
    out_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_unsup_train_data
    with open(kb_path, 'r', encoding='utf-8') as kb_datas, \
            open(train_path, 'r', encoding='utf-8') as train_datas, \
            open(dev_path, 'r', encoding='utf-8') as dev_datas, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        stopword_list = data_utils.get_stopword_list(
            fileConfig.dir_stopword + fileConfig.file_stopword)
        print("prepare train data")
        train_sentence = []
        # kb data
        for line in tqdm(kb_datas, desc='deal kb data'):
            jstr = ujson.loads(line)
            train_sentence.append(
                data_utils.get_kb_text(jstr, cut_client, stopword_list))
        # train data
        for line in tqdm(train_datas, desc='deal train data'):
            jstr = ujson.loads(line)
            train_sentence.append(
                _build_train_sentence(jstr['text'], jstr['mention_data']))
        # dev data
        for line in tqdm(dev_datas, desc='deal dev data'):
            jstr = ujson.loads(line)
            train_sentence.append(_cut_to_spaced_text(jstr['text'].lower()))
        print("save train data, data len:{}".format(len(train_sentence)))
        for i, line in enumerate(train_sentence):
            # Newline between sentences only, so the file has no trailing '\n'.
            if i > 0:
                out_file.write('\n')
            # writelines is kept because the return type of
            # data_utils.get_kb_text is not visible here; it accepts both a
            # str and an iterable of str.
            out_file.writelines(line)
    print("success save fasttext train file")