Example #1
def create_kb_dict():
    if not os.path.exists(fileConfig.dir_kb_info):
        os.mkdir(fileConfig.dir_kb_info)
    kb_datas = open(fileConfig.dir_data + fileConfig.file_kb_data, mode='r', encoding='utf-8').readlines()
    kb_dict = {}
    for kb_data in tqdm(kb_datas, desc='init kb dict'):
        kb_data = ujson.loads(kb_data)
        subject_id = kb_data['subject_id']
        if subject_id in kb_dict:
            raise Exception('key: {} already exists'.format(subject_id))
        # text = data_utils.get_text(kb_data['data'], kb_data['subject'])
        all_alias = {}
        subject = kb_data['subject']
        alias = kb_data['alias']
        all_alias = com_utils.dict_add(all_alias, subject)
        for alia_text in alias:
            if all_alias.get(alia_text) is not None:
                continue
            all_alias = com_utils.dict_add(all_alias, alia_text)
        text = data_utils.get_all_text(kb_data['subject'], kb_data['data'])
        kb_dict[subject_id] = {'type': kb_data['type'], 'subject': subject, 'alias': list(all_alias),
                               'text': text}
    com_utils.pickle_save(kb_dict, fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    print("create kb dict success")
Example #2
def create_jieba_dict():
    data_file = open(fileConfig.dir_data + fileConfig.file_kb_data, mode='r', encoding='utf-8')
    com_utils.check_dir(fileConfig.dir_jieba)
    com_utils.check_dir(fileConfig.dir_kb_info)
    out_file = open(fileConfig.dir_jieba + fileConfig.file_jieba_dict, 'w', encoding='utf-8')
    words = {}
    for line in tqdm(data_file, desc='read file'):
        jstr = ujson.loads(line)
        subject = jstr['subject'].strip()
        words = com_utils.dict_add(words, subject)
        alias = jstr['alias']
        for item in alias:
            words = com_utils.dict_add(words, item.strip())
    # save jieba kb
    com_utils.pickle_save(words, fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    # find most common
    words = Counter(words).most_common()
    # save file
    save_str = ''
    count = 0
    for word in tqdm(words):
        save_str += word[0] + '\n'
        count += 1
        if count % 100 == 0:
            out_file.write(save_str)
            save_str = ''
    if len(save_str) > 0:
        print("write remaining str")
        out_file.write(save_str)
    out_file.close()
    print("successfully built jieba dict")
Example #3
def create_fasttext_sup_train_data(index, train_data_file, kb_dict_file, kb_alia_file, stopword_file, out_file,
                                   mode=fasttextConfig.create_data_word):
    print("create {} sup train data".format(index))
    kb_alias_df = pd.read_csv(kb_alia_file)
    stopwords = data_utils.get_stopword_list(stopword_file)
    train_datas = open(train_data_file, 'r', encoding='utf-8').readlines()
    kb_dict = com_utils.pickle_load(kb_dict_file)
    train_out_file = open(out_file, 'w', encoding='utf-8')
    text_ids = {}
    max_extend_count = 3
    for line in tqdm(train_datas, desc='deal {} train file'.format(index)):
        jstr = ujson.loads(line)
        text = jstr['text']
        text_id = jstr['text_id']
        if text_ids.get(text_id) == max_extend_count:
            continue
        mentions = jstr['mention_data']
        for mention in mentions:
            mention_id = mention['kb_id']
            mention_text = mention['mention']
            neighbor_text = com_utils.get_neighbor_sentence(text, mention_text)
            # true values
            kb_entity = kb_dict.get(mention_id)
            if kb_entity is not None:
                out_str = com_utils.get_entity_mention_pair_text(kb_entity['text'], neighbor_text, stopwords,
                                                                 cut_client,
                                                                 fasttextConfig.label_true, mode)
                train_out_file.write(out_str)
            # false values
            alia_ids = []
            alia_count = 0
            alias_df = kb_alias_df[kb_alias_df['subject'] == com_utils.cht_to_chs(mention_text)]
            for _, item in alias_df.iterrows():
                a_id = str(item['subject_id'])
                if a_id != mention_id:
                    alia_ids.append(a_id)
                    alia_count += 1
                    if alia_count == max_extend_count:
                        break
            if len(alia_ids) > 0:
                for alia_id in alia_ids:
                    alia_entity = kb_dict.get(alia_id)
                    if alia_entity is not None:
                        out_str = com_utils.get_entity_mention_pair_text(alia_entity['text'], neighbor_text, stopwords,
                                                                         cut_client,
                                                                         fasttextConfig.label_false, mode)
                        train_out_file.write(out_str)
        # add text
        text_ids = com_utils.dict_add(text_ids, text_id)
    # release resources
    train_out_file.close()
    train_datas = None
    train_out_file = None
    kb_alias_df = None
    stopwords = None
    kb_dict = None
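com_utils.get_neighbor_sentence is not shown here. One plausible reading, given how its result is paired with the entity text, is a fixed-size character window around the mention; a standalone sketch under that assumption (not the project's actual helper):

def get_neighbor_sentence(text, mention, window=20):
    # Assumed behaviour: return the characters surrounding the first
    # occurrence of the mention; fall back to the whole text if not found.
    idx = text.find(mention)
    if idx < 0:
        return text
    return text[max(0, idx - window):idx + len(mention) + window]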
Example #4
def get_result_error_list(gen_mentions, original_mentions, gen_more_dict):
    result_list = []
    gen_indexs = [0] * len(gen_mentions)
    original_indexs = [0] * len(original_mentions)
    # Traverse gen mentions
    for i, gen_mention in enumerate(gen_mentions):
        for j, original_mention in enumerate(original_mentions):
            if (gen_mention['offset'] == original_mention['offset']
                    and gen_mention['kb_id'] == original_mention['kb_id']):
                gen_indexs[i] = 1
                original_indexs[j] = 1
                continue
            elif (gen_mention['offset'] == original_mention['offset']
                    and gen_mention['kb_id'] != original_mention['kb_id']):
                gen_indexs[i] = 1
                original_indexs[j] = 1
                result_list.append({
                    'error_type': comConfig.result_error_type_miss,
                    'gen_mention': gen_mention,
                    'original_mention': original_mention
                })
                continue
        if gen_indexs[i] == 0:
            gen_indexs[i] = 1
            result_list.append({
                'error_type': comConfig.result_error_type_gen_more,
                'gen_mention': gen_mention
            })
            if len(gen_mention['mention']) > 1:
                com_utils.dict_add(gen_more_dict, gen_mention['mention'])
    # Traverse original mentions
    for i, value in enumerate(original_indexs):
        if value == 0 and original_mentions[i]['kb_id'] != 'NIL':
            result_list.append({
                'error_type': comConfig.result_error_type_original_more,
                'original_mention': original_mentions[i]
            })
    return result_list
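A hedged usage sketch for the function above, assuming comConfig.result_error_type_* are plain string constants and com_utils.dict_add counts occurrences (both are project-specific details):

gen_mentions = [{'mention': '测试', 'offset': '0', 'kb_id': '10001'}]
original_mentions = [{'mention': '测试', 'offset': '0', 'kb_id': '10002'}]
gen_more_dict = {}
# Same offset but different kb_id, so one result_error_type_miss entry is expected.
errors = get_result_error_list(gen_mentions, original_mentions, gen_more_dict)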
Example #5
def gen_random_select_list(entity_len_list, list_len):
    select_list = []
    select_dict = {}
    count = 1
    for i in range(len(entity_len_list)):
        count *= entity_len_list[i]
    # run three passes to make the random selection as complete as possible
    for n in range(3):
        for i in range(count):
            items = []
            for j in range(list_len):
                items.append(random.randint(0, entity_len_list[j] - 1))
            if select_dict.get(str(items)) is None:
                select_list.append(items)
                select_dict = com_utils.dict_add(select_dict, str(items))
    return select_list
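A rough standalone illustration of the selection logic above (hypothetical names; a set replaces the select_dict/dict_add bookkeeping):

import random

def gen_random_select_list_demo(entity_len_list, list_len):
    # Enumerate random index combinations, deduplicated with a seen-set.
    count = 1
    for n in entity_len_list:
        count *= n
    seen, select_list = set(), []
    for _ in range(3 * count):  # three passes, as in the original
        items = [random.randint(0, entity_len_list[j] - 1) for j in range(list_len)]
        if str(items) not in seen:
            seen.add(str(items))
            select_list.append(items)
    return select_list

# e.g. gen_random_select_list_demo([2, 3], 2) yields unique index pairs such as [0, 2], [1, 0], ...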
Example #6
def analysis_test_ner_result():
    print("start analysis test ner result...")
    ner_test_datas = com_utils.pickle_load(
        fileConfig.dir_ner + fileConfig.file_ner_test_predict_tag)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                       fileConfig.file_jieba_kb)
    out_file = open(fileConfig.dir_result +
                    fileConfig.file_ner_test_result_analysis,
                    'w',
                    encoding='utf-8')
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(
        fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    text_id = 1
    for data in tqdm(ner_test_datas, 'find entity'):
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # use tag find
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or (
                    tag.find(nerConfig.I_seg) > -1 and not is_find):
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                is_find = True
            elif tag.find(nerConfig.E_seg) > -1 and not is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                # mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({
                    'T': 'NER',
                    'mention': mention,
                    'offset': str(start_index),
                    'type': type_list[0][0]
                })
                is_find = False
                mention_length = 0
                type_dict = {}
            # elif tag == nerConfig.I_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.I_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
            # elif tag == nerConfig.E_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.E_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                # mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({
                    'T': 'NER',
                    'mention': mention,
                    'offset': str(start_index),
                    'type': type_list[0][0]
                })
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # use jieba find
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg or tag == nerConfig.I_seg + nerConfig.KB_seg or tag == nerConfig.E_seg + nerConfig.KB_seg:
            # if tag.find(nerConfig.B_seg) > -1 or tag.find(nerConfig.I_seg) > -1 or tag.find(nerConfig.E_seg) > -1:
            jieba_offset = i
            jieba_char = text[i]
            jieba_text = get_jieba_mention(jieba_entities, jieba_char)
            if jieba_text is None:
                continue
            elif jieba_text == '_' or jieba_text == '-':
                continue
            elif len(jieba_text) == 1:
                continue
            elif stopwords.get(jieba_text) is not None:
                continue
            elif gen_more_words.get(jieba_text) is not None:
                continue
            jieba_offset = jieba_offset - jieba_text.find(jieba_char)
            if len(jieba_text) <= comConfig.max_jieba_cut_len and (
                    jieba_dict.get(jieba_text) is not None):
                type_str = tag.split('_')[1] if tag.find('_') > -1 else 'O'
                if jieba_text is None:
                    continue
                if not is_already_find_mention(mentions, jieba_text,
                                               jieba_offset):
                    # jieba_offset = text.find(jieba_text)
                    mentions.append({
                        'T': 'JIEBA',
                        'mention': jieba_text,
                        'offset': str(jieba_offset),
                        'type': type_str
                    })
        # find inner brackets mentions
        bracket_mentions = data_utils.get_mention_inner_brackets(
            text, tag_list)
        for mention in bracket_mentions:
            mention['T'] = 'bracket'
        if len(bracket_mentions) > 0:
            mentions += bracket_mentions
        # completion mentions
        # mentions_com = []
        # for mention in mentions:
        #     mention_str = mention['mention']
        #     try:
        #         for find in re.finditer(mention_str, text):
        #             find_offset = find.span()[0]
        #             if find_offset != int(mention['offset']):
        #                 mentions_com.append(
        #                     {'T': 'COM', 'mention': mention['mention'], 'offset': str(find_offset),
        #                      'type': mention['type']})
        #     except BaseException:
        #         # print("occur error when match mention str in completion mentions, error value:{} text:{}".format(
        #         #     mention_str, text))
        #         pass
        #     mentions_com.append(mention)
        # mentions = mentions_com
        # completion mentions
        out_file.write('\n')
        result_str = ''
        for i in range(len(text)):
            result_str += text[i] + '-' + tag_list[i] + ' '
        out_file.write(' text_id:{}, text:{} '.format(text_id, result_str))
        out_file.write('\n')
        out_file.write(' gen_mentions:{} '.format(
            ujson.dumps(mentions, ensure_ascii=False)))
        out_file.write('\n')
        text_id += 1
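The first tag loop above groups B/I/E labelled characters into mention spans before the jieba pass adds further candidates. A standalone miniature of that grouping, assuming plain 'B_TYPE'/'I_TYPE'/'E_TYPE'/'O' tag strings (the real nerConfig constants may differ):

def collect_mentions(text, tags):
    # Assumed tag scheme: 'B_*' starts a span, 'I_*' continues it,
    # 'E_*' closes it, 'O' is outside.
    mentions, start, length, in_span = [], 0, 0, False
    for i, tag in enumerate(tags):
        if tag.startswith('B_') or (tag.startswith('I_') and not in_span):
            start, length, in_span = i, 1, True
        elif tag.startswith('I_') and in_span:
            length += 1
        elif tag.startswith('E_') and in_span:
            length += 1
            mentions.append({'mention': text[start:start + length], 'offset': str(start)})
            in_span, length = False, 0
        elif tag == 'O':
            in_span, length = False, 0
    return mentions

# collect_mentions('周杰伦的新歌', ['B_PER', 'I_PER', 'E_PER', 'O', 'O', 'O'])
# -> [{'mention': '周杰伦', 'offset': '0'}]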
Example #7
def create_dev_mention_data(mode, ner_datas, out_file):
    ner_datas = com_utils.pickle_load(ner_datas)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                       fileConfig.file_jieba_kb)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(
        fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    text_id = 1
    dev_mention_data = []
    # count = 0
    for data in tqdm(ner_datas, 'find entity'):
        # count += 1
        # if count < 1496:
        #     continue
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # use tag find
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or (
                    tag.find(nerConfig.I_seg) > -1 and not is_find):
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length = 1
                is_find = True
            elif tag.find(nerConfig.E_seg) > -1 and not is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({
                    'mention': mention,
                    'offset': str(start_index),
                    'type': type_list[0][0]
                })
                is_find = False
                mention_length = 0
                type_dict = {}
            # elif tag == nerConfig.I_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.I_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
            # elif tag == nerConfig.E_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.E_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({
                    'mention': mention,
                    'offset': str(start_index),
                    'type': type_list[0][0]
                })
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # use jieba find
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg or tag == nerConfig.I_seg + nerConfig.KB_seg or tag == nerConfig.E_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or tag.find(
                    nerConfig.I_seg) > -1 or tag.find(nerConfig.E_seg) > -1:
                jieba_offset = i
                jieba_char = text[i]
                jieba_text = get_jieba_mention(jieba_entities, jieba_char,
                                               jieba_offset)
                if jieba_text is None:
                    continue
                elif jieba_text == '_' or jieba_text == '-':
                    continue
                elif data_utils.is_punctuation(jieba_text):
                    continue
                elif len(jieba_text) == 1:
                    continue
                elif stopwords.get(jieba_text) is not None:
                    continue
                # elif gen_more_words.get(jieba_text) is not None:
                #     continue
                jieba_offset = jieba_offset - jieba_text.find(jieba_char)
                if len(jieba_text) <= comConfig.max_jieba_cut_len and (
                        jieba_dict.get(jieba_text) is not None):
                    type_str = tag.split('_')[1] if tag.find('_') > -1 else 'O'
                    if jieba_text is None:
                        continue
                    if not is_already_find_mention(mentions, jieba_text,
                                                   jieba_offset):
                        mentions.append({
                            'mention': jieba_text,
                            'offset': str(jieba_offset),
                            'type': type_str
                        })
        # find inner brackets mentions
        bracket_mentions = data_utils.get_mention_inner_brackets(
            text, tag_list)
        if len(bracket_mentions) > 0:
            mentions += bracket_mentions
        # completion mentions
        # mentions_com = []
        # for mention in mentions:
        #     mention_str = mention['mention']
        #     try:
        #         for find in re.finditer(mention_str, text):
        #             find_offset = find.span()[0]
        #             if find_offset != int(mention['offset']):
        #                 mentions_com.append(
        #                     {'mention': mention['mention'], 'offset': str(find_offset), 'type': mention['type']})
        #     except BaseException:
        #         # print("occur error when match mention str in completion mentions, error value:{} text:{}".format(
        #         #     mention_str, text))
        #         pass
        #     mentions_com.append(mention)
        # mentions = mentions_com
        # optim mentions
        delete_mentions = []
        mentions.sort(key=get_mention_len)
        for mention in mentions:
            mention_offset = int(mention['offset'])
            mention_len = len(mention['mention'])
            for sub_mention in mentions:
                sub_offset = int(sub_mention['offset'])
                if mention_offset != sub_offset and sub_offset in range(
                        mention_offset, mention_offset + mention_len):
                    if not data_utils.is_mention_already_in_list(
                            delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
                if mention_offset == sub_offset and len(
                        mention['mention']) > len(sub_mention['mention']):
                    if not data_utils.is_mention_already_in_list(
                            delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
        if len(delete_mentions) > 0:
            change_mentions = []
            for mention in mentions:
                if not data_utils.is_mention_already_in_list(
                        delete_mentions, mention):
                    change_mentions.append(mention)
            mentions = change_mentions
        change_mentions = []
        for mention in mentions:
            if not data_utils.is_mention_already_in_list(
                    change_mentions, mention
            ) and mention['mention'] not in comConfig.punctuation:
                change_mentions.append(mention)
        mentions = change_mentions
        # optim mentions
        # sort mentions
        mentions.sort(key=get_offset)
        # optimize the mention data
        mentions_optim = []
        for mention in mentions:
            mentions_optim.append({
                'mention': get_optim_mention_text(jieba_entities, mention['mention']),
                'offset': mention['offset'],
                'type': mention['type']
            })
        if mode == 1:
            dev_mention_data.append({
                'text_id': str(text_id),
                'text': text,
                'mention_data': mentions_optim
            })
        elif mode == 2:
            dev_mention_data.append({
                'text_id': str(text_id),
                'text': text,
                'mention_data': mentions_optim,
                'mention_data_original': data['mention_data_original']
            })
        elif mode == 3:
            dev_mention_data.append({
                'text_id': str(text_id),
                'text': text,
                'mention_data': mentions_optim
            })
        text_id += 1
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create dev data with mentions, mode:{}".format(mode))