Code Example #1
def create_kb_dict():
    if not os.path.exists(fileConfig.dir_kb_info):
        os.mkdir(fileConfig.dir_kb_info)
    kb_datas = open(fileConfig.dir_data + fileConfig.file_kb_data, mode='r', encoding='utf-8').readlines()
    kb_dict = {}
    for kb_data in tqdm(kb_datas, desc='init kb dict'):
        kb_data = ujson.loads(kb_data)
        subject_id = kb_data['subject_id']
        if subject_id in kb_dict:
            raise Exception('key: {} already exists'.format(subject_id))
        # text = data_utils.get_text(kb_data['data'], kb_data['subject'])
        all_alias = {}
        subject = kb_data['subject']
        alias = kb_data['alias']
        all_alias = com_utils.dict_add(all_alias, subject)
        for alia in alias:
            alia_text = alia
            if all_alias.get(alia_text) is not None:
                continue
            all_alias = com_utils.dict_add(all_alias, alia_text)
        text = data_utils.get_all_text(kb_data['subject'], kb_data['data'])
        kb_dict[subject_id] = {'type': kb_data['type'], 'subject': subject, 'alias': list(all_alias),
                               'text': text}
    com_utils.pickle_save(kb_dict, fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    print("create kb dict success")
Code Example #2
def deal_ner_predict_data(predict_list, data_list, out_file):
    # init label map
    id2label = {i: label for i, label in enumerate(nerConfig.labels)}
    # init predict list
    label_list = []
    for labels in predict_list:
        for label in labels:
            item = label[label != -1]
            label_list.append(item)
    datas = []
    for data, label in zip(data_list, label_list):
        text_list = data.text_a
        if len(text_list) != len(label):
            print('text len:{} labels len:{}'.format(len(text_list),
                                                     len(label)))
        assert len(text_list) == len(label)
        labels = []
        for item in label:
            labels.append(id2label.get(item.item()))
        datas.append({
            'text': text_list,
            'tag': labels,
            'mention_data_original': data.mention_data
        })
    com_utils.pickle_save(datas, out_file)
Code Example #3
def create_nel_train_data():
    if not os.path.exists(fileConfig.dir_nel):
        os.mkdir(fileConfig.dir_nel)
    train_data = open(fileConfig.dir_data + fileConfig.file_train_data, 'r', encoding='utf-8')
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    pd_df = pd.read_csv(fileConfig.dir_kb_info + fileConfig.file_kb_pandas_csv)
    data_list = []
    for line in tqdm(train_data, desc='create entity link train data'):
        # for line in train_data:
        jstr = ujson.loads(line)
        text_id = jstr['text_id']
        text = jstr['text']
        mention_datas = jstr['mention_data']
        for mention_data in mention_datas:
            kb_id = mention_data['kb_id']
            mention = mention_data['mention']
            start = mention_data['offset']
            end = int(start) + len(mention) - 1
            kb_entity = kb_dict.get(kb_id)
            if kb_entity is not None:
                entity_cands, entity_ids, entity_text = data_utils.get_entity_cands(kb_entity, kb_id, pd_df)
            else:
                continue
            data_list.append({'text_id': text_id, 'mention_text': text, 'mention': mention,
                              'mention_position': [start, end], 'entity_cands': entity_cands,
                              'entity_text': entity_text, 'entity_ids': entity_ids})
    com_utils.pickle_save(data_list, fileConfig.dir_nel + fileConfig.file_nel_entity_link_train_data)
    print("success create nel entity link train data")
Code Example #4
def create_jieba_dict():
    data_file = open(fileConfig.dir_data + fileConfig.file_kb_data, mode='r', encoding='utf-8')
    com_utils.check_dir(fileConfig.dir_jieba)
    com_utils.check_dir(fileConfig.dir_kb_info)
    out_file = open(fileConfig.dir_jieba + fileConfig.file_jieba_dict, 'w', encoding='utf-8')
    words = {}
    for line in tqdm(data_file, desc='read file'):
        jstr = ujson.loads(line)
        subject = jstr['subject'].strip()
        words = com_utils.dict_add(words, subject)
        alias = jstr['alias']
        for item in alias:
            words = com_utils.dict_add(words, item.strip())
    # save jieba kb
    com_utils.pickle_save(words, fileConfig.dir_kb_info + fileConfig.file_jieba_kb)
    # find most common
    words = Counter(words).most_common()
    # save file
    save_str = ''
    count = 0
    for word in tqdm(words):
        save_str += word[0] + '\n'
        count += 1
        if count % 100 == 0:
            out_file.write(save_str)
            save_str = ''
    if len(save_str) > 0:
        print("write remid str")
        out_file.write(save_str)
    print("success build jieba dict")
Code Example #5
def create_ner_data(train_file_path=None, out_file_path=None):
    if not os.path.exists(fileConfig.dir_ner):
        os.mkdir(fileConfig.dir_ner)
    train_file = open(train_file_path, mode='r', encoding='utf-8')
    data_list = []
    for i, line in tqdm(enumerate(train_file), desc='create ner data'):
        jstr = ujson.loads(line)
        text_id = jstr['text_id']
        # print(text_id)
        text_list = list(jstr['text'])
        mentions = jstr['mention_data']
        text_len = len(text_list)
        tag_list = [nerConfig.O_seg] * text_len
        for mention in mentions:
            kb_id = mention['kb_id']
            if kb_id == 'NIL':
                continue
            mention_len = len(mention['mention'])
            offset = int(mention['offset'])
            # tag = nerConfig.NIL_seg if mention['kb_id'] == nerConfig.NIL_seg else nerConfig.KB_seg
            tag = nerConfig.KB_seg
            # tag = com_utils.get_kb_type(kb_dict[kb_id]['type'])
            # tag B
            tag_list[offset] = nerConfig.B_seg + tag
            if mention_len == 1:
                continue
            # tag I
            for j in range(offset + 1, offset + mention_len - 1):
                tag_list[j] = nerConfig.I_seg + tag
            # tag E
            tag_list[offset + mention_len - 1] = nerConfig.E_seg + tag
        data_list.append({'id': text_id, 'text': text_list, 'tag': tag_list, 'mention_data': mentions})
    com_utils.pickle_save(data_list, out_file_path)
    print("success create ner data")
Code Example #6
def split_eval_mention(num):
    dev_mention_data = com_utils.pickle_load(
        fileConfig.dir_ner + fileConfig.file_ner_eval_mention_data)
    data_len = len(dev_mention_data)
    block_size = data_len / num
    for i in range(1, num + 1):
        data_iter = dev_mention_data[int((i - 1) * block_size):int(i *
                                                                   block_size)]
        com_utils.pickle_save(
            data_iter, fileConfig.dir_ner_split +
            fileConfig.file_ner_eval_mention_split.format(i))
    print("success split test mention to:{} files".format(num))
Code Example #7
def create_eval_ner_data():
    if not os.path.exists(fileConfig.dir_ner):
        os.mkdir(fileConfig.dir_ner)
    eval_file = open(fileConfig.dir_data + fileConfig.file_eval_data, mode='r', encoding='utf-8')
    data_list = []
    for i, line in tqdm(enumerate(eval_file), desc='create eval ner data'):
        jstr = ujson.loads(line)
        text_id = jstr['text_id']
        text_list = list(jstr['text'])
        text_length = len(text_list)
        tag_list = [nerConfig.O_seg] * text_length
        data_list.append({'id': text_id, 'text': text_list, 'tag': tag_list})
    com_utils.pickle_save(data_list, fileConfig.dir_ner + fileConfig.file_ner_eval_data)
    print("success create ner eval data")
Code Example #8
File: tfidf_main.py Project: xlk0101/short_essay_el
def train():
    datas = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    vectorizer = TfidfVectorizer()
    train_sentence = []
    print("prepare train data")
    for key, data in tqdm(datas.items(), desc='init train data'):
        train_sentence.append(' '.join(cut_client.cut_text(data['text'])))
    print("start train tfidf model")
    X = vectorizer.fit_transform(train_sentence)
    print("save model and keyword")
    tfidf_save_data = [X, vectorizer]
    if not os.path.exists(fileConfig.dir_tfidf):
        os.mkdir(fileConfig.dir_tfidf)
    com_utils.pickle_save(tfidf_save_data, fileConfig.dir_tfidf + fileConfig.file_tfidf_save_data)
    print("success train and save tfidf file")
Code Example #9
def split_train_test_data():
    print('start split train data...')
    data_list = open(fileConfig.dir_data + fileConfig.file_train_data, 'r', encoding='utf-8').readlines()
    data_len = len(data_list)
    # train_size = int(data_len * comConfig.train_ratio)
    test_size = 10000
    random.seed(comConfig.random_seed)
    random.shuffle(data_list)

    # train_data = data_list[:train_size]
    train_data = data_list[:data_len - test_size]
    dev_data = data_list[data_len - test_size:data_len]
    com_utils.pickle_save(train_data, fileConfig.dir_data + fileConfig.file_train_pkl)
    com_utils.pickle_save(dev_data, fileConfig.dir_data + fileConfig.file_test_pkl)
    print("success split data set")
Code Example #10
def gen_simi_subject_list(file_path):
    print('start gen similar subject...')
    file_datas = com_utils.pickle_load(file_path)
    gensim_model = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    for item in tqdm(file_datas, 'gen simi subject'):
        mention_data = item['mention_data']
        for mention in mention_data:
            mention_text = mention['mention']
            try:
                mention['gen_subjects'] = get_simi_subject_list(
                    gensim_model.most_similar(positive=[mention_text], topn=5))
            except BaseException:
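                # most_similar raises KeyError for mentions missing from the
                # embedding vocabulary; fall back to an empty list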
                mention['gen_subjects'] = []
    com_utils.pickle_save(file_datas, file_path)
    print('success gen similar subject...')
Code Example #11
def split_train_data(train_file_path=None, out_train_file=None, out_dev_file=None, is_split=True):
    data_list = com_utils.pickle_load(train_file_path)
    if not is_split:
        dev_list = com_utils.pickle_load(fileConfig.dir_ner + fileConfig.file_extend_ner_dev_data)
    data_len = len(data_list)
    # train_size = int(data_len * comConfig.train_ratio)
    test_size = 10000
    random.seed(comConfig.random_seed)
    random.shuffle(data_list)

    # train_data = data_list[:train_size]
    if is_split:
        train_data = data_list[:data_len - test_size]
        dev_data = data_list[data_len - test_size:data_len]
        com_utils.pickle_save(train_data, out_train_file)
        com_utils.pickle_save(dev_data, out_dev_file)
    else:
        train_data = data_list
        com_utils.pickle_save(train_data, out_train_file)
        com_utils.pickle_save(dev_list, out_dev_file)
    print("success split data set")
Code Example #12
def create_dev_mention_cands_data(index, mention_file, pd_file, alia_kb_df,
                                  out_file):
    print("start create {} mention cands".format(index))
    dev_mention_data = com_utils.pickle_load(mention_file)
    print("{} data length is {}".format(index, len(dev_mention_data)))
    pd_df = pandas.read_csv(pd_file)
    alia_kb_df = pandas.read_csv(alia_kb_df)
    alia_kb_df = alia_kb_df.fillna('')
    count = 0
    for dev_data in tqdm(dev_mention_data, desc='find {} cands'.format(index)):
        # count += 1
        # if (count < 465):
        #     continue
        mention_data = dev_data['mention_data']
        for mention in mention_data:
            mention_text = mention['mention']
            if mention_text is None:
                continue
            cands = []
            cand_ids = {}
            # match original
            mention_text_proc = com_utils.cht_to_chs(mention_text.lower())
            mention_text_proc = com_utils.complete_brankets(mention_text_proc)
            # print(mention_text_proc)
            mention_text_proc_extend = mention_text_proc[
                0:len(mention_text_proc) - 1]
            subject_df = data_utils.pandas_query(pd_df, 'subject',
                                                 mention_text_proc)
            for _, item in subject_df.iterrows():
                s_id = str(item['subject_id'])
                if cand_ids.get(s_id) is not None:
                    continue
                cand_ids[s_id] = 1
                subject = item['subject']
                # text = data_utils.get_text(ast.literal_eval(item['data']), item['subject'])
                text = data_utils.get_all_text(item['subject'],
                                               ast.literal_eval(item['data']))
                cands.append({
                    'cand_id': s_id,
                    'cand_subject': subject,
                    'cand_text': text,
                    'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type']))
                })
            # match more
            # subject_df = data_utils.pandas_query(pd_df, 'subject', mention_text_proc_extend)
            # for _, item in subject_df.iterrows():
            #     s_id = str(item['subject_id'])
            #     if cand_ids.get(s_id) is not None:
            #         continue
            #     cand_ids[s_id] = 1
            #     subject = item['subject']
            #     # text = data_utils.get_text(ast.literal_eval(item['data']), item['subject'])
            #     text = data_utils.get_all_text(item['subject'], ast.literal_eval(item['data']))
            #     cands.append({'cand_id': s_id, 'cand_subject': subject, 'cand_text': text,
            #                   'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type']))})
            # match alias
            alias_subject_ids = []
            # match original
            alias_df = data_utils.pandas_query(alia_kb_df, 'subject',
                                               mention_text_proc)
            for _, item in alias_df.iterrows():
                a_id = str(item['subject_id'])
                if a_id in alias_subject_ids:
                    continue
                alias_subject_ids.append(a_id)
            # match more
            # alias_df = data_utils.pandas_query(alia_kb_df, 'subject', mention_text_proc_extend)
            # for _, item in alias_df.iterrows():
            #     a_id = str(item['subject_id'])
            #     if alias_subject_ids.__contains__(a_id):
            #         continue
            #     alias_subject_ids.append(a_id)
            for alia_id in alias_subject_ids:
                alias_df = pd_df[pd_df['subject_id'] == int(alia_id)]
                for _, item in alias_df.iterrows():
                    b_id = str(item['subject_id'])
                    if cand_ids.get(b_id) is not None:
                        continue
                    cand_ids[b_id] = 1
                    subject = item['subject']
                    # text = data_utils.get_text(ast.literal_eval(item['data']), item['subject'])
                    text = data_utils.get_all_text(
                        item['subject'], ast.literal_eval(item['data']))
                    cands.append({
                        'cand_id': b_id,
                        'cand_subject': subject,
                        'cand_text': text,
                        'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type']))
                    })
            # match gen subject
            # gen_subject_ids = []
            # for gen_subject in mention['gen_subjects']:
            #     gen_text = com_utils.cht_to_chs(gen_subject.lower())
            #     alias_df = alia_kb_df[alia_kb_df['subject'] == gen_text]
            #     for _, item in alias_df.iterrows():
            #         a_id = str(item['subject_id'])
            #         if gen_subject_ids.__contains__(a_id):
            #             continue
            #         gen_subject_ids.append(a_id)
            #     for alia_id in gen_subject_ids:
            #         alias_df = pd_df[pd_df['subject_id'] == int(alia_id)]
            #         for _, item in alias_df.iterrows():
            #             b_id = str(item['subject_id'])
            #             if cand_ids.get(b_id) is not None:
            #                 continue
            #             cand_ids[b_id] = 1
            #             subject = item['subject']
            #             # text = data_utils.get_text(ast.literal_eval(item['data']), item['subject'])
            #             text = data_utils.get_all_text(item['subject'], ast.literal_eval(item['data']))
            #             cands.append({'cand_id': b_id, 'cand_subject': subject, 'cand_text': text,
            #                           'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type']))})
            mention['cands'] = cands
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create {} dev data with mention and cands!".format(index))
Code Example #13
def create_dev_mention_data(mode, ner_datas, out_file):
    ner_datas = com_utils.pickle_load(ner_datas)
    jieba_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                       fileConfig.file_jieba_kb)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    gen_more_words = data_utils.get_stopword_list(
        fileConfig.dir_stopword + fileConfig.file_analysis_gen_more)
    text_id = 1
    dev_mention_data = []
    # count = 0
    for data in tqdm(ner_datas, 'find entity'):
        # count += 1
        # if count < 1496:
        #     continue
        text = ''.join(data['text'])
        tag_list = data['tag']
        start_index = 0
        mention_length = 0
        is_find = False
        mentions = []
        type_dict = {}
        # use tag find
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or (
                    tag.find(nerConfig.I_seg) > -1 and not is_find):
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length = 1
                is_find = True
            elif tag.find(nerConfig.E_seg) > -1 and not is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                start_index = i
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({
                    'mention': mention,
                    'offset': str(start_index),
                    'type': type_list[0][0]
                })
                is_find = False
                mention_length = 0
                type_dict = {}
            # elif tag == nerConfig.I_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.I_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
            # elif tag == nerConfig.E_seg + nerConfig.KB_seg and is_find:
            elif tag.find(nerConfig.E_seg) > -1 and is_find:
                type_str = tag.split('_')[1]
                type_dict = com_utils.dict_add(type_dict, type_str)
                mention_length += 1
                mention = text[start_index:start_index + mention_length]
                mention = data_utils.strip_punctuation(mention)
                type_list = Counter(type_dict).most_common()
                mentions.append({
                    'mention': mention,
                    'offset': str(start_index),
                    'type': type_list[0][0]
                })
                is_find = False
                mention_length = 0
                type_dict = {}
            elif tag == nerConfig.O_seg:
                is_find = False
                mention_length = 0
                type_dict = {}
        # use jieba find
        jieba_entities = cut_client.cut_text(text)
        for i, tag in enumerate(tag_list):
            # if tag == nerConfig.B_seg + nerConfig.KB_seg or tag == nerConfig.I_seg + nerConfig.KB_seg or tag == nerConfig.E_seg + nerConfig.KB_seg:
            if tag.find(nerConfig.B_seg) > -1 or tag.find(
                    nerConfig.I_seg) > -1 or tag.find(nerConfig.E_seg) > -1:
                jieba_offset = i
                jieba_char = text[i]
                jieba_text = get_jieba_mention(jieba_entities, jieba_char,
                                               jieba_offset)
                if jieba_text is None:
                    continue
                elif jieba_text == '_' or jieba_text == '-':
                    continue
                elif data_utils.is_punctuation(jieba_text):
                    continue
                elif len(jieba_text) == 1:
                    continue
                elif stopwords.get(jieba_text) is not None:
                    continue
                # elif gen_more_words.get(jieba_text) is not None:
                #     continue
                jieba_offset = jieba_offset - jieba_text.find(jieba_char)
                if len(jieba_text) <= comConfig.max_jieba_cut_len and (
                        jieba_dict.get(jieba_text) is not None):
                    type_str = tag.split('_')[1] if tag.find('_') > -1 else 'O'
                    if jieba_text is None:
                        continue
                    if not is_already_find_mention(mentions, jieba_text,
                                                   jieba_offset):
                        mentions.append({
                            'mention': jieba_text,
                            'offset': str(jieba_offset),
                            'type': type_str
                        })
        # find inner brackets mentions
        bracket_mentions = data_utils.get_mention_inner_brackets(
            text, tag_list)
        if len(bracket_mentions) > 0:
            mentions += bracket_mentions
        # completion mentions
        # mentions_com = []
        # for mention in mentions:
        #     mention_str = mention['mention']
        #     try:
        #         for find in re.finditer(mention_str, text):
        #             find_offset = find.span()[0]
        #             if find_offset != int(mention['offset']):
        #                 mentions_com.append(
        #                     {'mention': mention['mention'], 'offset': str(find_offset), 'type': mention['type']})
        #     except BaseException:
        #         # print("occur error when match mention str in completion mentions, error value:{} text:{}".format(
        #         #     mention_str, text))
        #         pass
        #     mentions_com.append(mention)
        # mentions = mentions_com
        # optim mentions
        delete_mentions = []
        mentions.sort(key=get_mention_len)
        for mention in mentions:
            mention_offset = int(mention['offset'])
            mention_len = len(mention['mention'])
            for sub_mention in mentions:
                if mention_offset != int(sub_mention['offset']) and int(
                        sub_mention['offset']) in range(
                            mention_offset, mention_offset + mention_len):
                    if not data_utils.is_mention_already_in_list(
                            delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
                if mention_offset == int(sub_mention['offset']) and len(
                        mention['mention']) > len(sub_mention['mention']):
                    if not data_utils.is_mention_already_in_list(
                            delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
        if len(delete_mentions) > 0:
            change_mentions = []
            for mention in mentions:
                if not data_utils.is_mention_already_in_list(
                        delete_mentions, mention):
                    change_mentions.append(mention)
            mentions = change_mentions
        change_mentions = []
        for mention in mentions:
            if not data_utils.is_mention_already_in_list(
                    change_mentions, mention
            ) and mention['mention'] not in comConfig.punctuation:
                change_mentions.append(mention)
        mentions = change_mentions
        # optim mentions
        # sort mentions
        mentions.sort(key=get_offset)
        # optimize the mention data
        mentions_optim = []
        for mention in mentions:
            mentions_optim.append({
                'mention': get_optim_mention_text(jieba_entities, mention['mention']),
                'offset': mention['offset'],
                'type': mention['type']
            })
        if mode == 1:
            dev_mention_data.append({
                'text_id': str(text_id),
                'text': text,
                'mention_data': mentions_optim
            })
        elif mode == 2:
            dev_mention_data.append({
                'text_id': str(text_id),
                'text': text,
                'mention_data': mentions_optim,
                'mention_data_original': data['mention_data_original']
            })
        elif mode == 3:
            dev_mention_data.append({
                'text_id': str(text_id),
                'text': text,
                'mention_data': mentions_optim
            })
        text_id += 1
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create dev data with mentions, mode:{}".format(mode))