Exemple #1
0
def create_pandas_kb_alias_data():
    """Build a (subject_id, subject) alias table and save it as a CSV file.

    Rows come from two sources:

    * the KB file: one row for every entity subject, plus one row for every
      alias whose normalised text has not been seen yet;
    * the train file: one row for every mention that is linked to a known KB
      entity but matches neither that entity's subject nor any of its
      aliases, and whose normalised text has not been seen yet.

    Text is normalised with ``lower()`` and ``com_utils.cht_to_chs``. The
    result is written to
    ``fileConfig.dir_kb_info + fileConfig.file_kb_pandas_alias_data``.
    """
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info + fileConfig.file_kb_dict)
    subject_id_list = []
    subject_list = []
    # Normalised surface forms that already have a row.
    seen_subjects = set()
    kb_path = fileConfig.dir_data + fileConfig.file_kb_data
    train_path = fileConfig.dir_data + fileConfig.file_train_data
    # Open both inputs up front (so a missing file fails before any work is
    # done) and make sure they are closed again, even on error.
    with open(kb_path, 'r', encoding='utf-8') as kb_file, \
            open(train_path, 'r', encoding='utf-8') as train_file:
        # from kb file
        for line in tqdm(kb_file, desc='deal kb_file'):
            jstr = ujson.loads(line)
            subject_id = jstr['subject_id']
            subject = com_utils.cht_to_chs(jstr['subject'].strip().lower())
            # The subject row is always emitted, even if the text was seen before.
            subject_id_list.append(subject_id)
            subject_list.append(subject)
            alias = jstr['alias']
            seen_subjects.add(subject)
            for alia in alias:
                alia_str = com_utils.cht_to_chs(alia.strip().lower())
                if alia_str in seen_subjects:
                    continue
                seen_subjects.add(alia_str)
                subject_id_list.append(subject_id)
                subject_list.append(alia_str)
        # from train file
        for line in tqdm(train_file, desc='deal train file'):
            jstr = ujson.loads(line)
            for mention in jstr['mention_data']:
                mention_text = com_utils.cht_to_chs(mention['mention'].lower())
                kb_id = mention['kb_id']
                kb_entity = kb_dict.get(kb_id)
                if kb_entity is None:
                    # Mention is not linked to a known entity: nothing to add.
                    continue
                kb_subject = kb_entity['subject']
                kb_alias = kb_entity['alias']
                if kb_subject == mention_text or any(alia == mention_text for alia in kb_alias):
                    # Already reachable through the entity's own names.
                    continue
                if mention_text in seen_subjects:
                    continue
                seen_subjects.add(mention_text)
                subject_id_list.append(kb_id)
                subject_list.append(mention_text)
    pandas_dict = {'subject_id': subject_id_list, 'subject': subject_list}
    df = pd.DataFrame.from_dict(pandas_dict)
    df.to_csv(fileConfig.dir_kb_info + fileConfig.file_kb_pandas_alias_data)
    print("success create pandas kb alia data file")
Exemple #2
0
def create_fasttext_sup_train_data(index, train_data_file, kb_dict_file, kb_alia_file, stopword_file, out_file,
                                   mode=fasttextConfig.create_data_word):
    """Write supervised fastText training lines for entity/mention pairs.

    For every mention in ``train_data_file`` this writes:

    * one line labelled ``fasttextConfig.label_true`` pairing the text of the
      linked KB entity with the mention's neighbouring sentence (only when the
      mention's ``kb_id`` is present in the KB dict);
    * up to three lines labelled ``fasttextConfig.label_false`` for other
      entities that share the same surface form in the alias table.

    Args:
        index: Label used only in progress/log messages.
        train_data_file: Path of the JSON-lines training file.
        kb_dict_file: Path of the pickled KB dict (loaded with
            ``com_utils.pickle_load``).
        kb_alia_file: Path of the alias CSV with ``subject`` and
            ``subject_id`` columns.
        stopword_file: Path handed to ``data_utils.get_stopword_list``.
        out_file: Path of the output file (overwritten).
        mode: Forwarded to ``com_utils.get_entity_mention_pair_text``.
    """
    print("create {} sup train data".format(index))
    kb_alias_df = pd.read_csv(kb_alia_file)
    stopwords = data_utils.get_stopword_list(stopword_file)
    # Read all lines eagerly so tqdm knows the total; close the handle at once.
    with open(train_data_file, 'r', encoding='utf-8') as train_file:
        train_datas = train_file.readlines()
    kb_dict = com_utils.pickle_load(kb_dict_file)
    text_ids = {}
    max_extend_countd = 3
    # The context manager guarantees the output is flushed and closed even
    # when processing fails half-way.
    with open(out_file, 'w', encoding='utf-8') as train_out_file:
        for line in tqdm(train_datas, desc='deal {} train file'.format(index)):
            jstr = ujson.loads(line)
            text = jstr['text']
            text_id = jstr['text_id']
            # Skip a text once its id has reached the cap; the counter is
            # maintained by com_utils.dict_add (presumably an increment --
            # verify against that helper).
            if text_ids.get(text_id) == max_extend_countd:
                continue
            for mention in jstr['mention_data']:
                mention_id = mention['kb_id']
                mention_text = mention['mention']
                neighbor_text = com_utils.get_neighbor_sentence(text, mention_text)
                # true values
                kb_entity = kb_dict.get(mention_id)
                if kb_entity is not None:
                    out_str = com_utils.get_entity_mention_pair_text(kb_entity['text'], neighbor_text, stopwords,
                                                                     cut_client,
                                                                     fasttextConfig.label_true, mode)
                    train_out_file.write(out_str)
                # false values: other entities sharing the same surface form
                alia_ids = []
                alias_df = kb_alias_df[kb_alias_df['subject'] == com_utils.cht_to_chs(mention_text)]
                for _, item in alias_df.iterrows():
                    a_id = str(item['subject_id'])
                    if a_id != mention_id:
                        alia_ids.append(a_id)
                        if len(alia_ids) == max_extend_countd:
                            break
                for alia_id in alia_ids:
                    alia_entity = kb_dict.get(alia_id)
                    if alia_entity is not None:
                        out_str = com_utils.get_entity_mention_pair_text(alia_entity['text'], neighbor_text,
                                                                         stopwords,
                                                                         cut_client,
                                                                         fasttextConfig.label_false, mode)
                        train_out_file.write(out_str)
            # add text
            text_ids = com_utils.dict_add(text_ids, text_id)
Exemple #3
0
def get_kb_text(kb_str, cut_client, stopwords):
    """Flatten one KB entry into a single space-separated string.

    The result starts with the entry's subject, followed for each item of
    ``kb_str['data']`` by its predicate and the segmented tokens of its
    object. Tokens that are a single space or are found in ``stopwords`` are
    dropped; kept tokens are passed through ``com_utils.cht_to_chs``. No
    separator is appended after purely numeric tokens.

    Args:
        kb_str: Dict with ``'subject'`` and ``'data'`` keys; every data item
            has ``'predicate'`` and ``'object'`` keys.
        cut_client: Object whose ``cut_text(text)`` returns an iterable of
            tokens.
        stopwords: Mapping whose ``get(token)`` is not ``None`` for stopwords.

    Returns:
        The flattened text without a trailing separator space.
    """
    kb_datas = kb_str['data']
    pieces = [kb_str['subject'], ' ']
    for kb_data in kb_datas:
        pieces.append(kb_data['predicate'])
        pieces.append(' ')
        for text in cut_client.cut_text(kb_data['object']):
            if text == ' ' or stopwords.get(text) is not None:
                continue
            pieces.append(com_utils.cht_to_chs(text.strip('\n')))
            if not text.isdigit():
                pieces.append(' ')
    result = ''.join(pieces)
    # Only strip the separator: when the last token is numeric no space was
    # appended, and blindly dropping the last character would cut a digit.
    return result[:-1] if result.endswith(' ') else result
Exemple #4
0
def create_pandas_kb_data():
    """Convert the JSON-lines KB file into a CSV with one row per entity.

    Columns are ``subject_id``, ``subject`` (lower-cased and passed through
    ``com_utils.cht_to_chs``), ``type`` and ``data``. The CSV is written to
    ``fileConfig.dir_kb_info + fileConfig.file_kb_pandas_csv``.
    """
    subject_id_list = []
    subject_list = []
    type_list = []
    data_list = []
    # Use a context manager so the KB file handle is always released.
    with open(fileConfig.dir_data + fileConfig.file_kb_data, 'r', encoding='utf-8') as kb_file:
        for line in tqdm(kb_file, desc='deal kb file'):
            jstr = ujson.loads(line)
            subject_id_list.append(jstr['subject_id'])
            subject_list.append(com_utils.cht_to_chs(jstr['subject'].lower()))
            type_list.append(jstr['type'])
            data_list.append(jstr['data'])
    pandas_dict = {
        'subject_id': subject_id_list,
        'subject': subject_list,
        'type': type_list,
        'data': data_list,
    }
    df = pd.DataFrame.from_dict(pandas_dict)
    df.to_csv(fileConfig.dir_kb_info + fileConfig.file_kb_pandas_csv)
    print("success create pandas kb file")
Exemple #5
0
def get_all_text(subject, datas):
    """Join a subject and its predicate/object pairs into one string.

    The subject is passed through ``com_utils.cht_to_chs``; the predicate and
    object of every item in ``datas`` follow in order, all separated by a
    single space and without a trailing space.
    """
    pieces = [com_utils.cht_to_chs(subject)]
    for entry in datas:
        pieces.append(entry['predicate'])
        pieces.append(entry['object'])
    return ' '.join(pieces)
Exemple #6
0
def _build_cand(item):
    """Turn one row of the KB DataFrame into a candidate dict."""
    return {
        'cand_id': str(item['subject_id']),
        'cand_subject': item['subject'],
        # 'data' and 'type' are stored in the CSV as Python literals.
        'cand_text': data_utils.get_all_text(item['subject'],
                                             ast.literal_eval(item['data'])),
        'cand_type': com_utils.get_kb_type(ast.literal_eval(item['type'])),
    }


def create_dev_mention_cands_data(index, mention_file, pd_file, alia_kb_df,
                                  out_file):
    """Attach KB candidate entities to every mention and pickle the result.

    For each mention the normalised mention text is looked up twice:

    1. directly against the ``subject`` column of the KB table;
    2. against the ``subject`` column of the alias table, whose matching
       ``subject_id`` values are then resolved back to rows of the KB table.

    Each distinct ``subject_id`` yields one candidate dict (``cand_id``,
    ``cand_subject``, ``cand_text``, ``cand_type``). The list is stored under
    ``mention['cands']`` and the whole structure is saved with
    ``com_utils.pickle_save``.

    Args:
        index: Label used only in progress/log messages.
        mention_file: Path of the pickled mention data.
        pd_file: Path of the KB CSV (``subject_id``, ``subject``, ``type``,
            ``data`` columns are read).
        alia_kb_df: Path of the alias CSV (despite the name it is a path, not
            a DataFrame).
        out_file: Path of the pickle file to write.
    """
    print("start create {} mention cands".format(index))
    dev_mention_data = com_utils.pickle_load(mention_file)
    print("{} data length is {}".format(index, len(dev_mention_data)))
    pd_df = pandas.read_csv(pd_file)
    # fillna returns a new frame; the result has to be kept, otherwise the
    # call has no effect.
    alias_kb = pandas.read_csv(alia_kb_df).fillna('')
    for dev_data in tqdm(dev_mention_data, desc='find {} cands'.format(index)):
        for mention in dev_data['mention_data']:
            mention_text = mention['mention']
            if mention_text is None:
                continue
            cands = []
            # subject ids that already produced a candidate
            cand_ids = set()
            mention_text_proc = com_utils.cht_to_chs(mention_text.lower())
            mention_text_proc = com_utils.complete_brankets(mention_text_proc)
            # match original subject
            subject_df = data_utils.pandas_query(pd_df, 'subject',
                                                 mention_text_proc)
            for _, item in subject_df.iterrows():
                s_id = str(item['subject_id'])
                if s_id in cand_ids:
                    continue
                cand_ids.add(s_id)
                cands.append(_build_cand(item))
            # match alias: collect distinct subject ids in first-seen order
            alias_subject_ids = []
            alias_df = data_utils.pandas_query(alias_kb, 'subject',
                                               mention_text_proc)
            for _, item in alias_df.iterrows():
                a_id = str(item['subject_id'])
                if a_id not in alias_subject_ids:
                    alias_subject_ids.append(a_id)
            # resolve alias ids back to KB rows
            for alia_id in alias_subject_ids:
                matched_df = pd_df[pd_df['subject_id'] == int(alia_id)]
                for _, item in matched_df.iterrows():
                    b_id = str(item['subject_id'])
                    if b_id in cand_ids:
                        continue
                    cand_ids.add(b_id)
                    cands.append(_build_cand(item))
            mention['cands'] = cands
    com_utils.pickle_save(dev_mention_data, out_file)
    print("success create {} dev data with mention and cands!".format(index))
def eval_sup(mode=fasttextConfig.create_data_word):
    """Disambiguate the eval mentions and write one JSON result per line.

    For every mention with candidates:

    1. the supervised fastText model keeps the candidates it labels
       ``fasttextConfig.label_true`` (all candidates are kept if none pass);
    2. the remaining candidates are scored against the whole text with
       ``gensim_get_sim``; scores below
       ``fasttextConfig.min_entity_similarity_threshold`` are discarded and
       the best remaining candidate is chosen.

    Afterwards mentions that start strictly inside another chosen mention,
    duplicate mentions and mentions contained in ``comConfig.punctuation``
    are removed, and the rest is written sorted with ``get_mention_offset``.

    Args:
        mode: Forwarded to ``com_utils.get_entity_mention_pair_text``.
    """
    print("start use the fasttext/supervised model to predict eval data")
    # makedirs also creates missing parents and tolerates an existing dir.
    os.makedirs(fileConfig.dir_result, exist_ok=True)
    unsup_model = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    sup_model = fastText.load_model(fileConfig.dir_fasttext +
                                    fileConfig.file_fasttext_sup_word_model)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                    fileConfig.file_kb_dict)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    dev_path = fileConfig.dir_ner + fileConfig.file_ner_eval_cands_data
    out_path = fileConfig.dir_result + fileConfig.file_result_eval_data
    # Context managers guarantee the result file is flushed and closed.
    with open(dev_path, 'r', encoding='utf-8') as dev_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        # entity disambiguation
        for line in tqdm(dev_file, 'entity diambiguation'):
            if len(line.strip('\n')) == 0:
                continue
            jstr = ujson.loads(line)
            text = com_utils.cht_to_chs(jstr['text'].lower())
            dev_entity = {'text_id': jstr['text_id'], 'text': jstr['text']}
            mentions = []
            for mention in jstr['mention_data']:
                mention_text = mention['mention']
                if mention_text is None:
                    continue
                cands = mention['cands']
                if len(cands) == 0:
                    continue
                # The neighbouring sentence depends only on the mention, so
                # compute it once instead of once per candidate.
                neighbor_text = com_utils.get_neighbor_sentence(
                    text, com_utils.cht_to_chs(mention_text.lower()))
                # use supervised model to choose mention
                supervise_cands = []
                for cand in cands:
                    cand_entity = kb_dict.get(cand['cand_id'])
                    if cand_entity is None:
                        continue
                    out_str = com_utils.get_entity_mention_pair_text(
                        com_utils.cht_to_chs(cand_entity['text'].lower()),
                        neighbor_text,
                        stopwords,
                        cut_client,
                        mode=mode)
                    result = sup_model.predict(out_str.strip('\n'))[0][0]
                    if result == fasttextConfig.label_true:
                        supervise_cands.append(cand)
                # Fall back to every candidate when the classifier rejects all.
                if len(supervise_cands) == 0:
                    supervise_cands = cands
                # unsupervised model: score each candidate against the text
                score_list = []
                for cand in supervise_cands:
                    score = gensim_get_sim(
                        unsup_model, text,
                        com_utils.cht_to_chs(cand['cand_text'].lower()),
                        stopwords)
                    if score < fasttextConfig.min_entity_similarity_threshold:
                        continue
                    score_list.append({
                        'cand_id': cand['cand_id'],
                        'cand_score': score,
                        'cand_type': cand['cand_type']
                    })
                score_list.sort(key=get_socre_key, reverse=True)
                # keep the best candidate, if any survived the threshold
                if len(score_list) > 0:
                    mentions.append({
                        'kb_id': score_list[0]['cand_id'],
                        'mention': mention['mention'],
                        'offset': mention['offset']
                    })
            # optimise mentions: collect mentions starting strictly inside
            # another mention's span
            delete_mentions = []
            mentions.sort(key=get_mention_len)
            for optim_mention in mentions:
                mention_offset = int(optim_mention['offset'])
                mention_end = mention_offset + len(optim_mention['mention'])
                for sub_mention in mentions:
                    sub_offset = int(sub_mention['offset'])
                    if mention_offset < sub_offset < mention_end:
                        if not data_utils.is_mention_already_in_list(
                                delete_mentions, sub_mention):
                            delete_mentions.append(sub_mention)
            if len(delete_mentions) > 0:
                mentions = [
                    kept for kept in mentions
                    if not data_utils.is_mention_already_in_list(
                        delete_mentions, kept)
                ]
            # drop duplicates and punctuation-only mentions
            change_mentions = []
            for optim_mention in mentions:
                if not data_utils.is_mention_already_in_list(
                        change_mentions, optim_mention
                ) and optim_mention['mention'] not in comConfig.punctuation:
                    change_mentions.append(optim_mention)
            mentions = change_mentions
            mentions.sort(key=get_mention_offset)
            dev_entity['mention_data'] = mentions
            out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
            out_file.write('\n')
    print("success create supervised eval result")
def test_sup(mode=fasttextConfig.create_data_word):
    """Disambiguate the test mentions, write the results and print P/R/F1.

    Candidate selection works as in the eval pipeline: the supervised
    fastText model filters candidates (falling back to all of them when none
    is labelled ``fasttextConfig.label_true``), then ``gensim_get_sim``
    scores the survivors against the whole text and the best candidate at or
    above ``fasttextConfig.min_entity_similarity_threshold`` wins.

    Chosen mentions are then pruned: a mention is dropped when it starts
    strictly inside another mention, when it starts at the same offset as a
    longer mention, when it duplicates one already kept, or when it is
    contained in ``comConfig.punctuation``.

    Precision counts predictions accepted by ``is_find_correct_entity``
    against all predictions; recall counts them against the original
    mentions whose ``kb_id`` is not ``'NIL'``. A zero denominator yields 0.0
    for the affected metric instead of raising ``ZeroDivisionError``.

    Args:
        mode: Forwarded to ``com_utils.get_entity_mention_pair_text``.
    """
    print("start use the fasttext model/supervise model to predict test data")
    os.makedirs(fileConfig.dir_result, exist_ok=True)
    # NOTE: the unsupervised fastText model is no longer loaded here -- it was
    # only referenced by commented-out scoring code, so loading it was wasted
    # work.
    unsup_model_gensim = word2vec.Word2VecKeyedVectors.load(
        fileConfig.dir_fasttext + fileConfig.file_gensim_tencent_unsup_model)
    sup_model = fastText.load_model(fileConfig.dir_fasttext +
                                    fileConfig.file_fasttext_sup_word_model)
    stopwords = data_utils.get_stopword_list(fileConfig.dir_stopword +
                                             fileConfig.file_stopword)
    kb_dict = com_utils.pickle_load(fileConfig.dir_kb_info +
                                    fileConfig.file_kb_dict)
    dev_path = fileConfig.dir_ner + fileConfig.file_ner_test_cands_data
    out_path = fileConfig.dir_result + fileConfig.file_result_fasttext_test
    # f1 params
    gen_mention_count = 0
    original_mention_count = 0
    correct_mention_count = 0
    # Context managers guarantee the result file is flushed and closed.
    with open(dev_path, 'r', encoding='utf-8') as dev_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        # entity disambiguation
        for line in tqdm(dev_file, 'entity diambiguation'):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if len(line.strip('\n')) == 0:
                continue
            jstr = ujson.loads(line)
            text = com_utils.cht_to_chs(jstr['text'].lower())
            dev_entity = {'text_id': jstr['text_id'], 'text': jstr['text']}
            mention_data = jstr['mention_data']
            original_mention_data = jstr['mention_data_original']
            mentions = []
            for mention in mention_data:
                mention_text = mention['mention']
                if mention_text is None:
                    continue
                cands = mention['cands']
                if len(cands) == 0:
                    continue
                # Depends only on the mention: compute once per mention.
                neighbor_text = com_utils.get_neighbor_sentence(
                    text, com_utils.cht_to_chs(mention_text.lower()))
                # use supervised model to choose mention
                supervise_cands = []
                for cand in cands:
                    cand_entity = kb_dict.get(cand['cand_id'])
                    if cand_entity is None:
                        continue
                    out_str = com_utils.get_entity_mention_pair_text(
                        com_utils.cht_to_chs(cand_entity['text'].lower()),
                        neighbor_text,
                        stopwords,
                        cut_client,
                        mode=mode)
                    result = sup_model.predict(out_str.replace('\n',
                                                               ' '))[0][0]
                    if result == fasttextConfig.label_true:
                        supervise_cands.append(cand)
                # Fall back to every candidate when the classifier rejects all.
                if len(supervise_cands) == 0:
                    supervise_cands = cands
                # unsupervised model: score each candidate against the text
                score_list = []
                for cand in supervise_cands:
                    score = gensim_get_sim(
                        unsup_model_gensim, text,
                        com_utils.cht_to_chs(cand['cand_text'].lower()),
                        stopwords)
                    if score < fasttextConfig.min_entity_similarity_threshold:
                        continue
                    score_list.append({
                        'cand_id': cand['cand_id'],
                        'cand_score': score,
                        'cand_type': cand['cand_type']
                    })
                score_list.sort(key=get_socre_key, reverse=True)
                # keep the best candidate, if any survived the threshold
                if len(score_list) > 0:
                    mentions.append({
                        'kb_id': score_list[0]['cand_id'],
                        'mention': mention['mention'],
                        'offset': mention['offset']
                    })
            # optimise mentions: collect nested and shadowed mentions
            delete_mentions = []
            mentions.sort(key=get_mention_len)
            for outer in mentions:
                outer_offset = int(outer['offset'])
                outer_len = len(outer['mention'])
                for sub_mention in mentions:
                    sub_offset = int(sub_mention['offset'])
                    # starts strictly inside the outer mention's span
                    nested = outer_offset < sub_offset < outer_offset + outer_len
                    # same start, but shorter than the outer mention
                    shadowed = (outer_offset == sub_offset
                                and outer_len > len(sub_mention['mention']))
                    if (nested or shadowed) and \
                            not data_utils.is_mention_already_in_list(
                                delete_mentions, sub_mention):
                        delete_mentions.append(sub_mention)
            if len(delete_mentions) > 0:
                mentions = [
                    kept for kept in mentions
                    if not data_utils.is_mention_already_in_list(
                        delete_mentions, kept)
                ]
            # drop duplicates and punctuation-only mentions
            change_mentions = []
            for kept in mentions:
                if not data_utils.is_mention_already_in_list(
                        change_mentions, kept
                ) and kept['mention'] not in comConfig.punctuation:
                    change_mentions.append(kept)
            mentions = change_mentions
            mentions.sort(key=get_mention_offset)
            # calc f1
            for predicted in mentions:
                if is_find_correct_entity(predicted['kb_id'],
                                          original_mention_data):
                    correct_mention_count += 1
            gen_mention_count += len(mentions)
            for orginal_mention in original_mention_data:
                if orginal_mention['kb_id'] != 'NIL':
                    original_mention_count += 1
            # out result
            dev_entity['mention_data'] = mentions
            dev_entity['mention_data_original'] = original_mention_data
            out_file.write(ujson.dumps(dev_entity, ensure_ascii=False))
            out_file.write('\n')
    # Guard every denominator so empty input or zero hits cannot crash here.
    precision = (correct_mention_count / gen_mention_count
                 if gen_mention_count else 0.0)
    recall = (correct_mention_count / original_mention_count
              if original_mention_count else 0.0)
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0
    print("success create test result, p:{:.4f} r:{:.4f} f1:{:.4f}".format(
        precision, recall, f1))
Exemple #9
0
def _cut_to_spaced_text(text):
    """Segment ``text`` with ``cut_client`` and re-join the tokens.

    Single-space tokens are dropped, every other token is passed through
    ``com_utils.cht_to_chs``, and a space follows each non-numeric token.
    """
    pieces = []
    for token in cut_client.cut_text(text):
        if token == ' ':
            continue
        pieces.append(com_utils.cht_to_chs(token))
        if not token.isdigit():
            pieces.append(' ')
    return ''.join(pieces)


def create_fasttext_unsup_train_data():
    """Create the unsupervised fastText training corpus.

    One sentence per line is produced from three sources, in this order:

    * every KB entry, flattened with ``data_utils.get_kb_text``;
    * every train text, segmented around its annotated mentions so that each
      mention is kept as a single token;
    * every dev text, lower-cased and segmented.

    Lines are separated by ``'\\n'`` with no trailing newline and written to
    ``fileConfig.dir_fasttext + fileConfig.file_fasttext_unsup_train_data``.
    """
    print("start create unsup fasttext data...")
    os.makedirs(fileConfig.dir_fasttext, exist_ok=True)
    stopword_list = data_utils.get_stopword_list(fileConfig.dir_stopword + fileConfig.file_stopword)
    print("prepare train data")
    train_sentence = []
    # Open all inputs up front and make sure they are closed again.
    with open(fileConfig.dir_data + fileConfig.file_kb_data, 'r', encoding='utf-8') as kb_datas, \
            open(fileConfig.dir_data + fileConfig.file_train_data, 'r', encoding='utf-8') as train_datas, \
            open(fileConfig.dir_data + fileConfig.file_dev_data, 'r', encoding='utf-8') as dev_datas:
        # kb data
        for line in tqdm(kb_datas, desc='deal kb data'):
            jstr = ujson.loads(line)
            train_sentence.append(data_utils.get_kb_text(jstr, cut_client, stopword_list))
        # train data
        for line in tqdm(train_datas, desc='deal train data'):
            jstr = ujson.loads(line)
            text = jstr['text']
            pieces = []
            # index of the first character not yet emitted
            str_point = 0
            for mention in jstr['mention_data']:
                mention_offset = int(mention['offset'])
                mention_text = mention['mention']
                # text between the previous mention and this one
                sub_text = text[str_point:mention_offset]
                pieces.append(_cut_to_spaced_text(sub_text))
                if len(sub_text) > 0 and not sub_text.isdigit():
                    pieces.append(' ')
                # the mention itself is kept whole, not segmented
                pieces.append(mention_text)
                if len(mention_text) > 0 and not mention_text.isdigit():
                    pieces.append(' ')
                str_point = mention_offset + len(mention_text)
            # remainder after the last mention
            if str_point < len(text):
                pieces.append(_cut_to_spaced_text(text[str_point:]))
            train_sentence.append(''.join(pieces))
        # dev data
        for line in tqdm(dev_datas, desc='deal dev data'):
            jstr = ujson.loads(line)
            train_sentence.append(_cut_to_spaced_text(jstr['text'].lower()))
    print("save train data, data len:{}".format(len(train_sentence)))
    out_path = fileConfig.dir_fasttext + fileConfig.file_fasttext_unsup_train_data
    with open(out_path, 'w', encoding='utf-8') as out_file:
        # newline between sentences, none after the last one
        out_file.write('\n'.join(train_sentence))
    print("success save fasttext train file")