Example #1
class ALBertEmbedding(ModelBase):
    '''Compute sentence vectors with ALBERT.
    '''
    def __init__(self,
                 config_path=const.ALBERT_CONFIG_PATH,
                 albert_checkpoint_path=const.ALBERT_CHECKPOINT_PATH,
                 dict_path=const.ALBERT_DICT_PATH,
                 train_mode=False):
        self.session = tf.Session() 
        keras.backend.set_session(self.session)
        if train_mode:
            self.bert = build_bert_model(
                         model='albert', 
                         config_path=config_path,
                         checkpoint_path=albert_checkpoint_path,
                         with_pool=True,
                         return_keras_model=False,)
        else:
            self.bert = build_bert_model(
                         model='albert', 
                         config_path=config_path,
                         # checkpoint_path=albert_checkpoint_path,
                         with_pool=True,
                         return_keras_model=False,)
            self.encoder = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[0])
            self.tokenizer = Tokenizer(dict_path, do_lower_case=True) 
            self.encoder.load_weights(albert_checkpoint_path, by_name=True)
    
    def init(self, words_list=None, update=True):
        if words_list is not None:
            token_ids_list, segment_ids_list = [], []
            for words in words_list:
                token_ids, segment_ids = self.tokenizer.encode(words)
                token_ids_list.append(token_ids)
                segment_ids_list.append(segment_ids)
            token_ids_list = sequence_padding(token_ids_list)
            segment_ids_list = sequence_padding(segment_ids_list)
            self.words_list_pre = self.encoder.predict([token_ids_list, segment_ids_list])
            self.words_list_pre = self._normalize(self.words_list_pre)
        return self
    
    def _predict(self, words):
        with self.session.as_default():
            with self.session.graph.as_default():
                token_ids, segment_ids = self.tokenizer.encode( words )
                pre = self.encoder.predict([np.array([token_ids]), np.array([segment_ids])])
                pre = self._normalize(pre)
        return pre
        
    # Sentence vector: similarity scores against the pre-encoded words_list
    def predict(self, words):
        with self.session.as_default():
            with self.session.graph.as_default():
                token_ids, segment_ids = self.tokenizer.encode( words )
                pre = self.encoder.predict([np.array([token_ids]), np.array([segment_ids])])
                pre = self._normalize(pre)
        return np.dot( self.words_list_pre[:], pre[0] ) 
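A minimal usage sketch for ALBertEmbedding (the candidate sentences below are placeholders, and the const.* paths are assumed to point at a valid ALBERT checkpoint):

# Hypothetical usage: index a few candidate sentences, then score a query against them.
emb = ALBertEmbedding(train_mode=False)
emb.init(words_list=[u'今天天气不错', u'我想退货', u'怎么修改密码'])
scores = emb.predict(u'天气怎么样')  # dot products with the normalized candidate vectors
print(scores)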
Example #2
class SiameseDataGenerator(DataGenerator):
    """
    SiameseBert的数据生成器,生成的数据组成为:
    """
    def __init__(self, data_path: str, batch_size: int, maxlen: int,
                 dict_path: str):
        super().__init__(data=self.__load_data(data_path),
                         batch_size=batch_size)
        self._tokenizer = Tokenizer(dict_path, do_lower_case=True)
        self._maxlen = maxlen

    @staticmethod
    def __load_data(filename: str):
        D = []
        with open(filename, encoding='utf-8') as f:
            for line in f:
                category, text1, text2, label = line.strip().split(',')
                if category != 'category':
                    # skip the CSV header row
                    D.append((text1, text2, int(label)))
        return D

    def __iter__(self, random=False):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        q1_batch_token_ids, q1_batch_segment_ids, q2_batch_token_ids, q2_batch_segment_ids, \
        batch_labels = [], [], [], [], []
        for i in idxs:
            text1, text2, label = self.data[i]
            q1_token_ids, q1_segment_ids = self._tokenizer.encode(
                text1, max_length=self._maxlen)
            q2_token_ids, q2_segment_ids = self._tokenizer.encode(
                text2, max_length=self._maxlen)

            q1_batch_token_ids.append(q1_token_ids)
            q2_batch_token_ids.append(q2_token_ids)
            q1_batch_segment_ids.append(q1_segment_ids)
            q2_batch_segment_ids.append(q2_segment_ids)
            batch_labels.append([label])

            if len(batch_labels) == self.batch_size or i == idxs[-1]:
                q1_batch_token_ids = sequence_padding(q1_batch_token_ids)
                q2_batch_token_ids = sequence_padding(q2_batch_token_ids)

                q1_batch_segment_ids = sequence_padding(q1_batch_segment_ids)
                q2_batch_segment_ids = sequence_padding(q2_batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)

                yield [
                    q1_batch_token_ids, q1_batch_segment_ids,
                    q2_batch_token_ids, q2_batch_segment_ids
                ], batch_labels

                q1_batch_token_ids, q1_batch_segment_ids, q2_batch_token_ids, q2_batch_segment_ids, \
                batch_labels = [], [], [], [], []
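A brief usage sketch for the generator (the CSV path, vocab path, batch size and maxlen are placeholders; each CSV row is expected to be category,text1,text2,label):

# Hypothetical usage: pull one batch of paired inputs plus labels.
gen = SiameseDataGenerator(data_path='train.csv', batch_size=32,
                           maxlen=64, dict_path='vocab.txt')
for inputs, labels in gen.__iter__(random=True):
    print([x.shape for x in inputs], labels.shape)  # four padded input arrays + (32, 1) labels
    break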
Example #3
 def __init__(self,
              config_path=const.BERT_CONFIG_PATH,
              checkpoint_path=const.BERT_CHECKPOINT_PATH,
              dict_path=const.BERT_DICT_PATH,
              train_mode=False):
     self.session = tf.Session()
     keras.backend.set_session(self.session)
     self.bert = build_bert_model(
         config_path,
         checkpoint_path,
         with_pool='linear',
         # application='seq2seq',
         return_keras_model=False,
     )
     self.encoder = keras.models.Model(self.bert.model.inputs,
                                       self.bert.model.outputs[0])
     self.tokenizer = Tokenizer(dict_path, do_lower_case=True)
Example #4
def build_model(mode='bert', filename='bert', lastfour=False, LR=1e-5, DR=0.2):
    path = '../data/External/'+filename+'/'
    config_path = path+'bert_config.json'
    checkpoint_path = path+'bert_model.ckpt'
    dict_path = path+'vocab.txt'

    global tokenizer
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    bert = build_bert_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        with_pool=True,
        model=mode,
        return_keras_model=False,
    )
    if lastfour:
        model = Model(
            inputs=bert.model.input,
            outputs=[
                bert.model.layers[-3].get_output_at(0),
                bert.model.layers[-11].get_output_at(0),
                bert.model.layers[-19].get_output_at(0),
                bert.model.layers[-27].get_output_at(0),
            ]
        )
        output = model.outputs
        output1 = Lambda(lambda x: x[:, 0], name='Pooler1')(output[0])
        output2 = Lambda(lambda x: x[:, 0], name='Pooler2')(output[1])
        output3 = Lambda(lambda x: x[:, 0], name='Pooler3')(output[2])
        output4 = Lambda(lambda x: x[:, 0], name='Pooler4')(output[3])

        output = Concatenate(axis=1)([output1, output2, output3, output4])

    else:
        output = bert.model.output

    output = Dropout(rate=DR)(output)
    output = Dense(units=2,
                   activation='softmax',
                   kernel_initializer=bert.initializer)(output)

    model = Model(bert.model.input, output)

    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(LR),
        metrics=['accuracy'],
    )
    return model
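A short usage sketch for build_model (the directory layout under ../data/External/ and the input text are placeholders; numpy is assumed to be imported as np, and build_model sets the module-level tokenizer):

# Hypothetical usage: build the classifier and run a single prediction.
model = build_model(mode='bert', filename='bert', lastfour=True)
token_ids, segment_ids = tokenizer.encode(u'这是一条测试文本')
probs = model.predict([np.array([token_ids]), np.array([segment_ids])])
print(probs.shape)  # (1, 2): softmax scores over the two classes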
Example #5
 def __init__(self,
              config_path=const.ALBERT_CONFIG_PATH,
              albert_checkpoint_path=const.ALBERT_CHECKPOINT_PATH,
              dict_path=const.ALBERT_DICT_PATH,
              train_mode=False):
     self.session = tf.Session() 
     keras.backend.set_session(self.session)
     if train_mode:
         self.bert = build_bert_model(
                      model='albert', 
                      config_path=config_path,
                      checkpoint_path=albert_checkpoint_path,
                      with_pool=True,
                      return_keras_model=False,)
     else:
         self.bert = build_bert_model(
                      model='albert', 
                      config_path=config_path,
                      # checkpoint_path=albert_checkpoint_path,
                      with_pool=True,
                      return_keras_model=False,)
         self.encoder = keras.models.Model(self.bert.model.inputs, self.bert.model.outputs[0])
         self.tokenizer = Tokenizer(dict_path, do_lower_case=True) 
         self.encoder.load_weights(albert_checkpoint_path, by_name=True)
Example #6
train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
train_data.extend(train_data)
train_data.extend(webqa_data)  # mix SogouQA and WebQA at a 2:1 ratio

# Load and slim down the vocabulary, then build the tokenizer
_token_dict = load_vocab(dict_path)  # read the full vocabulary
token_dict, keep_words = {}, []  # keep_words lists the token ids kept from BERT's vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t, _ in sorted(_token_dict.items(), key=lambda s: s[1]):
    if t not in token_dict:
        if len(t) == 3 and (Tokenizer._is_cjk_character(t[-1])
                            or Tokenizer._is_punctuation(t[-1])):
            continue
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = Tokenizer(token_dict, do_lower_case=True)  # build the tokenizer
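The slimmed vocabulary only pays off if the model's embedding matrix is restricted to the same rows. A hedged sketch of how keep_words is typically passed back into model construction (the keyword has been keep_words in older bert4keras releases and keep_tokens in newer ones, so check the installed version):

# Assumption: config_path and checkpoint_path are defined elsewhere in this script.
model = build_bert_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,  # keep only the embedding rows of the simplified vocabulary
)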


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        """单条样本格式:[CLS]篇章[SEP]问题[SEP]答案[SEP]
        """
        idxs = list(range(len(self.data)))
Example #7
            D.append((title, content))
    return D


# Load the datasets
train_data = load_data('/root/csl/train.tsv')
valid_data = load_data('/root/csl/val.tsv')
test_data = load_data('/root/csl/test.tsv')

# Load and slim down the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startwith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        batch_token_ids, batch_segment_ids = [], []
        for i in idxs:
            title, content = self.data[i]
            token_ids, segment_ids = tokenizer.encode(content,
                                                      title,
                                                      max_length=maxlen)
Example #8
#! -*- coding: utf-8 -*-
# Sanity-check the code: MLM

from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path,
                         with_mlm=True)  # build the model and load the weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out “技术” (token positions 3 and 4)
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# predict the masked positions with the MLM head
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the prediction is exactly “技术”
Example #9
    txt = open(txt).read()
    txt = txt.decode('gbk', 'ignore')
    txt = txt.replace('\r', '').replace('\n', '')
    txt = txt.replace(u'整理制作,并提供下载', '')
    txt = re.sub(u'www.*?com', '', txt)
    txt = txt.replace(u'\u3000', ' ')
    sents = []
    for t in txt.split('  '):
        for s in re.findall(u'.*?。', t):
            if len(s) <= maxlen - 2:
                sents.append(s)
    novels.append(sents)


_token_dict = load_vocab(dict_path)  # read the vocabulary
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)  # build a temporary tokenizer

if os.path.exists(lm_config):
    tokens = json.load(open(lm_config))
else:
    tokens = {}
    for novel in novels:
        for s in novel:
            for t in _tokenizer.tokenize(s):
                tokens[t] = tokens.get(t, 0) + 1
    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
    tokens = sorted(tokens, key=lambda t: -t[1])
    tokens = [t[0] for t in tokens]
    json.dump(tokens,
              codecs.open(lm_config, 'w', encoding='utf-8'),
              indent=4,
Example #10
#! -*- coding: utf-8 -*-
# Sanity-check the code: feature extraction

from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np


config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path)  # build the model and load the weights

# Encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
Example #11
min_count = 128
maxlen = 256
batch_size = 16
steps_per_epoch = 1000
epochs = 10000

# BERT configuration
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'

# Training samples: the THUCNews dataset, one txt file per sample.
txts = glob.glob('/root/thuctc/THUCNews/*/*.txt')

_token_dict = load_vocab(dict_path)  # read the vocabulary
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)  # build a temporary tokenizer

if os.path.exists(seq2seq_config):

    tokens = json.load(open(seq2seq_config))

else:

    def _batch_texts():
        texts = []
        for txt in txts:
            text = open(txt, encoding='utf-8').read()
            texts.append(text)
            if len(texts) == 100:
                yield texts
                texts = []
Example #12

config_path = args.config_path
checkpoint_path = args.checkpoint_path
dict_path = args.dict_path

min_count = 0
max_input_len = args.max_input_len
max_output_len = args.max_output_len
batch_size = args.batch_size
epochs = args.epochs
topk = args.topk

train_data_path = args.train_data_path
val_data_path = args.val_data_path

token_dict = load_vocab(dict_path)  # read the vocabulary

tokenizer = Tokenizer(token_dict, do_lower_case=True)  # build the tokenizer

sep_id = tokenizer.encode('')[0][-1]  # token id of [SEP]

rouge = Rouge()
model = get_model(config_path, checkpoint_path, args.albert, args.lr)

evaluator = Evaluate(val_data_path, topk)

model.fit_generator(DataGenerator(train_data_path, batch_size),
                    epochs=epochs,
                    callbacks=[evaluator])
Example #13
# train_data = load_data('../spo_data/train1.json')
# valid_data = load_data('../spo_data/dev1.json')
filep = '/code/field_all_train_test_architecture_change_17w/spo_data'
train_data = load_data(os.path.join(filep, 'train1.json'))
valid_data = load_data(os.path.join(filep, 'dev1.json'))
predicate2id, id2predicate = {}, {}

with codecs.open(os.path.join(filep, 'all_50_schemas')) as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def search(pattern, sequence):
    """从sequence中寻找子串pattern
    如果找到,返回第一个下标;否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1
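For example, search operates on plain lists of token ids:

# search returns the start index of the first match, or -1 if pattern is absent.
assert search([3, 4], [1, 2, 3, 4, 5]) == 2
assert search([9], [1, 2, 3]) == -1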


class data_generator(DataGenerator):
    """数据生成器
Example #14
def read_texts():
    txts = glob.glob('../../thuctc/THUCNews/*/*.txt')
    np.random.shuffle(txts)
    for txt in txts:
        d = open(txt).read()
        d = d.decode('utf-8').replace(u'\u3000', ' ')
        d = d.split('\n')
        if len(d) > 1:
            title = d[0].strip()
            content = '\n'.join(d[1:]).strip()
            if len(title) <= max_output_len:
                yield content[:max_input_len], title


_token_dict = load_vocab(dict_path)  # read the vocabulary
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

if os.path.exists(seq2seq_config):

    tokens = json.load(open(seq2seq_config))

else:

    def _batch_texts():
        texts = []
        for text in read_texts():
            texts.extend(text)
            if len(texts) == 1000:
                yield texts
                texts = []
        if texts:
Example #15
print(train[:10])

text = codecs.open('val.txt', encoding='utf-8')
for line in text.readlines():
    line = line.strip().replace(',', '').replace('.', '').replace(' ', '')
    valid.append(line)
print(valid[:10])

text = codecs.open('test.txt', encoding='utf-8')
for line in text.readlines():
    line = line.strip().replace(',', '').replace('.', '').replace(' ', '')
    test.append(line)
print(test[:10])

_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(_token_dict)

tokens = json.load(open('seq2seq_config.json', encoding='utf-8'))
token_dict, keep_words = {}, []

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t in tokens:
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = Tokenizer(token_dict)
Example #16
#! -*- coding: utf-8 -*-
# Sanity-check the code: feature extraction

from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np


config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path)  # build the model and load the weights

# Encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))

"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352
    0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154
    0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673
    0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
Example #17
np.random.shuffle(all_labels)
# Split the dataset
samples = len(all_data)
train_samples = int(samples * TRAIN_SPLIT)
dev_samples = int(samples * DEV_SPLIT)
train_data, train_labels = all_data[:train_samples], all_labels[:train_samples]
dev_data = all_data[train_samples:train_samples + dev_samples]
dev_labels = all_labels[train_samples:train_samples + dev_samples]
test_data = all_data[train_samples + dev_samples:]
test_labels = all_labels[train_samples + dev_samples:]

# Load the pretrained model's vocabulary
_token_dict = load_vocab(DICT_PATH)
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)
print(all_data[0])
print(_tokenizer.encode(all_data[0]))
print(_tokenizer.tokenize(all_data[0]))
print([_tokenizer.id_to_token(21934)])
print(_tokenizer.token_to_id('[PAD]'))

# Count token frequencies in the dataset
counter = Counter()
for line in all_data:
    _tokens = _tokenizer.tokenize(line)
    # remove the [CLS] and [SEP] tokens when counting
    counter.update(_tokens[1:-1])
print(len(counter))
# Remove low-frequency tokens
_tokens = [
Example #18
for txt in glob.glob('../金庸/*/*.txt'):
    txt = open(txt).read()
    txt = txt.decode('gbk', 'ignore')
    txt = txt.replace('\r', '').replace('\n', '')
    txt = txt.replace(u'整理制作,并提供下载', '')
    txt = re.sub(u'www.*?com', '', txt)
    txt = txt.replace(u'\u3000', ' ')
    sents = []
    for t in txt.split('  '):
        for s in re.findall(u'.*?。', t):
            if len(s) <= maxlen - 2:
                sents.append(s)
    novels.append(sents)

_token_dict = load_vocab(dict_path)  # read the vocabulary
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

if os.path.exists(lm_config):
    tokens = json.load(open(lm_config))
else:
    tokens = {}
    for novel in novels:
        for s in novel:
            for t in _tokenizer.tokenize(s):
                tokens[t] = tokens.get(t, 0) + 1
    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
    tokens = sorted(tokens, key=lambda t: -t[1])
    tokens = [t[0] for t in tokens]
    json.dump(tokens,
              codecs.open(lm_config, 'w', encoding='utf-8'),
              indent=4,
Example #19
        return dataset


if __name__ == '__main__':

    # Usage test

    from bert4keras.tokenizer import Tokenizer
    import json, glob, re
    import jieba_fast as jieba
    from tqdm import tqdm

    jieba.initialize()
    dict_path = '/home/spaces_ac_cn/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
    tokenizer = Tokenizer(dict_path)

    def some_texts():
        for _ in range(2):  # repeat the data twice
            filenames = glob.glob('/home/spaces_ac_cn/corpus/*/*/*')
            np.random.shuffle(filenames)
            for filename in filenames:
                with open(filename) as f:
                    for l in f:
                        l = json.loads(l)['text'].strip()
                        yield re.findall(u'.*?[\n。]+', l)

    def word_segment(text):
        return jieba.lcut(text)

    TD = TrainingDataset(tokenizer, word_segment, sequence_length=512)
Example #20
 def __init__(self, data_path: str, batch_size: int, maxlen: int,
              dict_path: str):
     super().__init__(data=self.__load_data(data_path),
                      batch_size=batch_size)
     self._tokenizer = Tokenizer(dict_path, do_lower_case=True)
     self._maxlen = maxlen
Example #21
    text = df['question'].tolist()
    label = df['tag'].tolist()
    label_dict = label_passer(label)
    for x, y in zip(text, label):
        D.append((x, label_dict[y]))
    return D, label_dict


# Load the dataset
train_data, label_dict = load_data("D:/Workstations/Baidu-QuestionDB-Classification/Data/Output/history.csv")

# Split the dataset
text_train, text_valid = train_test_split(train_data, random_state=2019, test_size=0.1)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=False)


# Load the pretrained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    return_keras_model=True,
    model='albert'
)

output = tf.keras.layers.Dropout(rate=0.1)(bert.output)
output = tf.keras.layers.Dense(units=3, activation='softmax', name='classifier')(output)

model = keras.models.Model(bert.input, output)
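A hedged continuation sketch for the classifier above: the optimizer and learning rate are assumptions, and the encoding of text_train/text_valid into model inputs is not shown in this excerpt.

# Assumed training setup; adjust the learning rate and data feeding to your pipeline.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=keras.optimizers.Adam(2e-5),
              metrics=['accuracy'])
model.summary()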
Example #22
    __, last_part = line.split(':')
    ignore_flag = False
    for dis_word in disallowed_words:
        if dis_word in last_part:
            ignore_flag = True
            break
    if ignore_flag:
        continue
    # the length must not exceed the maximum
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# Vocabulary and tokenizer from the pretrained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# Filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items() if count >= min_word_frequency]
# Sort by frequency (descending)
tokens = sorted(tokens, key=lambda x: -x[1])
# Drop the counts and keep only the token list
tokens = [token for token, count in tokens]

# Build the new token->id mapping and the new vocabulary
token_id_dict = {}
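The excerpt is cut off here. A plausible (hypothetical) completion, mirroring the keep_words pattern used in the earlier examples:

# Assumed continuation: special tokens first, then the surviving high-frequency tokens.
keep_words = []
for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]'] + tokens:
    if t in _token_dict and t not in token_id_dict:
        token_id_dict[t] = len(token_id_dict)
        keep_words.append(_token_dict[t])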