    # split each paragraph on the Chinese full stop and keep only
    # sentences that fit within the maximum length
    for t in txt.split('  '):
        for s in re.findall(u'.*?。', t):
            if len(s) <= maxlen - 2:
                sents.append(s)
    novels.append(sents)

_token_dict = load_vocab(dict_path)  # load the vocabulary
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

if os.path.exists(lm_config):
    tokens = json.load(open(lm_config))
else:
    tokens = {}
    for novel in novels:
        for s in novel:
            for t in _tokenizer.tokenize(s):
                tokens[t] = tokens.get(t, 0) + 1
    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
    tokens = sorted(tokens, key=lambda t: -t[1])
    tokens = [t[0] for t in tokens]
    json.dump(tokens,
              codecs.open(lm_config, 'w', encoding='utf-8'),
              indent=4,
              ensure_ascii=False)

token_dict, keep_words = {}, []  # keep_words holds the ids retained from BERT's original vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
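# The listing stops after registering the special tokens. A minimal sketch of the
# usual continuation (an assumption, not part of the listing): keep every frequent
# token that the pre-trained vocabulary already contains, remapping it into the
# new, smaller id space.
for t in tokens:
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = Tokenizer(token_dict)  # tokenizer over the reduced vocabulary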
Example #2
train_samples = int(samples * TRAIN_SPLIT)
dev_samples = int(samples * DEV_SPLIT)
train_data, train_labels = all_data[:train_samples], all_labels[:train_samples]
dev_data = all_data[train_samples:train_samples + dev_samples]
dev_labels = all_labels[train_samples:train_samples + dev_samples]
test_data = all_data[train_samples + dev_samples:]
test_labels = all_labels[train_samples + dev_samples:]

# load the pre-trained model's vocabulary
_token_dict = load_vocab(DICT_PATH)
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)
print(all_data[0])                        # raw text of the first sample
print(_tokenizer.encode(all_data[0]))     # token ids and segment ids of the first sample
print(_tokenizer.tokenize(all_data[0]))   # tokens, including [CLS] and [SEP]
print([_tokenizer.id_to_token(21934)])    # map an id back to its token
print(_tokenizer.token_to_id('[PAD]'))    # map a token to its id

# count token frequencies over the dataset
counter = Counter()
for line in all_data:
    _tokens = _tokenizer.tokenize(line)
    # drop [CLS] and [SEP] before counting
    counter.update(_tokens[1:-1])
print(len(counter))
# remove low-frequency tokens
_tokens = [
    token for token, cnt in counter.items() if cnt >= MIN_WORD_FREQUENCY
]
print(len(_tokens))
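# A possible next step (a sketch, not part of this listing): turn the filtered
# `_tokens` into a reduced vocabulary plus the list of original ids to keep,
# mirroring the other examples on this page.
token_dict, keep_words = {}, []
for token in ['[PAD]', '[UNK]', '[CLS]', '[SEP]'] + _tokens:
    if token in _token_dict and token not in token_dict:
        token_dict[token] = len(token_dict)
        keep_words.append(_token_dict[token])
tokenizer = Tokenizer(token_dict, do_lower_case=True)  # tokenizer over the reduced vocabulary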
Example #3
            break
    if ignore_flag:
        continue
    # the length must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# vocabulary and tokenizer of the pre-trained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1
# filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items() if count >= min_word_frequency]
# sort by frequency, highest first
tokens = sorted(tokens, key=lambda x: -x[1])
# drop the counts, keeping only the token list
tokens = [token for token, count in tokens]

# build the new token->id mapping and the new keep-word list
token_id_dict = {}
keep_words = []

# add the special tokens to the dictionary first
for token in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_id_dict[token] = len(token_id_dict)
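    # (sketch) the listing is cut off here; the loop above presumably also keeps
    # each special token's original id, as in the other examples on this page:
    keep_words.append(_token_dict[token])

# then append the remaining high-frequency tokens the same way
for token in tokens:
    if token in _token_dict and token not in token_id_dict:
        token_id_dict[token] = len(token_id_dict)
        keep_words.append(_token_dict[token])

# rebuild the tokenizer over the reduced vocabulary
tokenizer = Tokenizer(token_id_dict, do_lower_case=True)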