for t in txt.split(' '):
    for s in re.findall(u'.*?。', t):
        if len(s) <= maxlen - 2:
            sents.append(s)
novels.append(sents)

_token_dict = load_vocab(dict_path)  # load the pretrained vocabulary
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

if os.path.exists(lm_config):
    tokens = json.load(open(lm_config))
else:
    # count token frequencies over all sentences
    tokens = {}
    for novel in novels:
        for s in novel:
            for t in _tokenizer.tokenize(s):
                tokens[t] = tokens.get(t, 0) + 1
    # drop low-frequency tokens, sort by frequency, keep only the token list
    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
    tokens = sorted(tokens, key=lambda t: -t[1])
    tokens = [t[0] for t in tokens]
    json.dump(tokens,
              codecs.open(lm_config, 'w', encoding='utf-8'),
              indent=4,
              ensure_ascii=False)

# keep_words is the list of token ids retained from the BERT vocabulary
token_dict, keep_words = {}, []

# add the special tokens first
for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
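
# A hedged sketch of the usual remainder of this step (not shown above; it
# mirrors the pattern already started, and `tokenizer` is an illustrative
# name): append the filtered corpus tokens to token_dict/keep_words, keeping
# only tokens that exist in the pretrained vocabulary, then rebuild a
# tokenizer over the reduced vocabulary.
for t in tokens:
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = Tokenizer(token_dict)  # tokenizer over the reduced vocabulary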
train_samples = int(samples * TRAIN_SPLIT)
dev_samples = int(samples * DEV_SPLIT)
train_data, train_labels = all_data[:train_samples], all_labels[:train_samples]
dev_data, dev_labels = all_data[train_samples:train_samples + dev_samples], \
    all_labels[train_samples:train_samples + dev_samples]
test_data, test_labels = all_data[train_samples + dev_samples:], all_labels[train_samples + dev_samples:]

# load the pretrained model's vocabulary
_token_dict = load_vocab(DICT_PATH)
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)

print(all_data[0])
print(_tokenizer.encode(all_data[0]))
print(_tokenizer.tokenize(all_data[0]))
print([_tokenizer.id_to_token(21934)])
print(_tokenizer.token_to_id('[PAD]'))

# count token frequencies over the dataset
counter = Counter()
for line in all_data:
    _tokens = _tokenizer.tokenize(line)
    # drop the [CLS] and [SEP] tokens before counting
    counter.update(_tokens[1:-1])
print(len(counter))

# drop tokens whose frequency is too low
_tokens = [
    token for token, cnt in counter.items()
    if cnt >= MIN_WORD_FREQUENCY
]
print(len(_tokens))
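
# A hedged sketch of why the frequency filtering is done (illustrative names:
# CONFIG_PATH and CHECKPOINT_PATH are placeholders not defined above, and this
# assumes a recent bert4keras whose build_transformer_model accepts a
# keep_tokens argument): the kept tokens are mapped back to their ids in the
# pretrained vocabulary so that only those embedding rows are loaded, which
# shrinks the embedding and output layers to the reduced vocabulary.
from bert4keras.models import build_transformer_model  # assumed import path

_keep = ['[PAD]', '[UNK]', '[CLS]', '[SEP]']
_keep += [t for t in _tokens if t not in _keep]
keep_words = [_token_dict[t] for t in _keep if t in _token_dict]

model = build_transformer_model(
    CONFIG_PATH,
    CHECKPOINT_PATH,
    keep_tokens=keep_words,  # load only the kept rows of the embedding matrix
)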
            break
    if ignore_flag:
        continue
    # the body must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# vocabulary and tokenizer of the pretrained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1

# filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items()
          if count >= min_word_frequency]
# sort by frequency, descending
tokens = sorted(tokens, key=lambda x: -x[1])
# drop the counts and keep only the token list
tokens = [token for token, count in tokens]

# build the new token -> id mapping and the reduced vocabulary
token_id_dict = {}
keep_words = []
# add the special tokens to the dictionary first
for token in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_id_dict[token] = len(token_id_dict)
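
# A hedged aside (illustrative only, not code from the original script): once
# keep_words is filled and the filtered `tokens` are appended to token_id_dict
# (the remainder of that construction is not shown here), the compact mapping
# can be inverted to decode ids generated by the poetry model back into text.
id_token_dict = {i: t for t, i in token_id_dict.items()}

def decode(ids):
    # drop the special tokens and join the remaining characters into a line of verse
    special = {'[PAD]', '[UNK]', '[CLS]', '[SEP]'}
    return ''.join(id_token_dict[i] for i in ids if id_token_dict[i] not in special)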