import pandas as pd

# Tokenizer and load_vocab are assumed to come from bert4keras.tokenizers
# (recent releases); older releases exposed them from bert4keras.utils.
from bert4keras.tokenizers import Tokenizer, load_vocab

maxlen = 100
config_path = '/root/kg/bert/albert_base_zh/bert_config.json'
checkpoint_path = '/root/kg/bert/albert_base_zh/bert_model.ckpt'
dict_path = '/root/kg/bert/albert_base_zh/vocab.txt'

neg = pd.read_excel('datasets/neg.xls', header=None)
pos = pd.read_excel('datasets/pos.xls', header=None)

data, tokens = [], {}

_token_dict = load_vocab(dict_path)  # load the full BERT vocabulary
_tokenizer = Tokenizer(_token_dict)  # temporary tokenizer over the full vocabulary

# count token frequencies over the negative and positive corpora
for d in neg[0]:
    data.append((d, 0))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

for d in pos[0]:
    data.append((d, 1))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

# keep only tokens that occur at least 4 times
tokens = {i: j for i, j in tokens.items() if j >= 4}

token_dict, keep_words = {}, []  # keep_words lists the vocabulary rows kept from BERT

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t in tokens:
    # loop body restored from context: map each frequent token into the
    # trimmed vocabulary and remember its original row index
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])
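# --- a minimal sketch (not part of the excerpt above) of how the trimmed
# --- vocabulary is typically consumed; it assumes a recent bert4keras where
# --- build_transformer_model lives in bert4keras.models, accepts model='albert'
# --- and a keep_tokens list that shrinks the token embedding to the kept rows.
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model

tokenizer = Tokenizer(token_dict)  # tokenizer over the trimmed vocabulary

albert = build_transformer_model(
    config_path,
    checkpoint_path,
    model='albert',          # load ALBERT weights
    keep_tokens=keep_words,  # keep only the embedding rows selected above
)

# take the [CLS] vector and attach a binary sentiment head
output = keras.layers.Lambda(lambda x: x[:, 0])(albert.output)
output = keras.layers.Dense(2, activation='softmax')(output)
model = keras.models.Model(albert.input, output)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)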
# --- second excerpt: trimming the vocabulary for the language-model corpus ---
import codecs
import json
import os
import re

# (the code that opens each novel .txt file into `txt`, initializes `sents` and
#  `novels`, and defines `maxlen`, `min_count` and `lm_config` precedes this excerpt)
for t in txt.split(' '):
    for s in re.findall(u'.*?。', t):
        if len(s) <= maxlen - 2:
            sents.append(s)
novels.append(sents)

_token_dict = load_vocab(dict_path)  # load the full BERT vocabulary
_tokenizer = Tokenizer(_token_dict)  # temporary tokenizer over the full vocabulary

if os.path.exists(lm_config):
    tokens = json.load(open(lm_config))
else:
    tokens = {}
    for novel in novels:
        for s in novel:
            for t in _tokenizer.tokenize(s):
                tokens[t] = tokens.get(t, 0) + 1
    # keep tokens that occur at least min_count times, most frequent first,
    # and cache the result so the counting pass only runs once
    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
    tokens = sorted(tokens, key=lambda t: -t[1])
    tokens = [t[0] for t in tokens]
    json.dump(
        tokens,
        codecs.open(lm_config, 'w', encoding='utf-8'),
        indent=4,
        ensure_ascii=False,
    )

token_dict, keep_words = {}, []  # keep_words lists the vocabulary rows kept from BERT

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
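# --- a hedged sketch (not in the excerpt) of the natural continuation: fold the
# --- frequency-filtered tokens into the trimmed vocabulary, mirroring the
# --- special-token loop above, then encode a few sentences with the reduced
# --- dict; sequence_padding is assumed to live in bert4keras.snippets.
from bert4keras.snippets import sequence_padding

for t in tokens:
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = Tokenizer(token_dict)  # tokenizer restricted to the kept vocabulary

# encode a small batch of sentences and pad them to a common length
batch_token_ids, batch_segment_ids = [], []
for s in novels[0][:4]:
    token_ids, segment_ids = tokenizer.encode(s)
    batch_token_ids.append(token_ids)
    batch_segment_ids.append(segment_ids)

batch_token_ids = sequence_padding(batch_token_ids)
batch_segment_ids = sequence_padding(batch_segment_ids)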