Example #1
# Assumed imports for this excerpt (bert4keras-style; the exact module path may differ by version)
import pandas as pd
from bert4keras.tokenizers import Tokenizer, load_vocab

maxlen = 100
config_path = '/root/kg/bert/albert_base_zh/bert_config.json'
checkpoint_path = '/root/kg/bert/albert_base_zh/bert_model.ckpt'
dict_path = '/root/kg/bert/albert_base_zh/vocab.txt'

neg = pd.read_excel('datasets/neg.xls', header=None)
pos = pd.read_excel('datasets/pos.xls', header=None)
data, tokens = [], {}

_token_dict = load_vocab(dict_path)  # load the full pretrained vocabulary
_tokenizer = Tokenizer(_token_dict)  # temporary tokenizer over the full vocabulary

# Negative reviews are labelled 0; token frequencies are counted along the way
for d in neg[0]:
    data.append((d, 0))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

# Positive reviews are labelled 1
for d in pos[0]:
    data.append((d, 1))
    for t in _tokenizer.tokenize(d):
        tokens[t] = tokens.get(t, 0) + 1

tokens = {i: j for i, j in tokens.items() if j >= 4}  # keep only tokens seen at least 4 times
token_dict, keep_words = {}, []  # keep_words lists the row indices retained from the original BERT vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t in tokens:  # remap every frequent token that also exists in the original vocabulary
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])
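
The excerpt only builds the reduced vocabulary; how it is consumed is not shown. The sketch below is a hedged illustration, assuming a recent bert4keras release in which build_transformer_model accepts a keep_tokens argument (older releases expose the same idea under a different loader name): the new Tokenizer indexes the shrunken vocabulary, and keep_words tells the loader which rows of the pretrained embedding matrix to keep.

# Hedged usage sketch, not part of the original example
from bert4keras.models import build_transformer_model  # assumed import path

tokenizer = Tokenizer(token_dict)  # tokenizer over the reduced vocabulary

# keep_tokens slices the pretrained token embeddings down to the kept rows,
# so the model only carries the characters that actually occur in the corpus
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='albert',
    keep_tokens=keep_words,
)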
Example #2
    for t in txt.split('  '):
        for s in re.findall(u'.*?。', t):  # cut each chunk into sentences ending with '。'
            if len(s) <= maxlen - 2:  # leave room for [CLS] and [SEP]
                sents.append(s)
    novels.append(sents)

_token_dict = load_vocab(dict_path)  # load the full pretrained vocabulary
_tokenizer = Tokenizer(_token_dict)  # temporary tokenizer over the full vocabulary

if os.path.exists(lm_config):
    tokens = json.load(open(lm_config))  # reuse the cached token list
else:
    tokens = {}
    for novel in novels:
        for s in novel:
            for t in _tokenizer.tokenize(s):
                tokens[t] = tokens.get(t, 0) + 1
    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]  # drop rare tokens
    tokens = sorted(tokens, key=lambda t: -t[1])  # most frequent first
    tokens = [t[0] for t in tokens]
    json.dump(tokens,
              codecs.open(lm_config, 'w', encoding='utf-8'),
              indent=4,
              ensure_ascii=False)

token_dict, keep_words = {}, []  # keep_words lists the row indices retained from the original BERT vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])
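
As in Example #1, the reduced vocabulary built here is typically consumed by remapping the frequent tokens and constructing a tokenizer over them. The sketch below is an assumed continuation, not part of the original listing; the tokenizer variable name is an assumption.

# Hedged continuation sketch, mirroring Example #1
for t in tokens:  # tokens is the frequency-ranked list loaded or built above
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = Tokenizer(token_dict)  # tokenizer restricted to the reduced character set

# sanity check: every new id corresponds to one row kept from the pretrained embeddings
assert len(token_dict) == len(keep_words)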