class ALBertEmbedding(ModelBase):
    '''Compute sentence vectors with ALBERT.'''

    def __init__(self,
                 config_path=const.ALBERT_CONFIG_PATH,
                 albert_checkpoint_path=const.ALBERT_CHECKPOINT_PATH,
                 dict_path=const.ALBERT_DICT_PATH,
                 train_mode=False):
        self.session = tf.Session()
        keras.backend.set_session(self.session)
        if train_mode:
            self.bert = build_bert_model(
                model='albert',
                config_path=config_path,
                checkpoint_path=albert_checkpoint_path,
                with_pool=True,
                return_keras_model=False,
            )
        else:
            self.bert = build_bert_model(
                model='albert',
                config_path=config_path,
                # checkpoint_path=albert_checkpoint_path,
                with_pool=True,
                return_keras_model=False,
            )
        self.encoder = keras.models.Model(self.bert.model.inputs,
                                          self.bert.model.outputs[0])
        self.tokenizer = Tokenizer(dict_path, do_lower_case=True)
        self.encoder.load_weights(albert_checkpoint_path, by_name=True)

    def init(self, words_list=None, update=True):
        if words_list is not None:
            token_ids_list, segment_ids_list = [], []
            for words in words_list:
                token_ids, segment_ids = self.tokenizer.encode(words)
                token_ids_list.append(token_ids)
                segment_ids_list.append(segment_ids)
            token_ids_list = sequence_padding(token_ids_list)
            segment_ids_list = sequence_padding(segment_ids_list)
            self.words_list_pre = self.encoder.predict([token_ids_list, segment_ids_list])
            self.words_list_pre = self._normalize(self.words_list_pre)
        return self

    def _predict(self, words):
        with self.session.as_default():
            with self.session.graph.as_default():
                token_ids, segment_ids = self.tokenizer.encode(words)
                pre = self.encoder.predict([np.array([token_ids]), np.array([segment_ids])])
                pre = self._normalize(pre)
                return pre

    # Sentence vector: returns similarity scores against the pre-encoded word list
    def predict(self, words):
        with self.session.as_default():
            with self.session.graph.as_default():
                token_ids, segment_ids = self.tokenizer.encode(words)
                pre = self.encoder.predict([np.array([token_ids]), np.array([segment_ids])])
                pre = self._normalize(pre)
                return np.dot(self.words_list_pre[:], pre[0])
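# Usage sketch (added, illustrative only): assumes the const.* paths point to a valid
# ALBERT release and that ModelBase needs no extra setup; the sentences are placeholders.
albert = ALBertEmbedding(train_mode=False)
albert.init(words_list=[u'今天天气不错', u'今天天气很好', u'今天股票跌了'])
scores = albert.predict(u'今天天气怎么样')  # dot products against the normalized candidates
best_match = int(np.argmax(scores))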
class SiameseDataGenerator(DataGenerator):
    """Data generator for SiameseBert.

    Each batch yields [q1_token_ids, q1_segment_ids, q2_token_ids, q2_segment_ids], labels.
    """

    def __init__(self, data_path: str, batch_size: int, maxlen: int, dict_path: str):
        super().__init__(data=self.__load_data(data_path), batch_size=batch_size)
        self._tokenizer = Tokenizer(dict_path, do_lower_case=True)
        self._maxlen = maxlen

    @staticmethod
    def __load_data(filename: str):
        D = []
        with open(filename, encoding='utf-8') as f:
            for line in f:
                category, text1, text2, label = line.strip().split(',')
                if category != 'category':  # skip the header row
                    D.append((text1, text2, int(label)))
        return D

    def __iter__(self, random=False):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        q1_batch_token_ids, q1_batch_segment_ids, q2_batch_token_ids, q2_batch_segment_ids, \
            batch_labels = [], [], [], [], []
        for i in idxs:
            text1, text2, label = self.data[i]
            q1_token_ids, q1_segment_ids = self._tokenizer.encode(
                text1, max_length=self._maxlen)
            q2_token_ids, q2_segment_ids = self._tokenizer.encode(
                text2, max_length=self._maxlen)
            q1_batch_token_ids.append(q1_token_ids)
            q2_batch_token_ids.append(q2_token_ids)
            q1_batch_segment_ids.append(q1_segment_ids)
            q2_batch_segment_ids.append(q2_segment_ids)
            batch_labels.append([label])
            if len(batch_labels) == self.batch_size or i == idxs[-1]:
                q1_batch_token_ids = sequence_padding(q1_batch_token_ids)
                q2_batch_token_ids = sequence_padding(q2_batch_token_ids)
                q1_batch_segment_ids = sequence_padding(q1_batch_segment_ids)
                q2_batch_segment_ids = sequence_padding(q2_batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [
                    q1_batch_token_ids, q1_batch_segment_ids,
                    q2_batch_token_ids, q2_batch_segment_ids
                ], batch_labels
                q1_batch_token_ids, q1_batch_segment_ids, q2_batch_token_ids, q2_batch_segment_ids, \
                    batch_labels = [], [], [], [], []
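# Usage sketch (added): the file name, vocab path and hyper-parameters below are
# placeholders, not taken from the original code. The CSV is expected to carry a
# "category,text1,text2,label" header, which __load_data skips.
train_generator = SiameseDataGenerator('train.csv', batch_size=32, maxlen=64,
                                       dict_path='vocab.txt')

def forever(generator):
    """Re-iterate the generator with shuffling so fit_generator never runs dry."""
    while True:
        for batch in generator.__iter__(random=True):
            yield batch

# model.fit_generator(forever(train_generator), steps_per_epoch=len(train_generator), epochs=5)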
def __init__(self,
             config_path=const.BERT_CONFIG_PATH,
             checkpoint_path=const.BERT_CHECKPOINT_PATH,
             dict_path=const.BERT_DICT_PATH,
             train_mode=False):
    self.session = tf.Session()
    keras.backend.set_session(self.session)
    self.bert = build_bert_model(
        config_path,
        checkpoint_path,
        with_pool='linear',
        # application='seq2seq',
        return_keras_model=False,
    )
    self.encoder = keras.models.Model(self.bert.model.inputs,
                                      self.bert.model.outputs[0])
    self.tokenizer = Tokenizer(dict_path, do_lower_case=True)
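# Usage sketch (added): assuming this __init__ belongs to a sentence-embedding wrapper
# instantiated as `emb` (the enclosing class is not shown), the pooled output can be
# taken as a sentence vector.
# token_ids, segment_ids = emb.tokenizer.encode(u'语言模型')
# vec = emb.encoder.predict([np.array([token_ids]), np.array([segment_ids])])[0]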
def build_model(mode='bert', filename='bert', lastfour=False, LR=1e-5, DR=0.2):
    path = '../data/External/' + filename + '/'
    config_path = path + 'bert_config.json'
    checkpoint_path = path + 'bert_model.ckpt'
    dict_path = path + 'vocab.txt'

    global tokenizer
    tokenizer = Tokenizer(dict_path, do_lower_case=True)

    bert = build_bert_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        with_pool=True,
        model=mode,
        return_keras_model=False,
    )

    if lastfour:
        # Pool the [CLS] position from the last four Transformer blocks
        # (layer offsets follow bert4keras's layer layout)
        model = Model(
            inputs=bert.model.input,
            outputs=[
                bert.model.layers[-3].get_output_at(0),
                bert.model.layers[-11].get_output_at(0),
                bert.model.layers[-19].get_output_at(0),
                bert.model.layers[-27].get_output_at(0),
            ]
        )
        output = model.outputs
        output1 = Lambda(lambda x: x[:, 0], name='Pooler1')(output[0])
        output2 = Lambda(lambda x: x[:, 0], name='Pooler2')(output[1])
        output3 = Lambda(lambda x: x[:, 0], name='Pooler3')(output[2])
        output4 = Lambda(lambda x: x[:, 0], name='Pooler4')(output[3])
        output = Concatenate(axis=1)([output1, output2, output3, output4])
    else:
        output = bert.model.output

    output = Dropout(rate=DR)(output)
    output = Dense(units=2, activation='softmax',
                   kernel_initializer=bert.initializer)(output)
    model = Model(bert.model.input, output)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=Adam(LR),
        metrics=['accuracy'],
    )
    return model
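# Usage sketch (added): `train_D` is a placeholder data generator, not part of the
# snippet above; hyper-parameters are illustrative only.
model = build_model(mode='bert', filename='bert', lastfour=True, LR=1e-5, DR=0.2)
model.summary()
# model.fit_generator(train_D.__iter__(), steps_per_epoch=len(train_D), epochs=3)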
train_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 != 0]
valid_data = [sogou_data[j] for i, j in enumerate(random_order) if i % 3 == 0]
train_data.extend(train_data)
train_data.extend(webqa_data)  # mix SogouQA and WebQA at a 2:1 ratio

# Load and simplify the vocabulary, then build the tokenizer
_token_dict = load_vocab(dict_path)  # read the vocabulary
token_dict, keep_words = {}, []  # keep_words lists the token ids kept from BERT's vocabulary

for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t, _ in sorted(_token_dict.items(), key=lambda s: s[1]):
    if t not in token_dict:
        if len(t) == 3 and (Tokenizer._is_cjk_character(t[-1])
                            or Tokenizer._is_punctuation(t[-1])):
            continue
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = Tokenizer(token_dict, do_lower_case=True)  # build the tokenizer


class data_generator(DataGenerator):
    """Data generator"""
    def __iter__(self, random=False):
        """Single-sample format: [CLS] passage [SEP] question [SEP] answer [SEP]"""
        idxs = list(range(len(self.data)))
        D.append((title, content))
    return D


# Load the datasets
train_data = load_data('/root/csl/train.tsv')
valid_data = load_data('/root/csl/val.tsv')
test_data = load_data('/root/csl/test.tsv')

# Load and simplify the vocabulary, then build the tokenizer
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startwith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict, do_lower_case=True)


class data_generator(DataGenerator):
    """Data generator"""
    def __iter__(self, random=False):
        idxs = list(range(len(self.data)))
        if random:
            np.random.shuffle(idxs)
        batch_token_ids, batch_segment_ids = [], []
        for i in idxs:
            title, content = self.data[i]
            token_ids, segment_ids = tokenizer.encode(content, title, max_length=maxlen)
#! -*- coding: utf-8 -*-
# Sanity check of the code: MLM

from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')

# mask out "技术"
token_ids[3] = token_ids[4] = tokenizer._token_dict['[MASK]']

# predict the masked positions with the MLM head
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the result is exactly "技术"
    txt = open(txt).read()
    txt = txt.decode('gbk', 'ignore')
    txt = txt.replace('\r', '').replace('\n', '')
    txt = txt.replace(u'整理制作,并提供下载', '')
    txt = re.sub(u'www.*?com', '', txt)
    txt = txt.replace(u'\u3000', ' ')
    sents = []
    for t in txt.split(' '):
        for s in re.findall(u'.*?。', t):
            if len(s) <= maxlen - 2:
                sents.append(s)
    novels.append(sents)

_token_dict = load_vocab(dict_path)  # read the vocabulary
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)  # build a temporary tokenizer

if os.path.exists(lm_config):
    tokens = json.load(open(lm_config))
else:
    tokens = {}
    for novel in novels:
        for s in novel:
            for t in _tokenizer.tokenize(s):
                tokens[t] = tokens.get(t, 0) + 1
    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
    tokens = sorted(tokens, key=lambda t: -t[1])
    tokens = [t[0] for t in tokens]
    json.dump(tokens,
              codecs.open(lm_config, 'w', encoding='utf-8'),
              indent=4,
#! -*- coding: utf-8 -*-
# Sanity check of the code: feature extraction

from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path, do_lower_case=True)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path)  # build the model and load weights

# Encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352  0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154  0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673  0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
min_count = 128
maxlen = 256
batch_size = 16
steps_per_epoch = 1000
epochs = 10000

# BERT configuration
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'

# Training samples: the THUCNews dataset, one txt file per sample.
txts = glob.glob('/root/thuctc/THUCNews/*/*.txt')

_token_dict = load_vocab(dict_path)  # read the vocabulary
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)  # build a temporary tokenizer

if os.path.exists(seq2seq_config):
    tokens = json.load(open(seq2seq_config))
else:
    def _batch_texts():
        texts = []
        for txt in txts:
            text = open(txt, encoding='utf-8').read()
            texts.append(text)
            if len(texts) == 100:
                yield texts
                texts = []
config_path = args.config_path
checkpoint_path = args.checkpoint_path
dict_path = args.dict_path
min_count = 0
max_input_len = args.max_input_len
max_output_len = args.max_output_len
batch_size = args.batch_size
epochs = args.epochs
topk = args.topk
train_data_path = args.train_data_path
val_data_path = args.val_data_path

token_dict = load_vocab(dict_path)  # read the vocabulary
tokenizer = Tokenizer(token_dict, do_lower_case=True)  # build the tokenizer
sep_id = tokenizer.encode('')[0][-1]

rouge = Rouge()

model = get_model(config_path, checkpoint_path, args.albert, args.lr)
evaluator = Evaluate(val_data_path, topk)
model.fit_generator(DataGenerator(train_data_path, batch_size),
                    epochs=epochs,
                    callbacks=[evaluator])
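# Note (added): with bert4keras's Tokenizer, encode('') returns the token ids of
# [CLS] and [SEP] with segment ids [0, 0], so the [-1] above picks out the [SEP] id.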
# train_data = load_data('../spo_data/train1.json')
# valid_data = load_data('../spo_data/dev1.json')
filep = '/code/field_all_train_test_architecture_change_17w/spo_data'
train_data = load_data(os.path.join(filep, 'train1.json'))
valid_data = load_data(os.path.join(filep, 'dev1.json'))

predicate2id, id2predicate = {}, {}
with codecs.open(os.path.join(filep, 'all_50_schemas')) as f:
    for l in f:
        l = json.loads(l)
        if l['predicate'] not in predicate2id:
            id2predicate[len(predicate2id)] = l['predicate']
            predicate2id[l['predicate']] = len(predicate2id)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def search(pattern, sequence):
    """Search for the sub-sequence `pattern` inside `sequence`.

    Returns the index of the first match, or -1 if not found.
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


class data_generator(DataGenerator):
    """Data generator
def read_texts():
    txts = glob.glob('../../thuctc/THUCNews/*/*.txt')
    np.random.shuffle(txts)
    for txt in txts:
        d = open(txt).read()
        d = d.decode('utf-8').replace(u'\u3000', ' ')
        d = d.split('\n')
        if len(d) > 1:
            title = d[0].strip()
            content = '\n'.join(d[1:]).strip()
            if len(title) <= max_output_len:
                yield content[:max_input_len], title


_token_dict = load_vocab(dict_path)  # read the vocabulary
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

if os.path.exists(seq2seq_config):
    tokens = json.load(open(seq2seq_config))
else:
    def _batch_texts():
        texts = []
        for text in read_texts():
            texts.extend(text)
            if len(texts) == 1000:
                yield texts
                texts = []
        if texts:
print(train[:10])

text = codecs.open('val.txt', encoding='utf-8')
for line in text.readlines():
    line = line.strip().replace(',', '').replace('.', '').replace(' ', '')
    valid.append(line)
print(valid[:10])

text = codecs.open('test.txt', encoding='utf-8')
for line in text.readlines():
    line = line.strip().replace(',', '').replace('.', '').replace(' ', '')
    test.append(line)
print(test[:10])

_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(_token_dict)

tokens = json.load(open('seq2seq_config.json', encoding='utf-8'))

token_dict, keep_words = {}, []
for t in ['[PAD]', '[UNK]', '[CLS]', '[SEP]']:
    token_dict[t] = len(token_dict)
    keep_words.append(_token_dict[t])

for t in tokens:
    if t in _token_dict and t not in token_dict:
        token_dict[t] = len(token_dict)
        keep_words.append(_token_dict[t])

tokenizer = Tokenizer(token_dict)
#! -*- coding: utf-8 -*-
# Sanity check of the code: feature extraction

from bert4keras.backend import keras
from bert4keras.bert import build_bert_model
from bert4keras.tokenizer import Tokenizer
import numpy as np

config_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_L-12_H-768_A-12/vocab.txt'

tokenizer = Tokenizer(dict_path)  # build the tokenizer
model = build_bert_model(config_path, checkpoint_path)  # build the model and load weights

# Encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')

print('\n ===== predicting =====\n')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
"""
Output:
[[[-0.63251007  0.2030236   0.07936534 ...  0.49122632 -0.20493352  0.2575253 ]
  [-0.7588351   0.09651865  1.0718756  ... -0.6109694   0.04312154  0.03881441]
  [ 0.5477043  -0.792117    0.44435206 ...  0.42449304  0.41105673  0.08222899]
  [-0.2924238   0.6052722   0.49968526 ...  0.8604137  -0.6533166
np.random.shuffle(all_labels)

# Split the dataset
samples = len(all_data)
train_samples = int(samples * TRAIN_SPLIT)
dev_samples = int(samples * DEV_SPLIT)
train_data, train_labels = all_data[:train_samples], all_labels[:train_samples]
dev_data, dev_labels = all_data[train_samples:train_samples + dev_samples], \
    all_labels[train_samples:train_samples + dev_samples]
test_data, test_labels = all_data[train_samples + dev_samples:], \
    all_labels[train_samples + dev_samples:]

# Load the pre-trained model's vocabulary
_token_dict = load_vocab(DICT_PATH)
_tokenizer = Tokenizer(_token_dict, do_lower_case=True)

print(all_data[0])
print(_tokenizer.encode(all_data[0]))
print(_tokenizer.tokenize(all_data[0]))
print([_tokenizer.id_to_token(21934)])
print(_tokenizer.token_to_id('[PAD]'))

# Count token frequencies over the dataset
counter = Counter()
for line in all_data:
    _tokens = _tokenizer.tokenize(line)
    # drop [CLS] and [SEP] when counting frequencies
    counter.update(_tokens[1:-1])
print(len(counter))

# Remove low-frequency tokens
_tokens = [
for txt in glob.glob('../金庸/*/*.txt'):
    txt = open(txt).read()
    txt = txt.decode('gbk', 'ignore')
    txt = txt.replace('\r', '').replace('\n', '')
    txt = txt.replace(u'整理制作,并提供下载', '')
    txt = re.sub(u'www.*?com', '', txt)
    txt = txt.replace(u'\u3000', ' ')
    sents = []
    for t in txt.split(' '):
        for s in re.findall(u'.*?。', t):
            if len(s) <= maxlen - 2:
                sents.append(s)
    novels.append(sents)

_token_dict = load_vocab(dict_path)  # read the vocabulary
_tokenizer = Tokenizer(_token_dict)  # build a temporary tokenizer

if os.path.exists(lm_config):
    tokens = json.load(open(lm_config))
else:
    tokens = {}
    for novel in novels:
        for s in novel:
            for t in _tokenizer.tokenize(s):
                tokens[t] = tokens.get(t, 0) + 1
    tokens = [(i, j) for i, j in tokens.items() if j >= min_count]
    tokens = sorted(tokens, key=lambda t: -t[1])
    tokens = [t[0] for t in tokens]
    json.dump(tokens,
              codecs.open(lm_config, 'w', encoding='utf-8'),
              indent=4,
    return dataset


if __name__ == '__main__':

    # Usage test
    from bert4keras.tokenizer import Tokenizer
    import json, glob, re
    import jieba_fast as jieba
    from tqdm import tqdm

    jieba.initialize()

    dict_path = '/home/spaces_ac_cn/chinese_roberta_wwm_ext_L-12_H-768_A-12/vocab.txt'
    tokenizer = Tokenizer(dict_path)

    def some_texts():
        for _ in range(2):  # iterate over the corpus twice
            filenames = glob.glob('/home/spaces_ac_cn/corpus/*/*/*')
            np.random.shuffle(filenames)
            for filename in filenames:
                with open(filename) as f:
                    for l in f:
                        l = json.loads(l)['text'].strip()
                        yield re.findall(u'.*?[\n。]+', l)

    def word_segment(text):
        return jieba.lcut(text)

    TD = TrainingDataset(tokenizer, word_segment, sequence_length=512)
    text = df['question'].tolist()
    label = df['tag'].tolist()
    label_dict = label_passer(label)
    for x, y in zip(text, label):
        D.append((x, label_dict[y]))
    return D, label_dict


# Load the dataset
train_data, label_dict = load_data("D:/Workstations/Baidu-QuestionDB-Classification/Data/Output/history.csv")

# Split the dataset
text_train, text_valid = train_test_split(train_data, random_state=2019, test_size=0.1)

# Build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=False)

# Load the pre-trained model
bert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    with_pool=True,
    return_keras_model=True,
    model='albert'
)

output = tf.keras.layers.Dropout(rate=0.1)(bert.output)
output = tf.keras.layers.Dense(units=3, activation='softmax', name='classifier')(output)
model = keras.models.Model(bert.input, output)
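# Training sketch (added): the original snippet stops after building the model, so the
# compile step below is an assumption; the loss assumes integer class ids from label_dict,
# and x_train/y_train/x_valid/y_valid are placeholder arrays, not defined above.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(1e-5),
              metrics=['accuracy'])
# model.fit(x_train, y_train, validation_data=(x_valid, y_valid), batch_size=32, epochs=3)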
    __, last_part = line.split(':')
    ignore_flag = False
    for dis_word in disallowed_words:
        if dis_word in last_part:
            ignore_flag = True
            break
    if ignore_flag:
        continue
    # the line must not exceed the maximum length
    if len(last_part) > max_len - 2:
        continue
    poetry.append(last_part)

# Vocabulary and tokenizer from the pre-trained model
_token_dict = load_vocab(dict_path)
_tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Count the frequency of every token
word_frequency_count = defaultdict(int)
for line in poetry:
    for t in _tokenizer.tokenize(line):
        word_frequency_count[t] += 1

# Filter out low-frequency tokens
tokens = [(token, count) for token, count in word_frequency_count.items()
          if count >= min_word_frequency]
# Sort by frequency
tokens = sorted(tokens, key=lambda x: -x[1])
# Drop the counts, keeping only the token list
tokens = [token for token, count in tokens]

# Build the new token->id mapping and the new vocabulary
token_id_dict = {}