def process_data(train_file, dev_file, test_file):
    chars = set()
    train_datas = read_data(train_file)
    dev_datas = read_data(dev_file)
    test_datas = read_data(test_file)
    for text1, text2, label in train_datas + dev_datas:
        chars.update(set(text1))
        chars.update(set(text2))
    _token_dict = load_vocab(dict_path)  # load the full pretrained vocab
    token_dict, keep_words = {}, []
    # always keep the special tokens
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])
    # keep only characters that actually appear in the data
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.append(_token_dict[c])
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer on the pruned vocab
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    return train_datas, dev_datas, test_datas, tokenizer, keep_words
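The keep_words list produced above is what later shrinks the pretrained embedding matrix: elsewhere in these notes it is passed straight into load_pretrained_model. A minimal sketch of that wiring, with placeholder file paths (the CSV names and the ALBERT config/checkpoint paths are illustrative only):

from bert4keras.bert import load_pretrained_model

# placeholder paths, for illustration only
config_path = 'albert_config_tiny.json'
checkpoint_path = 'albert_model.ckpt'

train_datas, dev_datas, test_datas, tokenizer, keep_words = process_data(
    'train.csv', 'dev.csv', 'test.csv')
albert_model = load_pretrained_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,  # only these token ids are kept, so the embedding table shrinks accordingly
    albert=True)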
def save_vocab(self, input_data, incremental_train=False):
    relationships = set()
    chars = set()
    for (text, triple), (entity_lists, rel) in input_data:
        chars.update(set(text))
        relationships.add(rel)
        relationships.update(set(p for s, p, o in triple))
    token_dict = load_vocab(dict_path)  # load the full pretrained vocab
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    # keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    rel2id = {rel: _id + 1 for _id, rel in enumerate(sorted(relationships))}
    rel2id['unk'] = 0
    if not incremental_train:
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)
        with open(os.path.join(model_save_path, 'rel2id.pkl'), "wb") as f:
            pickle.dump(rel2id, f)
    self.tokenizer, self.keep_words, self.rel2id = tokenizer, keep_words, rel2id
    return tokenizer, keep_words, rel2id
def get_correct_fn():
    save_path = join(MODEL_PATH, 'detect')
    token_dict = joblib.load(
        join(MODEL_PATH, 'train_pre_for_error_detect', 'token_dict.joblib'))
    tokenizer = SimpleTokenizer(token_dict)
    keep_words = joblib.load(
        join(MODEL_PATH, 'train_pre_for_error_detect', 'keep_words.joblib'))
    model = DetectModel(keep_words=keep_words)
    model.compile()
    model.model.load_weights(join(save_path, 'weights.hdf5'))
    checker = Statistics()

    def correct(error_text):
        text_tokens = tokenizer.tokenize(error_text, False, False)[:ec_cfg.max_seq_len - 2]
        tokens = list()
        tokens.append("[CLS]")
        for token in text_tokens:
            tokens.append(token)
        tokens.append("[SEP]")
        input_ids = [
            token_dict[c] if c in token_dict.keys() else token_dict['[UNK]']
            for c in tokens
        ]
        while len(input_ids) < ec_cfg.max_seq_len:
            input_ids.append(0)
        seg_ids = np.zeros_like(input_ids, dtype=int)
        ids, segs = [input_ids], [seg_ids]
        res = model.model.predict([ids, segs])[0][1:-1]
        # collect start positions and lengths of the predicted error spans
        begins_pred = []
        lengths_pred = []
        this_len = 0
        for i, r in enumerate(res):
            if np.argmax(r) > 0:
                if this_len == 0:
                    begins_pred.append(i)
                this_len += 1
            else:
                if this_len > 0:
                    lengths_pred.append(this_len)
                    this_len = 0
        else:
            # close a span that is still open when the loop ends
            if this_len > 0:
                lengths_pred.append(this_len)
        res_str = checker.correct(error_text, begins_pred, lengths_pred)
        return res_str

    return correct
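A usage sketch for the closure above (the misspelled input string is only an illustrative example):

correct = get_correct_fn()
print(correct('今天天汽不错'))  # hypothetical input with a typo; returns the corrected string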
def process_data(neg_file='datasets/neg.xls', pos_file='datasets/pos.xls'):
    neg = pd.read_excel(neg_file, header=None)
    pos = pd.read_excel(pos_file, header=None)
    chars = {}
    data = []
    for d in neg[0]:
        data.append((d, 0))
        for c in d:
            chars[c] = chars.get(c, 0) + 1
    for d in pos[0]:
        data.append((d, 1))
        for c in d:
            chars[c] = chars.get(c, 0) + 1
    # drop low-frequency characters
    chars = {i: j for i, j in chars.items() if j >= 4}
    _token_dict = load_vocab(dict_path)  # load the full pretrained vocab
    token_dict, keep_words = {}, set()
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.add(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])
    keep_words.add(max(keep_words) + 1)
    keep_words = list(keep_words)
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    if not os.path.exists('./random_order.json'):
        random_order = [i for i in range(len(data))]
        random.shuffle(random_order)
        json.dump(random_order, open('./random_order.json', 'w'), indent=4)
    else:
        random_order = json.load(open('./random_order.json'))
    # split into train / validation sets at a 9:1 ratio
    train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
    valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]
    return train_data, valid_data, tokenizer, keep_words
def save_vocab(self, save_path, process_data):
    chars = set()
    relationships = set()
    for text, relationship in process_data:
        words = split_text(text)
        chars.update(set(words))
        relationships.add(relationship)
    token_dict = load_vocab(dict_path)  # load the full pretrained vocab
    keep_chars = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    for char in chars:
        if not token_dict.get(char):
            # token_dict[char] = len(token_dict)
            keep_chars.append(char)
    # for char in keep_chars:
    #     if not token_dict.get(char):
    #         token_dict[char] = len(token_dict)
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    word2id = {
        word: id_ + len(keep_chars)
        for id_, word in enumerate(chars)
    }
    for _id, word in enumerate(keep_chars):
        word2id[word] = _id
    rel2id = {rel: _id for _id, rel in enumerate(relationships)}
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(save_path, 'word2id.pkl'), "wb") as f:
        pickle.dump(word2id, f)
    with open(os.path.join(save_path, 'rel2id.pkl'), "wb") as f:
        pickle.dump(rel2id, f)
    self.tokenizer, self.word2id, self.rel2id = tokenizer, word2id, rel2id
    return tokenizer, keep_words, word2id, rel2id
def save_vocab(self, save_path, process_data):
    flags = set()
    relationships = set()
    for old_word_flag, relationship in process_data:
        word_flag = []
        for word, flag in old_word_flag:
            # if flag[0] == 'B':
            #     flag = 'B-Shiyi'
            # elif flag[0] == 'I':
            #     flag = 'I-Shiyi'
            word_flag.append([word, flag])
        flags.update(set(flag for word, flag in word_flag))
        relationships.add(relationship)
    token_dict = load_vocab(dict_path)  # load the full pretrained vocab
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    keep_flags = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']
    flag2id = {
        label: id_ + len(keep_flags)
        for id_, label in enumerate(
            sorted(flags, key=lambda x: 0 if x == 'O' else 1))
    }
    for flag_id, flag in enumerate(keep_flags):
        flag2id[flag] = flag_id
    rel2id = {rel: _id for _id, rel in enumerate(relationships)}
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    with open(os.path.join(save_path, 'flag2id.pkl'), "wb") as f:
        pickle.dump(flag2id, f)
    with open(os.path.join(save_path, 'rel2id.pkl'), "wb") as f:
        pickle.dump(rel2id, f)
    self.tokenizer, self.flag2id, self.rel2id = tokenizer, flag2id, rel2id
    return tokenizer, keep_words, flag2id, rel2id
def save_vocab(self, model_save_path, process_data):
    chars = set()
    labels = set()
    for char_labels in process_data:
        for char, label in char_labels:
            chars.add(char)
            labels.add(label)
    _token_dict = load_vocab(dict_path)  # load the full pretrained vocab
    token_dict, keep_words = {}, set()
    for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
        token_dict[c] = len(token_dict)
        keep_words.add(_token_dict[c])
    for c in chars:
        if c in _token_dict:
            token_dict[c] = len(token_dict)
            keep_words.add(_token_dict[c])
    keep_words.add(max(keep_words) + 1)
    keep_words = list(keep_words)
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
        pickle.dump(tokenizer, f)
    with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
        pickle.dump(keep_words, f)
    # print('labels={}'.format(labels))
    # sorted: make sure the non-entity label 'O' gets id 0
    self.label2id = {
        label: id_
        for id_, label in enumerate(
            sorted(labels, key=lambda x: 0 if x == 'O' else 1))
    }
    print('label2id: {}'.format(self.label2id))
    with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
        pickle.dump(self.label2id, f)
    self.keep_words = keep_words
    self.tokenizer = tokenizer
def save_word2id_etc(self, datas, incremental_train=False):
    label_set = set()
    _token_dict = load_vocab(dict_path)  # load the full pretrained vocab
    # token_dict, keep_words = {}, set()
    token_dict = copy.deepcopy(_token_dict)
    # for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    #     token_dict[c] = len(token_dict)
    #     keep_words.add(_token_dict[c])
    for chars, label in datas:
        label_set.add(label)
        # for c in chars:
        #     if c in _token_dict:
        #         token_dict[c] = len(token_dict)
        #         keep_words.add(_token_dict[c])
    # keep_words.add(max(keep_words) + 1)
    # keep_words = list(keep_words)
    keep_words = list(set(token_dict.values()))
    tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
    label2id = {lab: i for i, lab in enumerate(list(label_set))}
    if not incremental_train:
        with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
            pickle.dump(tokenizer, f)
        with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
            pickle.dump(keep_words, f)
        with open(os.path.join(model_save_path, 'label2id.pkl'), "wb") as f:
            pickle.dump(label2id, f)
    return tokenizer, keep_words, label2id
import tensorflow as tf
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, " Type:", gpu.device_type)
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)])
print(tf.__version__)

base_path = 'D:\AI\Data\chinese_L-12_H-768_A-12\\'
config_path = base_path + 'bert_config.json'
checkpoint_path = base_path + 'bert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocab
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path)  # build the model and load the pretrained weights

# encoding test
token_ids, segment_ids = tokenizer.encode(u'语言模型')
print(model.predict([np.array([token_ids]), np.array([segment_ids])]))
import tensorflow as tf
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_virtual_device_configuration(
    gpus[0],
    [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=2048)])

base_path = 'D:\AI\Data\\albert_large_zh\\'
config_path = base_path + 'albert_config_large.json'
checkpoint_path = base_path + 'albert_model.ckpt'
dict_path = base_path + 'vocab.txt'

token_dict = load_vocab(dict_path)  # load the vocab
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path, albert=True)  # build the model and load the pretrained weights

token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
# mask out "技术"
token_ids[3] = token_ids[4] = token_dict['[MASK]']
# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the prediction is exactly "技术"
class Albert(object):
    def __init__(self, mode='inference', mode_='part', model_name=None, dataset_name=None):
        self.maxlen = 32
        self.albert_config_path = '/Data/public/Bert/albert_tiny_489k/albert_config_tiny.json'
        self.albert_checkpoint_path = '/Data/public/Bert/albert_tiny_489k/albert_model.ckpt'
        self.albert_dict_path = '/Data/public/Bert/albert_tiny_489k/vocab.txt'
        self.train_data_path = 'data/train_{}.csv'.format(dataset_name)
        self.dev_data_path = 'data/dev_{}.csv'.format(dataset_name)
        self.test_data_path = 'data/test_{}.csv'.format(dataset_name)
        # albert_tiny_250k.h5 works quite well
        # self.restore_model_path = 'saved_models/test_albert_tiny_{}.h5'.format(model_name)
        self.restore_model_path = '/Data/models/{}'.format(model_name)
        # albert
        self.albert_process_data(mode_)
        if mode == 'train':
            self.model = self._get_model()
            self.train()
        elif mode == 'inference':
            self._init_model()

    # TODO: keep_words should be removed in industrial/production scenarios
    def albert_process_data(self, mode='part'):
        _token_dict = load_vocab(self.albert_dict_path)  # load the full pretrained vocab
        # only keep characters that appear in the datasets
        if mode == 'part':
            train_df = pd.read_csv(self.train_data_path, names=['seq1', 'seq2', 'label'])
            valid_df = pd.read_csv(self.dev_data_path, names=['seq1', 'seq2', 'label'])
            test_df = pd.read_csv(self.test_data_path, names=['seq1', 'seq2', 'label'])
            # total data
            tmp_df = pd.concat([train_df, valid_df, test_df])
            chars = defaultdict(int)
            for _, tmp_row in tmp_df.iterrows():
                for tmp_char in tmp_row.seq1:
                    chars[tmp_char] += 1
                for tmp_char in tmp_row.seq2:
                    chars[tmp_char] += 1
            # drop low-frequency characters
            chars = {i: j for i, j in chars.items() if j >= 4}
            self.token_dict, self.keep_words = {}, []  # keep_words holds the token ids kept from the BERT vocab
            # always keep the special tokens
            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
            # keep only high-frequency characters that appear in the data
            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])
        elif mode == 'full':
            self.token_dict, self.keep_words = _token_dict, []
            for k in self.token_dict:
                self.keep_words.append(self.token_dict[k])
        self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer

    # data pre-processing operation
    def _data_preprocessing(self, sentence1, sentence2):
        X1, X2 = [], []
        for tmp_sent1, tmp_sent2 in zip(sentence1, sentence2):
            x1, x2 = self.tokenizer.encode(first=tmp_sent1[:self.maxlen],
                                           second=tmp_sent2[:self.maxlen])
            X1.append(x1)
            X2.append(x2)
        X1 = self._seq_padding(X1)
        X2 = self._seq_padding(X2)
        # X1 = pad_sequences(X1, maxlen=67, padding='post', truncating='post')
        # X2 = pad_sequences(X2, maxlen=67, padding='post', truncating='post')
        return X1, X2

    def _seq_padding(self, X, padding=0):
        L = [len(x) for x in X]
        ML = max(L)
        padded_sent = np.array([
            np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
            for x in X
        ])
        return padded_sent

    # prepare data for training
    def _prepare_data(self, data_path):
        data = pd.read_csv(data_path)
        sent_1 = data['sentence1'].values
        sent_2 = data['sentence2'].values
        label = data['label'].values
        X1_pad, X2_pad = self._data_preprocessing(sent_1, sent_2)
        # X1 = np.vstack((X1_pad, X2_pad))
        # X2 = np.vstack((X2_pad, X1_pad))
        # y_train = np.hstack((label, label))
        return X1_pad, X2_pad, label

    # albert for semantic matching, model architecture
    def _get_model(self):
        model = load_pretrained_model(
            self.albert_config_path,
            self.albert_checkpoint_path,
            keep_words=self.keep_words,  # keep only the tokens in keep_words, shrinking the original vocab
            albert=True)
        output = Lambda(lambda x: x[:, 0])(model.output)
        output = Dense(1, activation='sigmoid')(output)
        model = Model(model.input, output)
        return model

    # model training operation
    def train(self):
        # train data
        train_x1, train_x2, train_label = self._prepare_data(self.train_data_path)
        # dev data
        dev_x1, dev_x2, dev_label = self._prepare_data(self.dev_data_path)
        checkpoint = ModelCheckpoint(self.restore_model_path,
                                     monitor='val_accuracy',
                                     verbose=0,
                                     save_best_only=True,
                                     save_weights_only=False)
        early_stop = EarlyStopping(monitor='val_accuracy',
                                   patience=3,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)
        self.model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(1e-4),  # use a sufficiently small learning rate
            metrics=['accuracy'])
        self.model.summary()
        self.model.fit(x=[train_x1, train_x2],
                       y=train_label,
                       batch_size=64,
                       epochs=10,
                       verbose=1,
                       callbacks=[checkpoint, early_stop],
                       validation_data=([dev_x1, dev_x2], dev_label))

    # model predict operation
    def predict(self, sentence1, sentence2):
        X1, X2 = self._data_preprocessing(sentence1, sentence2)
        y_pred = self.model.predict([X1, X2], batch_size=1024)
        return y_pred

    def test(self):
        self.model.compile(
            loss='binary_crossentropy',
            optimizer=Adam(1e-4),  # use a sufficiently small learning rate
            metrics=['accuracy'])
        # test data
        test_x1, test_x2, test_label = self._prepare_data(self.dev_data_path)
        test_loss, test_acc = self.model.evaluate(x=[test_x1, test_x2], y=test_label)
        print('test loss: {}'.format(test_loss))
        print('test acc: {}'.format(test_acc))

    def _init_model(self):
        self.model = load_model(self.restore_model_path)
        sentence1 = '干嘛呢'
        sentence2 = '你是机器人'
        print('model albert loaded successfully. ({})'.format(
            self.predict([sentence1], [sentence2]).item()))
chars = {i: j for i, j in chars.items() if j >= 4}

_token_dict = load_vocab(dict_path)  # load the full pretrained vocab
token_dict, keep_words = {}, []
for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    token_dict[c] = len(token_dict)
    keep_words.append(_token_dict[c])
for c in chars:
    if c in _token_dict:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])

tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer

if not os.path.exists('./random_order.json'):
    random_order = list(range(len(data)))
    np.random.shuffle(random_order)
    json.dump(random_order, open('./random_order.json', 'w'), indent=4)
else:
    random_order = json.load(open('./random_order.json'))

# split into train / validation sets at a 9:1 ratio
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]


def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x
        for x in X
    ])
class SemanticModel():
    def __init__(self, batch_size=32, train=False):
        self.batch_size = batch_size
        if train:
            chars = set()
            train_datas = read_datas(TRAIN_DATA_FILE)
            dev_datas = read_datas(DEV_DATA_FILE)
            test_datas = read_datas(TEST_DATA_FILE)
            for text1, text2, label in itertools.chain(train_datas, dev_datas):
                chars.update(set(text1))
                chars.update(set(text2))
            _token_dict = load_vocab(dict_path)  # load the full pretrained vocab
            self.token_dict, self.keep_words = {}, []
            for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
                self.token_dict[c] = len(self.token_dict)
                self.keep_words.append(_token_dict[c])
            for c in chars:
                if c in _token_dict:
                    self.token_dict[c] = len(self.token_dict)
                    self.keep_words.append(_token_dict[c])
            self.tokenizer = SimpleTokenizer(self.token_dict)  # build the tokenizer
            with open(os.path.join(model_save_path, 'tokenizer.pkl'), "wb") as f:
                pickle.dump(self.tokenizer, f)
            with open(os.path.join(model_save_path, 'keep_words.pkl'), "wb") as f:
                pickle.dump(self.keep_words, f)
        else:
            with open(os.path.join(model_save_path, 'tokenizer.pkl'), "rb") as f:
                self.tokenizer = pickle.load(f)
            with open(os.path.join(model_save_path, 'keep_words.pkl'), "rb") as f:
                self.keep_words = pickle.load(f)
        self.model = self.make_model()

    def make_model(self):
        model = load_pretrained_model(config_path,
                                      checkpoint_path,
                                      keep_words=self.keep_words,
                                      albert=True)
        output = Lambda(lambda x: x[:, 0])(model.output)
        # print(output.shape)
        output = Dense(1, activation='sigmoid')(output)  # tanh, sigmoid, softmax
        model = Model(inputs=model.input, outputs=output)
        model.compile(
            loss='binary_crossentropy',  # categorical_crossentropy / binary_crossentropy
            optimizer=Adam(2e-6),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
            metrics=['accuracy'])
        model.summary()
        return model

    def generator_data(self, file_name):
        X1, X2, Y = [], [], []
        while True:
            for text1, text2, label in read_datas(file_name):
                text1 = text1[:INPUT_LENGTH]
                text2 = text2[:INPUT_LENGTH]
                text1 = unicodedata.normalize('NFKD', text1).strip().lower()
                text2 = unicodedata.normalize('NFKD', text2).strip().lower()
                x1, x2 = self.tokenizer.encode(first=text1, second=text2)
                y = int(label)
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                # Y.append(to_categorical(y))
                if len(X1) == self.batch_size:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    # print(X1.shape, X2.shape, Y.shape)
                    yield [X1, X2], Y
                    X1, X2, Y = [], [], []

    def train(self):
        early_stopping = EarlyStopping(monitor='val_loss', patience=3)
        model_checkpoint = ModelCheckpoint(
            filepath=os.path.join(
                model_save_path,
                'similarity-{epoch:02d}-{val_loss:.2f}-{val_acc:.3f}.hdf5'),
            save_best_only=True,
            save_weights_only=False)
        tb = TensorBoard(
            log_dir=log_dir,  # log directory
            histogram_freq=0,  # how often (in epochs) to compute histograms; 0 disables them
            batch_size=32,  # batch size used to compute the histograms
            write_graph=True,  # whether to write the model graph
            write_grads=False,  # whether to visualize gradient histograms
            write_images=False,  # whether to visualize model weights as images
            embeddings_freq=0,
            embeddings_layer_names=None,
            embeddings_metadata=None)
        hist = self.model.fit_generator(
            self.generator_data(TRAIN_DATA_FILE),
            steps_per_epoch=1000,
            epochs=100,
            validation_data=self.generator_data(DEV_DATA_FILE),
            validation_steps=100,
            callbacks=[early_stopping, model_checkpoint, tb])
        print(hist.history.items())

    def predict(self, text1, text2, weight_file='similarity-01-0.55-0.741.hdf5'):
        self.model.load_weights(os.path.join(model_save_path, weight_file),
                                by_name=True,
                                skip_mismatch=True,
                                reshape=True)
        text1 = text1[:INPUT_LENGTH]
        text2 = text2[:INPUT_LENGTH]
        text1 = unicodedata.normalize('NFKD', text1).strip().lower()
        text2 = unicodedata.normalize('NFKD', text2).strip().lower()
        x1, x2 = self.tokenizer.encode(first=text1, second=text2)
        X1 = seq_padding([x1])
        X2 = seq_padding([x2])
        ret = self.model.predict([X1, X2])
        return ret

    def batch_predict(self, question, database):
        text1 = question
        text1 = text1[:INPUT_LENGTH]
        X1, X2 = [], []
        for text2 in database:
            text2 = text2[:INPUT_LENGTH]
            text1 = unicodedata.normalize('NFKD', text1).strip().lower()
            text2 = unicodedata.normalize('NFKD', text2).strip().lower()
            x1, x2 = self.tokenizer.encode(first=text1, second=text2)
            X1.append(x1)
            X2.append(x2)
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        ret = self.model.predict([X1, X2])
        return ret
import os
from bert4keras.bert import load_pretrained_model
from bert4keras.utils import SimpleTokenizer, load_vocab
import numpy as np

albert_model_path = '/home/gswyhq/github_projects/albert_zh/albert_large_zh'
# albert_model_path = '/notebooks/albert_zh/albert_large_zh'
# https://storage.googleapis.com/albert_zh/albert_large_zh.zip
config_path = os.path.join(albert_model_path, 'albert_config_large.json')
checkpoint_path = os.path.join(albert_model_path, 'albert_model.ckpt')
dict_path = os.path.join(albert_model_path, 'vocab.txt')

token_dict = load_vocab(dict_path)  # load the vocab
tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer
model = load_pretrained_model(config_path, checkpoint_path, with_mlm=True)  # build the model and load the pretrained weights

# token_ids, segment_ids = tokenizer.encode(u'科学技术是第一生产力')
token_ids, segment_ids = tokenizer.encode(u'中国的首都是北京')
print('token_ids: {}, segment_ids: {}'.format(token_ids, segment_ids))

# mask out "技术" (for the original example sentence)
# token_ids[3] = token_ids[4] = token_dict['[MASK]']
token_ids[4] = token_ids[5] = token_dict['[MASK]']

# use the MLM head to predict the masked positions
probas = model.predict([np.array([token_ids]), np.array([segment_ids])])[0]
# print(tokenizer.decode(probas[3:5].argmax(axis=1)))  # the prediction is exactly "技术"
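The decode step is left commented out above; mirroring the probas[3:5] slice used in the earlier MLM snippet, the two masked positions here (indices 4 and 5) can be read back the same way, a minimal sketch:

# decode the masked positions (slice 4:6 here, analogous to 3:5 in the earlier
# MLM example); expected to recover the characters that were masked out
print(tokenizer.decode(probas[4:6].argmax(axis=1)))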
class AlbertClassify:
    def __init__(self, initial_model=True,
                 model_path=os.path.join(CONFIG['model_dir'], 'albert.h5')):
        self.initial_model = initial_model
        token_dict = load_vocab(DICT_PATH)
        self.tokenizer = SimpleTokenizer(token_dict)
        self.model_path = model_path
        if initial_model:
            self.albert_model = load_pretrained_model(
                CONFIG_PATH,
                CHECKPOINT_PATH,
                # keep_words=keep_words,
                albert=True)
        else:
            self.load(model_path)
        for l in self.albert_model.layers:
            l.trainable = True

    def train(self, train_data, valid_data):
        train_D = DataGenerator(train_data, self.tokenizer,
                                CONFIG['batch_size'], CONFIG['max_len'])
        valid_D = DataGenerator(valid_data, self.tokenizer,
                                CONFIG['batch_size'], CONFIG['max_len'])
        output = Lambda(lambda x: x[:, 0])(self.albert_model.output)
        output = Dense(1, activation='sigmoid')(output)
        self.model = Model(self.albert_model.input, output)
        save = ModelCheckpoint(os.path.join(self.model_path),
                               monitor='val_acc',
                               verbose=1,
                               save_best_only=True,
                               mode='auto')
        early_stopping = EarlyStopping(monitor='val_acc',
                                       min_delta=0,
                                       patience=3,
                                       verbose=1,
                                       mode='auto')
        callbacks = [save, early_stopping]
        if self.initial_model:
            x1_in = Input(shape=(None, ))
            x2_in = Input(shape=(None, ))
            x_in = self.albert_model([x1_in, x2_in])
            x_in = Lambda(lambda x: x[:, 0])(x_in)
            p = Dense(1, activation='sigmoid')(x_in)
            self.model = Model([x1_in, x2_in], p)
        else:
            self.model = self.albert_model
        self.model.compile(
            loss='binary_crossentropy',
            # optimizer=RAdam(1e-5),  # use a sufficiently small learning rate
            optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {
                1000: 1e-5,
                2000: 6e-5
            }),
            metrics=[
                'accuracy', process.get_precision, process.get_recall,
                process.get_f1
            ])
        self.model.summary()
        self.model.fit_generator(
            train_D.__iter__(),
            steps_per_epoch=len(train_D),
            epochs=CONFIG['epochs'],
            validation_data=valid_D.__iter__(),
            validation_steps=len(valid_D),
            callbacks=callbacks,
            use_multiprocessing=CONFIG['use_multiprocessing'],
        )

    def predict(self, test_data):
        """Predict on a list of texts.

        :param test_data: iterable of input strings
        :return: model predictions
        """
        X1 = []
        X2 = []
        for s in test_data:
            x1, x2 = self.tokenizer.encode(first=s[:CONFIG['max_len']])
            X1.append(x1)
            X2.append(x2)
        X1 = seq_padding(X1)
        X2 = seq_padding(X2)
        predict_results = self.model.predict([X1, X2])
        return predict_results

    def load(self, model_path):
        """Load the pre-trained model."""
        try:
            self.albert_model = load_model(str(model_path),
                                           custom_objects=get_custom_objects(),
                                           compile=False)
        except Exception as ex:
            print('load error')
        return self