def test_legacy(self):
    opt = AdamWarmup(
        decay_steps=10000,
        warmup_steps=5000,
        learning_rate=1e-3,
    )
    if not TF_KERAS:
        # Exercise the legacy `lr` alias: read via the getter, write via the setter.
        opt.lr = opt.lr
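# A minimal sketch (an assumption, not the library source) of the legacy `lr`
# alias the test above exercises: a property that forwards to `learning_rate`,
# so older code that reads or assigns `opt.lr` keeps working.
class _LegacyLRAliasSketch:
    def __init__(self, learning_rate):
        self.learning_rate = learning_rate

    @property
    def lr(self):
        return self.learning_rate

    @lr.setter
    def lr(self, value):
        self.learning_rate = value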
def test_fit_embed(self):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(
        input_shape=(None,),
        input_dim=5,
        output_dim=16,
        mask_zero=True,
    ))
    model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=8)))
    model.add(keras.layers.Dense(units=2, activation='softmax'))
    model.compile(
        AdamWarmup(
            decay_steps=10000,
            warmup_steps=5000,
            learning_rate=1e-3,
            min_lr=1e-4,
            amsgrad=True,
            weight_decay=1e-3,
        ),
        loss='sparse_categorical_crossentropy',
    )
    x = np.random.randint(0, 5, (1024, 15))
    y = (x[:, 1] > 2).astype('int32')
    model.fit(x, y, epochs=10, verbose=1)
    model_path = os.path.join(
        tempfile.gettempdir(),
        'test_warmup_%f.h5' % np.random.random(),
    )
    model.save(model_path)
    from tensorflow.python.keras.utils.generic_utils import CustomObjectScope
    # Workaround for incorrect global variable used in keras
    with CustomObjectScope({'AdamWarmup': AdamWarmup}):
        keras.models.load_model(
            model_path,
            custom_objects={'AdamWarmup': AdamWarmup},
        )
def test_fit_embed(self):
    model = keras.models.Sequential()
    model.add(keras.layers.Embedding(
        input_shape=(None,),
        input_dim=5,
        output_dim=16,
        mask_zero=True,
    ))
    model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=8)))
    model.add(keras.layers.Dense(units=2, activation='softmax'))
    model.compile(
        AdamWarmup(
            decay_steps=10000,
            warmup_steps=5000,
            lr=1e-3,
            min_lr=1e-4,
            amsgrad=True,
            weight_decay=1e-3,
        ),
        loss='sparse_categorical_crossentropy',
    )
    x = np.random.randint(0, 5, (1024, 15))
    y = (x[:, 1] > 2).astype('int32')
    model.fit(x, y, epochs=10)
    model_path = os.path.join(
        tempfile.gettempdir(),
        'test_warmup_%f.h5' % np.random.random(),
    )
    model.save(model_path)
    keras.models.load_model(model_path, custom_objects={'AdamWarmup': AdamWarmup})
def Graph(total_steps, warmup_steps, lr=1e-3, min_lr=1e-5):
    with graph.as_default():
        bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path)
        for l in bert_model.layers:
            l.trainable = True

        x_in = Input(shape=(None,))
        c_in = Input(shape=(None,))
        start_in = Input(shape=(None,))
        end_in = Input(shape=(None,))
        x, c, start, end = x_in, c_in, start_in, end_in

        # 1. where the token id is non-zero (real token), 0. at padding.
        x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x)
        x = bert_model([x, c])

        # Answer-start branch: self-attention block with residual connections.
        x_s = Attention(16, 48)([x, x, x, x_mask, x_mask])
        x_s = Lambda(lambda x: x[0] + x[1])([x, x_s])
        x_s = LayerNormalization()(x_s)
        x_s_co = Dense(768, use_bias=False)(x_s)
        x_s_out = Lambda(lambda x: x[0] + x[1])([x_s, x_s_co])
        x_s_out = LayerNormalization()(x_s_out)
        x_s_out = Lambda(lambda x: x[0] * x[1])([x_s_out, x_mask])
        ps1 = Dense(1, use_bias=False)(x_s_out)
        # Mask out padding by pushing its logits down by 1e10.
        ps1 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10)([ps1, x_mask])

        # Answer-end branch, same structure.
        x_e = Attention(16, 48)([x, x, x, x_mask, x_mask])
        x_e = Lambda(lambda x: x[0] + x[1])([x, x_e])
        x_e = LayerNormalization()(x_e)
        x_e_co = Dense(768, use_bias=False)(x_e)
        x_e_out = Lambda(lambda x: x[0] + x[1])([x_e, x_e_co])
        x_e_out = LayerNormalization()(x_e_out)
        x_e_out = Lambda(lambda x: x[0] * x[1])([x_e_out, x_mask])
        ps2 = Dense(1, use_bias=False)(x_e_out)
        ps2 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10)([ps2, x_mask])

        test_model = Model([x_in, c_in], [ps1, ps2])
        train_model = Model([x_in, c_in, start_in, end_in], [ps1, ps2])

        loss1 = K.mean(K.categorical_crossentropy(start_in, ps1, from_logits=True))
        # Forbid end positions before the start position.
        ps2 -= (1 - K.cumsum(start, 1)) * 1e10
        loss2 = K.mean(K.categorical_crossentropy(end_in, ps2, from_logits=True))
        loss = loss1 + loss2

        train_model.add_loss(loss)
        train_model.compile(optimizer=AdamWarmup(total_steps, warmup_steps, lr, min_lr))
        train_model.summary()
    return train_model, test_model
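# Why the `- (1 - mask) * 1e10` trick above works: padded positions get a
# logit of roughly -1e10, so softmax assigns them ~0 probability. A minimal
# NumPy sketch (illustration only, not part of any snippet):
import numpy as np

logits = np.array([2.0, 1.0, 0.5, 0.0])
mask = np.array([1.0, 1.0, 0.0, 0.0])  # last two positions are padding
masked = logits - (1.0 - mask) * 1e10  # padding logits -> ~-1e10
probs = np.exp(masked - masked.max())
probs /= probs.sum()                   # padding probabilities come out ~0.0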
def train(self):
    x_trn, y_trn = self.train_data['text'][:].values, self.train_data['label'][:].values
    x_val, y_val = self.dev_data['text'][:].values, self.dev_data['label'][:].values
    x_test, y_test = self.test_data['text'][:].values, self.test_data['label'][:].values
    folds, batch_size, steps, max_len = 5, 16, 30, 300
    y_vals_vote = np.zeros(len(y_val))
    patient, best_score = 0, 0  # `patient` was used before assignment in the original
    model = self.create_model()
    total_steps, warmup_steps = calc_train_steps(
        num_example=x_trn.shape[0],
        batch_size=batch_size,
        epochs=steps,
        warmup_proportion=0.2,
    )
    adamwarmup = AdamWarmup(total_steps, warmup_steps, lr=1e-4, min_lr=1e-6)
    losses = [self.distil_loss(), self.distil_loss(soft=True, T=self.T)]
    if self.Mode == 'patient':
        losses.extend([self.distil_loss(soft=True, T=self.T)])
    elif self.Mode == 'patient.full':
        losses.extend([
            self.distil_loss(soft=True, T=self.T),
            self.distil_loss(soft=True, T=self.T),
            self.distil_loss(soft=True, T=self.T),
        ])
    model.compile(loss=losses, optimizer=adamwarmup)
    x1_val_tok, x2_val_tok = sentence2token(x_val, max_len=max_len)
    knowledge = self.teacher
    logit = np.array(knowledge['logit'])
    feature10 = np.array(knowledge['layer_10'])
    feature11 = np.array(knowledge['layer_11'])
    feature12 = np.array(knowledge['layer_12'])
    for epoch in range(steps):
        # ==========train=========== #
        generator = batch_iter(x_trn, y_trn, logit, feature10, feature11, feature12,
                               max_len=max_len, batch_size=batch_size)
        for x1_tok, x2_tok, log, feat10, feat11, feat12, lab in generator:
            outputs = [np.eye(2)[lab], log]
            if self.Mode == 'patient':
                outputs.extend([feat12])
            elif self.Mode == 'patient.full':
                outputs.extend([feat10, feat11, feat12])
            model.train_on_batch([x1_tok, x2_tok], outputs)
        # ==========eval=========== #
        y_val_pre = model.predict([x1_val_tok, x2_val_tok])[0]
        y_val_vote = np.argmax(y_val_pre, -1)  # index of the max value is the prediction
        f1, auc, acc, recall = score(y_val, y_val_vote)
        # ==========EarlyStop=========== #
        if f1 > best_score:
            patient = 0
            best_score = f1
            y_vals_vote = y_val_vote
            model.save_weights('models/distil_bert_model')
        print('epoch:{}, f1:{}, auc:{}, acc:{}, recall:{}, best_score:{}'.format(
            epoch, f1, auc, acc, recall, best_score))
        patient += 1
        if patient >= 5:
            break
    # ========== load the best model and predict the test set =========== #
    model.load_weights('models/distil_bert_model')
    x1_test_tok, x2_test_tok = sentence2token(x_test, max_len=max_len)
    predict = np.argmax(model.predict([x1_test_tok, x2_test_tok])[0], -1)
    print('final dev score: ', score(y_val, y_vals_vote))
    print('final test score: ', score(y_test, predict))
def test_fit_amsgrad(self):
    self._test_fit(AdamWarmup(
        decay_steps=10000,
        warmup_steps=5000,
        learning_rate=1e-3,
        min_lr=1e-4,
        amsgrad=True,
        weight_decay=1e-3,
    ))
def test_fit(self):
    self._test_fit(AdamWarmup(
        decay_steps=10000,
        warmup_steps=5000,
        lr=1e-3,
        min_lr=1e-4,
        amsgrad=False,
        weight_decay=1e-3,
    ))
def _get_opt(num_example, warmup_proportion=0.1, lr=2e-5, min_lr=None):
    total_steps, warmup_steps = calc_train_steps(
        num_example=num_example,
        batch_size=B_SIZE,
        epochs=MAX_EPOCH,
        warmup_proportion=warmup_proportion,
    )
    opt = AdamWarmup(total_steps, warmup_steps, lr=lr, min_lr=min_lr)
    if cfg.get("accum_step", None) and cfg["accum_step"] > 1:
        print("[!] using accum_step = {}".format(cfg["accum_step"]))
        from accum_optimizer import AccumOptimizer
        opt = AccumOptimizer(opt, steps_per_update=cfg["accum_step"])
    return opt
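# Design note (an assumption about the wrapped optimizer, not verified here):
# AccumOptimizer presumably accumulates gradients for `steps_per_update`
# batches before applying a single weight update, emulating an effective
# batch size of B_SIZE * accum_step on memory-constrained hardware while the
# AdamWarmup schedule above stays computed from the per-batch step counts.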
def get_opt(num_example, warmup_proportion=0.1, lr=2e-5, min_lr=None):
    if cfg["opt"].lower() == "nadam":
        opt = Nadam(lr=lr)
    else:
        total_steps, warmup_steps = calc_train_steps(
            num_example=num_example,
            batch_size=B_SIZE,
            epochs=MAX_EPOCH,
            warmup_proportion=warmup_proportion,
        )
        opt = AdamWarmup(total_steps, warmup_steps, lr=lr, min_lr=min_lr)
    return opt
def create_optimizer(num_example, options):
    total_steps, warmup_steps = calc_train_steps(
        num_example=num_example,
        batch_size=options.batch_size,
        epochs=options.num_train_epochs,
        warmup_proportion=options.warmup_proportion,
    )
    optimizer = AdamWarmup(
        total_steps,
        warmup_steps,
        lr=options.learning_rate,
        epsilon=1e-6,
        weight_decay=0.01,
        weight_decay_pattern=['embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'],
    )
    return optimizer
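# A minimal sketch (an assumption, not the library source) of the arithmetic
# calc_train_steps is used for throughout these snippets: total optimizer
# steps are batches-per-epoch times epochs, and warmup takes the first
# `warmup_proportion` of them.
import math

def sketch_calc_train_steps(num_example, batch_size, epochs, warmup_proportion=0.1):
    steps_per_epoch = math.ceil(num_example / batch_size)
    total_steps = steps_per_epoch * epochs
    warmup_steps = int(total_steps * warmup_proportion)
    return total_steps, warmup_steps

# e.g. 1024 examples, batch 16, 10 epochs -> (640, 64) with the default proportion.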
def test_fit(self):
    x = np.random.standard_normal((1000, 5))
    y = np.dot(x, np.random.standard_normal((5, 2))).argmax(axis=-1)
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(
        units=2,
        input_shape=(5,),
        kernel_constraint=keras.constraints.MaxNorm(1000.0),
        activation='softmax',
    ))
    model.compile(
        optimizer=AdamWarmup(
            decay_steps=10000,
            warmup_steps=5000,
            lr=1e-3,
            min_lr=1e-4,
            amsgrad=True,
            weight_decay=1e-3,
        ),
        loss='sparse_categorical_crossentropy',
    )
    model.fit(
        x, y,
        batch_size=10,
        epochs=110,
        callbacks=[
            keras.callbacks.EarlyStopping(monitor='loss', min_delta=1e-4, patience=3)
        ],
    )
    if not EAGER_MODE:
        model_path = os.path.join(
            tempfile.gettempdir(),
            'keras_warmup_%f.h5' % np.random.random(),
        )
        model.save(model_path)
        model = keras.models.load_model(
            model_path,
            custom_objects={'AdamWarmup': AdamWarmup},
        )
    results = model.predict(x).argmax(axis=-1)
    diff = np.sum(np.abs(y - results))
    self.assertLess(diff, 100)
def model_build(len_train):
    global NUM_CLASSES
    global BATCH_SIZE
    global NUM_EPOCHS
    global MIN_LR
    global LR
    bert_model = load_trained_model_from_checkpoint(
        config_path, checkpoint_path, seq_len=MAXLEN, trainable=True)
    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))
    aux_in = Input(shape=(2,))
    inputs = bert_model([x1_in, x2_in])
    bert = Lambda(lambda x: x[:, 0])(inputs)  # take the [CLS] vector
    dense = concatenate([bert, aux_in])
    outputs = Dense(NUM_CLASSES, activation='softmax')(dense)
    model = Model([x1_in, x2_in, aux_in], outputs)
    decay_steps, warmup_steps = calc_train_steps(
        len_train,
        batch_size=BATCH_SIZE,
        epochs=NUM_EPOCHS,
    )
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=AdamWarmup(
            decay_steps=decay_steps,
            warmup_steps=warmup_steps,
            lr=LR,
            min_lr=MIN_LR,
        ),
        metrics=['sparse_categorical_accuracy'],
    )
    del bert_model
    gc.collect()
    return model
def compile_model(self, data_size, loss_fn, metrics):
    inputs = self.pretrained_model.inputs[:2]
    dense = self.pretrained_model.get_layer('NSP-Dense').output
    outputs = keras.layers.Dense(units=2, activation='softmax')(dense)
    decay_steps, warmup_steps = calc_train_steps(
        data_size,
        batch_size=self.batch_size,
        epochs=self.epochs,
    )
    model = keras.models.Model(inputs, outputs)
    model.compile(
        AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=self.lr),
        loss=loss_fn,
        metrics=[metrics],
    )
    self.model = model
    print(self.model.summary())
    return self.model
def Graph(total_steps, warmup_steps, lr=1e-3, min_lr=1e-5):
    with graph.as_default():
        x_in = Input(shape=(None,))  # variable-length token ids
        c_in = Input(shape=(None,))
        start_in = Input(shape=(None,))
        end_in = Input(shape=(None,))
        x, c, start, end = x_in, c_in, start_in, end_in

        # Expand to (None, len, 1), compare with 0 to get a boolean table,
        # then cast True/False to 1./0.
        x_mask = Lambda(lambda x: K.cast(K.greater(K.expand_dims(x, 2), 0), 'float32'))(x)

        # Load the pretrained model.
        bert = load_trained_model_from_checkpoint(config_path, checkpoint_path)
        for l in bert.layers:
            l.trainable = True

        # Produce token representations.
        x = bert([x, c])

        # Start index.
        ps1 = Dense(1, use_bias=False)(x)
        # Apply the mask: push padded positions down by 1e10.
        ps1 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10)([ps1, x_mask])
        # End index.
        ps2 = Dense(1, use_bias=False)(x)
        ps2 = Lambda(lambda x: x[0][..., 0] - (1 - x[1][..., 0]) * 1e10)([ps2, x_mask])

        test_model = Model([x_in, c_in], [ps1, ps2])
        train_model = Model([x_in, c_in, start_in, end_in], [ps1, ps2])

        loss_1 = K.mean(K.categorical_crossentropy(start_in, ps1, from_logits=True))
        # Forbid end positions before the start position.
        ps2 -= (1 - K.cumsum(start, 1)) * 1e10
        loss_2 = K.mean(K.categorical_crossentropy(end_in, ps2, from_logits=True))
        loss = loss_1 + loss_2

        train_model.add_loss(loss)
        train_model.compile(
            optimizer=AdamWarmup(total_steps, warmup_steps, min_lr=min_lr, lr=lr))
        train_model.summary()
    return train_model, test_model
def build(model, num, lr=0.00002):
    # @title Build Custom Model
    from tensorflow.python import keras
    from keras_bert import AdamWarmup, calc_train_steps

    inputs = model.inputs[:2]
    dense = model.get_layer('NSP-Dense').output
    outputs = keras.layers.Dense(units=len(le.classes_), activation='softmax')(dense)
    decay_steps, warmup_steps = calc_train_steps(
        num,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
    )
    model = keras.models.Model(inputs, outputs)
    for x in range(len(model.layers)):
        # print(x)
        model.layers[x].trainable = True
    '''
    model.layers[-3].trainable = True
    model.layers[-4].trainable = True
    model.layers[-5].trainable = True
    model.layers[-6].trainable = True
    model.layers[-7].trainable = True
    '''
    model.layers[-1].trainable = True
    model.layers[-2].trainable = True
    model.compile(
        AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=lr),
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy'],
    )
    return model
def main():
    args = get_args()
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    np.random.seed(args.seed)
    if args.verbose:
        log.basicConfig(level=log.DEBUG, stream=sys.stdout)
    else:
        log.basicConfig(level=log.INFO, stream=sys.stdout)
    log.info('\n' + tabulate(sorted(vars(args).items())))
    set_logger(os.path.join(args.output_dir, args.log_file))
    pick_device()

    data = load_instances(args.dataset, args.label_col)
    classes = list(sorted(set(data[args.label_col])))
    args.n_classes = len(classes)
    token_dict = load_vocabulary(args.vocab_file)
    tokenizer = Tokenizer(token_dict)

    if args.do_train:
        folds = [i for i in args.train_dataset.split(',')]
        train_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        train_generator = TextDataFrameIterator(
            dataframe=train_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=True,
            seq_len=args.max_seq_length,
            seed=args.seed,
            do_lower_case=args.do_lower_case,
        )
        folds = [i for i in args.val_dataset.split(',')]
        val_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        val_generator = TextDataFrameIterator(
            dataframe=val_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case,
        )
        total_steps, warmup_steps = calc_train_steps(
            num_example=len(train_df),
            batch_size=args.batch_size,
            epochs=args.epochs,
            warmup_proportion=args.warmup_proportion,
        )
        model = get_model(args)
        earlystop = callbacks.EarlyStopping(
            monitor='val_loss', min_delta=K.epsilon(),
            patience=args.earlystop, verbose=1, mode='auto')
        best_checkpoint = callbacks.ModelCheckpoint(
            os.path.join(args.output_dir, args.best_model),
            save_best_only=True, save_weights_only=False,
            monitor='val_loss', mode='min', verbose=1)
        csv_logger = callbacks.CSVLogger(os.path.join(args.output_dir, args.csv_logger))
        callbacks_list = [earlystop, best_checkpoint, csv_logger]
        optimizer = AdamWarmup(
            decay_steps=total_steps,
            warmup_steps=warmup_steps,
            lr=args.learning_rate,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            min_lr=1e-5,
            weight_decay=0.01,
            weight_decay_pattern=['embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'],
        )
        model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
        cw = get_class_weights(data, args.label_col, train_generator.class_indices)
        model.fit_generator(
            train_generator,
            class_weight=cw,
            use_multiprocessing=False,
            workers=args.workers,
            callbacks=callbacks_list,
            epochs=args.epochs,
            validation_data=val_generator,
            verbose=1)

    if args.do_test:
        folds = [i for i in args.test_dataset.split(',')]
        test_df = data[data['fold'].isin(folds)].reset_index(drop=True)
        test_generator = TextDataFrameIterator(
            dataframe=test_df,
            tokenizer=tokenizer,
            classes=classes,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case,
        )
        print('Load from %s' % os.path.join(args.output_dir, args.best_model))
        model = load_model(os.path.join(args.output_dir, args.best_model),
                           custom_objects=get_custom_objects())
        # model.summary()
        y_score = model.predict_generator(
            test_generator,
            use_multiprocessing=False,
            workers=args.workers,
            verbose=1)
        y_pred = np.argmax(y_score, axis=1)
        pred_df = pd.DataFrame(y_score, columns=classes)
        pred_df = pred_df.assign(predictions=[classes[lbl] for lbl in y_pred])
        y_true = test_df.loc[:, args.label_col].values
        y_pred = pred_df['predictions'].values
        report = pmetrics.classification_report(y_true, y_pred, classes=classes)
        print(report.summary())
        # print('auc', pmetrics.auc(y_true, y_score, y_column=1)[0])
        result = pd.concat([test_df, pred_df], axis=1)
        result.to_csv(os.path.join(args.output_dir, args.test_predictions), index=False)

    if args.do_predict:
        test_df = load_instances(args.pred_dataset, args.label_col)
        test_generator = TextDataFrameIterator(
            dataframe=test_df,
            tokenizer=tokenizer,
            classes=None,
            x_col=args.text_col,
            y_col=args.label_col,
            batch_size=args.batch_size,
            shuffle=False,
            seq_len=args.max_seq_length,
            do_lower_case=args.do_lower_case,
        )
        print('Load from %s' % os.path.join(args.output_dir, args.best_model))
        model = load_model(os.path.join(args.output_dir, args.best_model),
                           custom_objects=get_custom_objects())
        # model.summary()
        y_score = model.predict_generator(
            test_generator,
            use_multiprocessing=False,
            workers=args.workers,
            verbose=1)
        y_pred = np.argmax(y_score, axis=1)
        pred_df = pd.DataFrame(y_score, columns=classes)
        pred_df = pred_df.assign(predictions=[classes[lbl] for lbl in y_pred])
        result = pd.concat([test_df, pred_df], axis=1)
        result.to_csv(os.path.join(args.output_dir, args.pred_predictions), index=False)

    if args.do_debug:
        for dataset in [args.train_dataset, args.val_dataset, args.test_dataset]:
            folds = [i for i in dataset.split(',')]
            print('folds:', folds)
            sub_df = data[data['fold'].isin(folds)]
            generator = TextDataFrameIterator(
                dataframe=sub_df,
                tokenizer=tokenizer,
                x_col=args.text_col,
                y_col=args.label_col,
                batch_size=args.batch_size,
                shuffle=False,
                seq_len=args.max_seq_length,
            )
            for i, ([tokens, _], labels) in enumerate(generator):
                print(tokens.shape, type(tokens), labels.shape, type(labels))
                if i == 2:
                    break
def main():
    seq_id, seq_O, seq_P, id_to_label, id_to_term = encode_seq(
        df_label=df_label, maxlen=MAX_LEN)

    class Evaluation(Callback):
        def __init__(self, val_data, interval=1):
            self.val_data = val_data
            self.interval = interval
            self.best_f1 = 0.
            self.true_vp_val = [
                (row["id"], row["OpinionTerms"], row["Polarities"],
                 row['O_start'], row['O_end'])
                for rowid, row in df_label[
                    df_label['id'].isin(self.val_data[0])].iterrows()
            ]

        def on_epoch_end(self, epoch, log={}):
            if epoch % self.interval == 0:
                o_out, p_out = pred_model.predict(
                    self.val_data[1:4], batch_size=BATCH_SIZE)  # CRF probabilities
                o_pred = np.argmax(o_out, axis=2)
                p_pred = np.argmax(p_out, axis=2)
                texts = [
                    df_review[df_review['id'] == i]["Reviews"].values[0]
                    for i in self.val_data[0]
                ]
                pred_vp_val = decode_seq(self.val_data[0], o_pred, p_pred,
                                         id_to_label, texts)
                precision, recall, f1 = cal_opinion_metrics(pred_vp_val,
                                                            self.true_vp_val)
                if f1 > self.best_f1:
                    self.best_f1 = f1
                    self.model.save_weights('./model_op/op_model_0924_viteb.weights')
                    print(f'best = {f1}')

    tokenizer = BertTokenizer(token_dict)
    seq_input, seq_seg = bert_text_to_seq(
        list(df_review["Reviews"]), tokenizer, maxlen=MAX_LEN)
    true_vp = [(row["id"], row["OpinionTerms"], row["Polarities"],
                row['O_start'], row['O_end'])
               for rowid, row in df_label.iterrows()]
    pred_vp = decode_seq(seq_id, seq_O, seq_P, id_to_label, list(df_review["Reviews"]))
    cal_opinion_metrics(pred_vp, true_vp)
    seq_O = to_categorical(seq_O)
    seq_P = to_categorical(seq_P)

    df_review['pos_tag'] = df_review['Reviews'].progress_apply(pos_tag)
    with open('./data/postag2id_0922_laptop_make_up.pkl', 'rb') as f:
        postag2id = pickle.load(f)
    df_review['pos_tag'] = df_review['pos_tag'].progress_apply(
        lambda postag: [postag2id[x] for x in postag])
    seq_postag = np.array(df_review['pos_tag'].values.tolist())

    view_train, view_val = split_viewpoints(
        seq_id, seq_input, seq_seg, seq_O, seq_P, seq_postag)
    print(view_val[0])
    print('------------------- saving validation-set ids ---------------------')
    print('saving the final validation ids')
    # np.save('./data/final_makeup_laptop_val_ids', view_val[0])
    print('------------------- done saving ---------------------------')
    # exit()

    bert_model = load_trained_model_from_checkpoint(
        config_path, checkpoint_path, seq_len=None)
    for l in bert_model.layers:
        l.trainable = True

    x1_in = Input(shape=(MAX_LEN,), name='x1_in')
    x2_in = Input(shape=(MAX_LEN,), name='x2_in')
    o_in = Input(shape=(MAX_LEN, len(id_to_term) + 1), name='o_in')
    p_in = Input(shape=(MAX_LEN, len(id_to_label) + 1), name='p_in')
    pos_tag_in = Input(shape=(MAX_LEN,), name='pos_tag_in')
    pos_tag_emb = Embedding(len(postag2id), POS_TAG_DIM, trainable=True)(pos_tag_in)

    x = bert_model([x1_in, x2_in])
    x = Concatenate()([x, pos_tag_emb])
    p_out = Dense(len(id_to_label) + 1, activation='softmax')(x)  # polarity output
    crf = CRF(len(id_to_term) + 1)
    o_out = crf(x)

    # Adding this via a Lambda layer directly breaks the computation graph.
    loss_seq_O = crf.loss_function(o_in, o_out)
    loss_seq_O = Lambda(lambda x: K.mean(x))(loss_seq_O)
    # loss_seq_O = Lambda(lambda x: K.mean(categorical_crossentropy(x[0], x[1])), name='loss_seq_O')([o_in, o_out])
    loss_p = Lambda(lambda x: K.mean(categorical_crossentropy(x[0], x[1])),
                    name='loss_c')([p_in, p_out])

    train_model = Model([x1_in, x2_in, pos_tag_in, o_in, p_in], [o_out, p_out])
    pred_model = Model([x1_in, x2_in, pos_tag_in], [o_out, p_out])
    train_model._losses = []
    train_model._per_input_losses = {}
    train_model.add_loss(loss_seq_O)
    train_model.add_loss(loss_p)
    print(view_train[0].shape[0])

    total_steps, warmup_steps = calc_train_steps(
        num_example=view_train[0].shape[0],
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        warmup_proportion=0.1,
    )
    # optimizer = Adam(lr=1e-5)
    optimizer = AdamWarmup(total_steps, warmup_steps, lr=5e-5, min_lr=1e-6)
    train_model.compile(optimizer=optimizer)
    train_model.metrics_tensors.append(loss_seq_O)
    train_model.metrics_names.append('loss_seq_O')
    train_model.metrics_tensors.append(loss_p)
    train_model.metrics_names.append('loss_p')
    train_model.summary()

    eval_callback = Evaluation(val_data=view_val)
    train_model.fit(view_train[1:],
                    epochs=EPOCHS,
                    shuffle=True,
                    batch_size=BATCH_SIZE,
                    callbacks=[eval_callback])
def build_model(args):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    K.set_session(tf.Session(config=config))

    print("Loading model..")
    custom_objects = get_custom_objects()
    bert_model = load_model(args.model, custom_objects=custom_objects)
    for layer in bert_model.layers:
        layer.trainable = False

    input_features = [Input(shape=(get_label_dim(args.train),))
                      for _ in args.features_train]
    stacked = Lambda(lambda x: K.stack(x, axis=1))([bert_model.output, *input_features])
    stacked = Permute((2, 1), name="stack_permute")(stacked)
    output_layer = TimeDistributed(Dense(1, activation="tanh", name="decision"))(stacked)
    output_layer = Flatten(name="time_distributed_flatten")(output_layer)
    output_layer = Activation("softmax")(output_layer)

    # The bert model has multiple inputs, so unpack those.
    model = Model([*bert_model.input, *input_features], output_layer)
    if args.gpus > 1:
        template_model = model
        model = multi_gpu_model(template_model, gpus=args.gpus)

    callbacks = [Metrics()]
    if args.patience > -1:
        callbacks.append(EarlyStopping(patience=args.patience, verbose=1))
    if args.checkpoint_interval > 0:
        callbacks.append(ModelCheckpoint(args.output_file + ".checkpoint-{epoch}",
                                         period=args.checkpoint_interval))

    total_steps, warmup_steps = calc_train_steps(
        num_example=get_example_count(args.train),
        batch_size=args.batch_size,
        epochs=args.epochs,
        warmup_proportion=0.01)
    optimizer = AdamWarmup(total_steps, warmup_steps, lr=args.lr)
    model.compile(loss=["categorical_crossentropy"], optimizer=optimizer, metrics=[])
    print(model.summary(line_length=118))
    print("Number of GPUs in use:", args.gpus)
    print("Batch size:", args.batch_size)
    print("Learning rate:", args.lr)
    print("Dropout:", args.dropout)

    model.fit_generator(
        data_generator(args.train, args.batch_size,
                       seq_len=args.seq_len, features=args.features_train),
        steps_per_epoch=ceil(get_example_count(args.train) / args.batch_size),
        use_multiprocessing=True,
        epochs=args.epochs,
        callbacks=callbacks,
        validation_data=data_generator(args.dev, args.eval_batch_size,
                                       seq_len=args.seq_len,
                                       features=args.features_dev),
        validation_steps=ceil(get_example_count(args.dev) / args.eval_batch_size))

    print("Saving model:", args.output_file)
    if args.gpus > 1:
        template_model.save(args.output_file)
    else:
        model.save(args.output_file)
def main(argv):
    args = argparser().parse_args(argv[1:])
    bert, vocab = load_pretrained(args)
    tokenizer = Tokenizer(vocab, cased=not args.do_lower_case)
    labels, train_sents, dev_sents, test_sents = load_data(args)
    train_data = create_examples(train_sents, tokenizer, labels, args)
    dev_data = create_examples(dev_sents, tokenizer, labels, args)
    test_data = create_examples(test_sents, tokenizer, labels, args)

    output = Dense(len(labels), activation='softmax')(bert.output)
    model = Model(inputs=bert.inputs, outputs=output)
    model.summary(line_length=80)

    train_input = np.array([e.input_ids for e in train_data])
    train_in_mask = np.array([e.input_mask for e in train_data])
    train_segments = np.array([e.segment_ids for e in train_data])
    train_output = np.expand_dims(np.array([e.label_ids for e in train_data]), -1)
    train_head_flags = np.array([e.head_flags for e in train_data])

    total_steps, warmup_steps = calc_train_steps(
        num_example=len(train_input),
        batch_size=args.train_batch_size,
        epochs=args.num_train_epochs,
        warmup_proportion=0.1,
    )
    optimizer = AdamWarmup(
        total_steps,
        warmup_steps,
        lr=args.learning_rate,
        weight_decay=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        weight_decay_pattern=['embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'],
        min_lr=0  # TODO
    )
    model.compile(
        loss='sparse_categorical_crossentropy',
        sample_weight_mode='temporal',
        optimizer=optimizer
    )

    dev_input = np.array([e.input_ids for e in dev_data])
    dev_in_mask = np.array([e.input_mask for e in dev_data])
    dev_segments = np.array([e.segment_ids for e in dev_data])
    dev_output = np.expand_dims(np.array([e.label_ids for e in dev_data]), -1)
    dev_head_flags = np.array([e.head_flags for e in dev_data])

    train_start = datetime.now()
    print('start training at', train_start)
    train_cb = EvaluationCallback(
        'train', train_input, train_segments, train_output, train_head_flags)
    dev_cb = EvaluationCallback(
        'dev', dev_input, dev_segments, dev_output, dev_head_flags)
    callbacks = [train_cb, dev_cb]
    model.fit(
        [train_input, train_segments],
        train_output,
        sample_weight=train_in_mask,
        batch_size=args.train_batch_size,
        epochs=args.num_train_epochs,
        verbose=1,
        callbacks=callbacks
    )
    train_end = datetime.now()
    print('done training', train_end, 'time', train_end - train_start)

    if args.predict is not None:
        if args.predict == 'dev':
            pred_data, pred_sents = dev_data, dev_sents
        else:
            assert args.predict == 'test'
            pred_data, pred_sents = test_data, test_sents
        pred_input = np.array([e.input_ids for e in pred_data])
        pred_segments = np.array([e.segment_ids for e in pred_data])
        pred = model.predict([pred_input, pred_segments], verbose=1)
        pred_tokens = [[t for t, _ in s] for s in pred_sents]
        pred_head_flags = np.array([e.head_flags for e in pred_data])
        write_predictions(pred_tokens, pred_input, pred_head_flags,
                          pred, vocab, labels, args.output)

    print('best dev result', dev_cb.best, 'for epoch', dev_cb.best_epoch)
    return 0
# base_model.fit([train_sentence_indices], train_tags,
#                validation_data=([devel_sentence_indices], devel_tags),
#                batch_size=batch_size, epochs=50, verbose=1)
print("Loading BERT")
total_steps, warmup_steps = calc_train_steps(
    num_example=len(train_sentences),
    batch_size=batch_size,
    epochs=10,
    warmup_proportion=0.1,
)
print(total_steps, warmup_steps)
optimizer = AdamWarmup(5 * total_steps, warmup_steps, lr=2e-5, min_lr=2e-7,
                       weight_decay=weight_decay)
# import pdb; pdb.set_trace()
bert_model = load_trained_model_from_checkpoint(
    config_path, checkpoint_path, training=False, trainable=True,
    seq_len=max_sequence_len)
# bert_model, _ = build_model_from_config(config_path, training=False, trainable=True, seq_len=max_sequence_len)
# bert_model.summary(line_length=120)
if use_crf:
    # prediction_layer = layers.Dense(768, activation='tanh')(bert_model.output)
    prediction_layer = CRF(len(tag_dict))  # call truncated in the source snippet
# The head of this snippet was cut off in extraction; the orphaned fragment
# `trainable=True, seq_len=SEQ_LEN,` is the tail of the usual
# load_trained_model_from_checkpoint call, reconstructed here:
model = load_trained_model_from_checkpoint(
    config_path,
    checkpoint_path,
    training=True,
    trainable=True,
    seq_len=SEQ_LEN,
)
inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output
outputs = keras.layers.Dense(units=1, activation='sigmoid')(dense)
model = keras.models.Model(inputs, outputs)
total_steps, warmup_steps = calc_train_steps(
    num_example=x_train[0].shape[0],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    warmup_proportion=0.1,
)
optimizer = AdamWarmup(total_steps, warmup_steps, lr=1e-4, min_lr=LR)
model.compile(
    RAdam(LR),  # as in the source: compiled with RAdam, so the AdamWarmup above goes unused
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Initialize any variables the checkpoint did not cover.
sess = K.get_session()
uninitialized_variables = set(
    i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())
)
init_op = tf.variables_initializer([
    v for v in tf.global_variables()
    if v.name.split(':')[0] in uninitialized_variables
])
def manual_train():
    # frac = args.frac
    args = get_args()
    fold = args.fold
    EPOCHS = args.epochs
    BATCH_SIZE = 128
    LR = 1e-4

    with timed_bolck(f'Prepare train data#{BATCH_SIZE}'):
        X, y, _ = get_train_test_bert()

        # Begin to define model
        from keras_bert import load_trained_model_from_checkpoint
        model = load_trained_model_from_checkpoint(
            config_path, checkpoint_path, training=True, seq_len=SEQ_LEN)

        from tensorflow.python import keras
        from keras_bert import AdamWarmup, calc_train_steps
        inputs = model.inputs[:2]
        dense_bert = model.get_layer('NSP-Dense').output
        decay_steps, warmup_steps = calc_train_steps(
            y.shape[0],
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
        )

        # New input from manual features
        data = get_feature_bert_wv().add_prefix('fea_')
        manual_fea_len = len([col for col in data.columns if col.startswith('fea_')])
        logger.info(f'manual_fea_len:{manual_fea_len}')
        manual_feature = keras.Input(shape=(manual_fea_len,),
                                     name='manual_feature', dtype='float32')
        inputs = inputs + [manual_feature]
        manual_feature = keras.layers.Dense(round(num_classes * 0.6),
                                            name='manual_dense',
                                            activation='relu')(manual_feature)
        manual_feature = keras.layers.Dropout(0.5)(manual_feature)
        # manual_feature = keras.layers.Dense(round(num_classes), activation='relu')(manual_feature)
        fc_ex = keras.layers.concatenate([dense_bert, manual_feature], axis=1)
        # End input from manual features

        # fc_ex = keras.layers.Dense(units=1024, activation='softmax')(fc_ex)
        outputs = keras.layers.Dense(units=num_classes, activation='softmax')(fc_ex)
        model = keras.models.Model(inputs, outputs)
        model.compile(
            AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
            loss='categorical_crossentropy',
            metrics=['accuracy'],
        )
        model.summary(line_length=120)
        # End to define model

        input1_col = [col for col in X.columns if str(col).startswith('bert_')]
        input3_col = [col for col in X.columns if str(col).startswith('fea_')]
        # max_words = len(input1_col)
        # model = get_model(max_words)
        # get_feature_manual.cache_clear()
        Y_cat = keras.utils.to_categorical(y, num_classes=num_classes)
        # folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)

    with timed_bolck(f'Training#{fold}'):
        from core.split import split_df_by_index
        train_idx, test_idx = split_df_by_index(X, fold)
        logger.info(f'Shape train_x.loc[:, input1_col].iloc[:,0]: '
                    f'{X.loc[:, input1_col].iloc[:, 0].shape}')
        train_x, train_y, val_x, val_y = \
            X.iloc[train_idx], Y_cat[train_idx], X.iloc[test_idx], Y_cat[test_idx]
        logger.info(f'get_train_test output: train_x:{train_x.shape}, '
                    f'train_y:{train_y.shape}, val_x:{val_x.shape} ')

        # for sn in range(5):
        input1 = train_x.loc[:, input1_col]  # .astype(np.float32)
        input2 = np.zeros_like(input1)       # .astype(np.int8)
        input3 = train_x.loc[:, input3_col]
        logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}, Input3:{input3.shape}')
        logger.info(f'NN train_x:{train_x[:3]}')

        from keras_bert import get_custom_objects
        import tensorflow as tf
        with tf.keras.utils.custom_object_scope(get_custom_objects()):
            his = model.fit(
                [input1, input2, input3], train_y,
                validation_data=([
                    val_x.loc[:, input1_col],
                    np.zeros_like(val_x.loc[:, input1_col]),
                    val_x.loc[:, input3_col],
                ], val_y),
                epochs=EPOCHS,
                shuffle=True,
                batch_size=64,
                callbacks=[Cal_acc(val_x, y.iloc[test_idx])],
                # steps_per_epoch=1000, validation_steps=10
            )
        # gen_sub(model, X_test, sn)
    return his
def train(self):
    x_trn, y_trn = self.train_data['text'][:].values, self.train_data['label'][:].values
    x_val, y_val = self.dev_data['text'][:].values, self.dev_data['label'][:].values
    x_test, y_test = self.test_data['text'][:].values, self.test_data['label'][:].values
    folds, batch_size, steps, max_len = 5, 16, 30, 300
    y_vals = np.zeros((len(x_val), 2))
    y_vals_vote = np.zeros(len(x_val))
    y_test_pre = np.zeros((len(x_test), 2))
    knowledge_dict = dict()

    model = self.create_model()
    total_steps, warmup_steps = calc_train_steps(
        num_example=x_trn.shape[0],
        batch_size=batch_size,
        epochs=steps,
        warmup_proportion=0.2)
    adamwarmup = AdamWarmup(total_steps, warmup_steps, lr=1e-5, min_lr=1e-7)
    # Note: compiled with plain Adam as in the source; `adamwarmup` goes unused here.
    model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-5))
    model.save_weights('origin')

    patient, best_score = 0, -1
    x1_trn_tok, x2_trn_tok = sentence2token(x_trn, max_len=max_len)
    x1_val_tok, x2_val_tok = sentence2token(x_val, max_len=max_len)
    for epoch in range(steps):
        # ==========train=========== #
        generator = batch_iter(x_trn, y_trn, max_len=max_len, batch_size=batch_size)
        for x1_tok, x2_tok, lab in generator:
            model.train_on_batch([x1_tok, x2_tok], np.eye(2)[lab])
        # ==========eval=========== #
        y_val_pre = model.predict([x1_val_tok, x2_val_tok])
        y_val_vote = np.argmax(y_val_pre, -1)  # index of the max value is the prediction
        f1, auc, acc, recall = score(y_val, y_val_vote)
        # ==========EarlyStop=========== #
        if f1 > best_score:
            patient = 0
            best_score = f1
            y_vals_vote = y_val_vote
            y_vals = y_val_pre
            model.save_weights('weight')
            # ========= save knowledge ==========
            knowledge_dict = self.save_knowlege(x1_trn_tok, x2_trn_tok, model,
                                                knowledge_dict)
        print('epoch:{}, f1:{}, auc:{}, acc:{}, recall:{}, best_score:{}'.format(
            epoch, f1, auc, acc, recall, best_score))
        patient += 1
        if patient >= 5:
            break
    # ========== load the best model and predict the test set =========== #
    model.load_weights('weight')
    x1_test_tok, x2_test_tok = sentence2token(x_test, max_len=max_len)
    predict = np.argmax(model.predict([x1_test_tok, x2_test_tok]), -1)
    print('final dev score: ', score(y_val, y_vals_vote))
    print('final test score: ', score(y_test, predict))
    # return y_test_vote, y_vals_vote, y_test, y_vals
    with open("teacher_knowledge.json", "w") as f:
        json.dump(knowledge_dict, f)
def manual_train():
    # frac = args.frac
    args = get_args()
    fold = args.fold
    EPOCHS = args.epochs
    BATCH_SIZE = 32
    LR = 1e-4

    with timed_bolck(f'Prepare train data#{BATCH_SIZE}'):
        X, y, _ = get_train_test_bert()

        # Begin to define model
        from keras_bert import load_trained_model_from_checkpoint
        model_bert = load_trained_model_from_checkpoint(
            config_path, checkpoint_path, training=True, seq_len=SEQ_LEN)
        # model_right = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=True, seq_len=SEQ_LEN)

        from tensorflow.python import keras
        from keras_bert import AdamWarmup, calc_train_steps
        app_des = model_bert.inputs[:2]
        dense_app_des = model_bert.get_layer('NSP-Dense').output
        model_bert = keras.models.Model(inputs=app_des, outputs=dense_app_des,
                                        name='bert_output')

        inputs = [
            keras.models.Input(shape=(SEQ_LEN,), name=f'INPUT-{name}')
            for name in range(4)
        ]
        left = model_bert(inputs[:2])
        right = model_bert(inputs[2:])

        decay_steps, warmup_steps = calc_train_steps(
            y.shape[0],
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
        )
        fc_ex = keras.layers.concatenate([left, right], axis=1)
        # fc_ex = keras.layers.Subtract()([left, right])
        # End input from manual features
        # outputs = keras.layers.Dense(units=8, activation='softmax')(fc_ex)
        outputs = keras.layers.Dense(units=1, activation='sigmoid')(fc_ex)
        model = keras.models.Model(inputs, outputs)
        model.compile(
            AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )
        model.summary(line_length=120)
        # End to define model

        input1_col = [col for col in X.columns if str(col).startswith('bert_')]
        input3_col = [col for col in X.columns if str(col).startswith('fea_')]
        # max_words = len(input1_col)
        # model = get_model(max_words)
        Y_cat = y

    with timed_bolck(f'Training#{fold}'):
        from core.split import split_df_by_index_no_bin
        train_idx, test_idx = split_df_by_index_no_bin(X, fold)
        logger.info(f'Shape train_x.loc[:, input1_col].iloc[:,0]: '
                    f'{X.loc[:, input1_col].iloc[:, 0].shape}')
        train_x, train_y, val_x, val_y = \
            X.iloc[train_idx], Y_cat[train_idx], X.iloc[test_idx], Y_cat[test_idx]
        logger.info(f'get_train_test output: train_x:{train_x.shape}, '
                    f'train_y:{train_y.shape}, val_x:{val_x.shape} ')

        # for sn in range(5):
        input1 = train_x.loc[:, input1_col]  # .astype(np.float32)
        input2 = np.zeros_like(input1)       # .astype(np.int8)
        input3 = train_x.loc[:, input3_col]
        input4 = np.zeros_like(input3)
        logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}, Input3:{input3.shape}')
        logger.info(f'NN train_x:{train_x[:3]}')

        from keras_bert import get_custom_objects
        import tensorflow as tf
        with tf.keras.utils.custom_object_scope(get_custom_objects()):
            his = model.fit(
                [input1, input2, input3, input4], train_y,
                validation_data=([
                    val_x.loc[:, input1_col],
                    np.zeros_like(val_x.loc[:, input1_col]),
                    val_x.loc[:, input3_col],
                    np.zeros_like(val_x.loc[:, input3_col]),
                ], val_y),
                epochs=EPOCHS,
                shuffle=True,
                batch_size=64,
                callbacks=[Cal_acc(val_x, y.iloc[test_idx])],
                # steps_per_epoch=1000, validation_steps=10
            )
        # gen_sub(model, X_test, sn)
    return his
test_x, test_y = load_data(test_path)

# Define the custom model.
inputs = model.inputs[:2]
dense = model.get_layer('NSP-Dense').output  # output of the 'NSP-Dense' layer
outputs = keras.layers.Dense(units=2, activation='softmax')(dense)  # dense layer + softmax
decay_steps, warmup_steps = calc_train_steps(  # decay steps and warm-up steps
    train_y.shape[0],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)
model = keras.models.Model(inputs, outputs)
model.compile(  # compile the model for training
    AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

# Initialize all remaining variables.
sess = K.get_session()
uninitialized_variables = set(
    i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())
)
init_op = tf.variables_initializer(
    [v for v in tf.global_variables()
     if v.name.split(':')[0] in uninitialized_variables]
)
sess.run(init_op)

# Convert to a TPU model.
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
strategy = tf.contrib.tpu.TPUDistributionStrategy(
    # Call truncated in the source; the standard Colab TF 1.x pattern
    # resolves the TPU address like this:
    tf.contrib.cluster_resolver.TPUClusterResolver(tpu_address)
)
print("Number of LR decay steps: {0} \nNumber of warm-up steps: {1}".format( decay_steps, warmup_steps)) # Next we read the BERT model that we just loaded: config_file = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json') checkpoint_file = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt') bert_model = load_trained_model_from_checkpoint(config_file, checkpoint_file, training=True, seq_len=max_seq_len) print("Lookup model architecture with: bert_model.summary()") print("I dare ya'") # Initialize custom Adam optimizer with warmup: adam_warmup = AdamWarmup(lr=learning_rate, decay_steps=decay_steps, warmup_steps=warmup_steps, weight_decay=weight_decay) # Picking BERT layers and building output layers: input_layer = bert_model.input embedding_output = bert_model.layers[-6].output output_layer = Dense(1, activation='sigmoid', kernel_initializer=TruncatedNormal(stddev=0.02), name='class_output')(embedding_output) model = Model(inputs=input_layer, outputs=output_layer) model.compile(loss='binary_crossentropy', optimizer=adam_warmup, metrics=["acc"]) model.summary()
model = create_cls_model(len(labels))
train_D = DataGenerator(train_data)
test_D = DataGenerator(test_data)

print("begin model training...")
# Save the newest checkpoint with the best val_acc.
filepath = "models/%s-{epoch:02d}-{val_acc:.4f}.h5" % DATA_DIR.split("/")[-1]
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1,
                             save_best_only=True, mode='max')

# add warmup
total_steps, warmup_steps = calc_train_steps(
    num_example=len(train_data),
    batch_size=BATCH_SIZE,
    epochs=EPOCH,
    warmup_proportion=0.1,
)
optimizer = AdamWarmup(total_steps, warmup_steps, lr=5e-5, min_lr=1e-7)

model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)
model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=EPOCH,
    validation_data=test_D.__iter__(),
    validation_steps=len(test_D),
    callbacks=[checkpoint]
)
print("finish model training!")
def train_base():
    args = get_args()
    # frac = args.frac
    fold = args.fold
    EPOCHS = args.epochs
    BATCH_SIZE = 128
    LR = 1e-4

    with timed_bolck(f'Prepare train data#{BATCH_SIZE}'):
        X, y, _ = get_train_test_bert()

        # Begin to define model
        from keras_bert import load_trained_model_from_checkpoint
        model = load_trained_model_from_checkpoint(
            config_path, checkpoint_path, training=True, seq_len=SEQ_LEN)
        model.summary(line_length=120)

        from tensorflow.python import keras
        from keras_bert import AdamWarmup, calc_train_steps
        inputs = model.inputs[:2]
        dense = model.get_layer('NSP-Dense').output
        keras.models.Model(inputs, dense).summary()
        outputs = keras.layers.Dense(units=num_classes, activation='softmax')(dense)
        decay_steps, warmup_steps = calc_train_steps(
            y.shape[0],
            batch_size=BATCH_SIZE,
            epochs=EPOCHS,
        )
        model = keras.models.Model(inputs, outputs)
        model.compile(
            AdamWarmup(decay_steps=decay_steps, warmup_steps=warmup_steps, lr=LR),
            loss='categorical_crossentropy',
            metrics=['accuracy'],
        )
        # End to define model

        input1_col = [col for col in X.columns if str(col).startswith('bert_')]
        input2_col = [col for col in X.columns if str(col).startswith('fea_')]
        # max_words = len(input1_col)
        # model = get_model(max_words)
        # get_feature_manual.cache_clear()
        Y_cat = keras.utils.to_categorical(y, num_classes=num_classes)
        # folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)

    with timed_bolck(f'Training#{fold}'):
        from core.split import split_df_by_index
        train_idx, test_idx = split_df_by_index(X, fold)
        logger.info(f'Shape train_x.loc[:, input1_col].iloc[:,0]: '
                    f'{X.loc[:, input1_col].iloc[:, 0].shape}')
        train_x, train_y, val_x, val_y = \
            X.iloc[train_idx], Y_cat[train_idx], X.iloc[test_idx], Y_cat[test_idx]
        logger.info(f'get_train_test output: train_x:{train_x.shape}, '
                    f'train_y:{train_y.shape}, val_x:{val_x.shape} ')

        # train_x, train_y = filter_short_desc(train_x, train_y)
        input1 = train_x.loc[:, input1_col]  # .astype(np.float32)
        input2 = np.zeros_like(input1)       # .astype(np.int8)
        logger.info(f'NN train_x:{train_x[:3]}')
        min_len_ratio = get_args().min_len_ratio
        max_bin = get_args().max_bin
        logger.info(f'NN Input1:{input1.shape}, Input2:{input2.shape}, '
                    f'SEQ_LEN:{SEQ_LEN}, min_len_ratio:{min_len_ratio}, bin:{max_bin} ')

        from keras_bert import get_custom_objects
        import tensorflow as tf
        with tf.keras.utils.custom_object_scope(get_custom_objects()):
            his = model.fit(
                [input1, input2], train_y,
                validation_data=([
                    val_x.loc[:, input1_col],
                    np.zeros_like(val_x.loc[:, input1_col]),
                ], val_y),
                epochs=EPOCHS,
                shuffle=True,
                batch_size=64,
                callbacks=[Cal_acc(val_x, y.iloc[test_idx])],
                # steps_per_epoch=1000, validation_steps=10
            )
        # gen_sub(model, X_test, sn)
    return his