def compile_model(model, weight_decay=0.01, decay_steps=100000,
                  warmup_steps=10000, learning_rate=1e-4):
    """Compile the model with a warmup optimizer and sparse cross-entropy loss.

    :param model: The built model.
    :param weight_decay: Weight decay rate.
    :param decay_steps: The learning rate decays linearly to zero over this many steps.
    :param warmup_steps: The learning rate increases linearly to learning_rate over the first warmup_steps steps.
    :param learning_rate: Peak learning rate.
    :return: The compiled model.
    """
    model.compile(
        optimizer=AdamWarmup(
            decay_steps=decay_steps,
            warmup_steps=warmup_steps,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            weight_decay_pattern=[
                'embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'
            ],
        ),
        loss=keras.losses.sparse_categorical_crossentropy,
    )
    return model  # the docstring promises the compiled model back
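# A minimal sketch of deriving the decay/warmup step counts from the dataset
# size instead of hard-coding them, assuming keras-bert's calc_train_steps
# helper; num_examples, batch_size and epochs below are illustrative only.
import keras
from keras_bert import AdamWarmup, calc_train_steps


def compile_with_derived_schedule(model, num_examples, batch_size=32, epochs=2):
    # Total optimizer steps across all epochs; warm up over the first 10%.
    total_steps, warmup_steps = calc_train_steps(
        num_example=num_examples,
        batch_size=batch_size,
        epochs=epochs,
        warmup_proportion=0.1,
    )
    model.compile(
        optimizer=AdamWarmup(total_steps, warmup_steps, lr=1e-4, min_lr=1e-6),
        loss=keras.losses.sparse_categorical_crossentropy,
    )
    return model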
def test_fit(self):
    # Fit a single softmax layer on a linearly separable toy problem.
    x = np.random.standard_normal((1000, 5))
    y = np.dot(x, np.random.standard_normal((5, 2))).argmax(axis=-1)
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(
        units=2,
        input_shape=(5,),
        kernel_constraint=keras.constraints.MaxNorm(1000.0),
        activation='softmax',
    ))
    model.compile(
        optimizer=AdamWarmup(
            decay_steps=10000,
            warmup_steps=5000,
            lr=1e-3,
            min_lr=1e-4,
            amsgrad=True,
            kernel_weight_decay=1e-3,
            bias_weight_decay=1e-4,
        ),
        loss='sparse_categorical_crossentropy',
    )
    model.fit(x, y, batch_size=10, epochs=110)
    # Round-trip through save/load to check the optimizer (de)serializes.
    model_path = os.path.join(tempfile.gettempdir(),
                              'keras_warmup_%f.h5' % np.random.random())
    model.save(model_path)
    model = keras.models.load_model(
        model_path, custom_objects={'AdamWarmup': AdamWarmup})
    results = model.predict(x).argmax(axis=-1)
    diff = np.sum(np.abs(y - results))
    self.assertLess(diff, 100)
def pretrain_bert(args, training=True):
    # Load the pretrained BERT model from checkpoint.
    bert_model = load_trained_model_from_checkpoint(
        args.config_path,
        args.checkpoint_path,
        seq_len=None,
        training=training)
    for layer in bert_model.layers:
        layer.trainable = True
    x1_in = Input(shape=(None,))
    x2_in = Input(shape=(None,))
    x3_in = Input(shape=(None,))
    x = bert_model([x1_in, x2_in, x3_in])
    model = Model([x1_in, x2_in, x3_in], x)
    model.compile(
        optimizer=AdamWarmup(
            decay_steps=100000,
            warmup_steps=10000,
            learning_rate=1e-4,
            weight_decay=0.01,
            weight_decay_pattern=[
                'embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'
            ],
        ),
        loss=keras.losses.sparse_categorical_crossentropy,
    )
    model.summary()  # summary() prints itself and returns None
    return model
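# A self-contained sketch of how a substring-style weight_decay_pattern such
# as the one above plausibly selects variables for decay (my reading of
# keras-bert's behaviour, not a verbatim copy of it): a weight is decayed iff
# any pattern occurs in its name, so embeddings, kernels and attention
# projections decay while biases and layer-norm parameters do not.
WEIGHT_DECAY_PATTERN = ['embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo']


def should_decay(weight_name, patterns=tuple(WEIGHT_DECAY_PATTERN)):
    return any(pattern in weight_name for pattern in patterns)


assert should_decay('Encoder-1-MultiHeadSelfAttention/Wq:0')
assert should_decay('Embedding-Token/embeddings:0')
assert not should_decay('Encoder-1-FeedForward-Norm/gamma:0')  # norms: no decay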
def train_on_all_set():
    train_config = get_config()
    bert_config = get_bert_config(train_config)

    import pickle
    with open('tok_text_uncased.pkl', 'rb') as h:
        text = pickle.load(h)
    with open('y_train.pkl', 'rb') as h:
        label = pickle.load(h)
    with open('y_aux.pkl', 'rb') as h:
        aux = pickle.load(h)

    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)
    del iden_df

    # Loss weight for the main output, rescaling the weighted loss to unit mean.
    lw = 1 / np.mean(weights)

    train_seg = [[0 for _ in t] for t in text]
    train_gen = GeneralDataGenerator(
        inputs=[text, train_seg],
        outputs=[label, aux],
        sample_weights=[weights, np.ones_like(weights)],
        batch_size=32,
        pad_fn=[
            lambda x: seq_padding(x, truncate=False),
            lambda x: seq_padding(x, truncate=False)
        ])
    # train_gen = AllDataGenerator(text, label, aux, sample_weight)

    with tf.device('/cpu:0'):
        model = get_bert_multi_model(bert_config)
    # model.load_weights('save_models/bert.weights.h5')

    # Optimizer parameters: decay over one epoch, warm up over the first 10% of it.
    lr = 2e-5
    weight_decay = 0.01
    decay_steps = 1 * len(train_gen)
    warmup_steps = int(0.1 * decay_steps)
    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
    )

    parallel_model = multi_gpu_model(model, gpus=2)
    parallel_model.compile(loss='binary_crossentropy',
                           optimizer=optimizer,
                           loss_weights=[lw, 1.])
    parallel_model.fit_generator(
        train_gen.__iter__(),
        steps_per_epoch=len(train_gen),
        epochs=1,
        max_queue_size=100,
    )
    # Save the template model, not the multi-GPU wrapper.
    model.save('save_models/bert.weights-uncased-new_weight_all.h5')
    print("DONE")
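# The save call above targets the CPU-resident template model rather than the
# multi_gpu_model wrapper; a minimal sketch of that pattern (build_fn and the
# 'adam' placeholder optimizer are illustrative stand-ins, not from the
# original scripts):
import tensorflow as tf
from keras.utils import multi_gpu_model


def train_parallel(build_fn, gpus=2):
    with tf.device('/cpu:0'):
        template = build_fn()  # weights live on the CPU host
    parallel = multi_gpu_model(template, gpus=gpus)
    parallel.compile(loss='binary_crossentropy', optimizer='adam')
    # ... parallel.fit_generator(...) trains replicas on all GPUs ...
    return template  # serialize this one; the wrapper is device-specific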
def train_on_train_test_split():
    train_config = get_config()
    bert_config = get_bert_config(train_config)
    cased = train_config.BERT_DIR.split('/')[-1].startswith('cased')
    # A cased vocabulary must not be lower-cased.
    tokenizer = FullTokenizer(bert_config.vocab, do_lower_case=not cased)

    with tf.device('/cpu:0'):
        model = get_bert_base_model(bert_config)

    text, label = load_data(os.path.join(train_config.DATA_DIR, 'train.csv'))
    train_text, val_text, train_label, val_label = train_test_split(
        text, label, test_size=0.055, random_state=59)

    train_gen = DataGenerator(train_text, train_label, tokenizer, batch_size=32)
    val_text = tokenize_examples(val_text, tokenizer, max_len=512)
    val_text = seq_padding(val_text)
    logger = Logger(model=model,
                    val_text=val_text,
                    val_label=(val_label > 0.5).astype(np.float32))

    # Optimizer parameters: decay over one epoch, warm up over the first 10% of it.
    lr = 2e-5
    weight_decay = 0.01
    decay_steps = 1 * len(train_gen)
    warmup_steps = int(0.1 * decay_steps)
    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
    )

    parallel_model = multi_gpu_model(model, gpus=4)
    parallel_model.compile(loss='binary_crossentropy', optimizer=optimizer)
    parallel_model.fit_generator(train_gen.__iter__(),
                                 steps_per_epoch=len(train_gen),
                                 epochs=1,
                                 callbacks=[logger],
                                 max_queue_size=100)
def test_fit(self):
    x = np.random.standard_normal((1000, 5))
    y = np.dot(x, np.random.standard_normal((5, 2))).argmax(axis=-1)
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(
        units=2,
        input_shape=(5,),
        kernel_constraint=keras.constraints.MaxNorm(1000.0),
        activation='softmax',
    ))
    model.compile(
        optimizer=AdamWarmup(decay_steps=10000,
                             warmup_steps=5000,
                             lr=1e-3,
                             min_lr=1e-4,
                             amsgrad=True),
        loss='sparse_categorical_crossentropy',
    )
    model.fit(x, y, batch_size=10, epochs=110)
    results = model.predict(x).argmax(axis=-1)
    diff = np.sum(np.abs(y - results))
    self.assertLess(diff, 100)
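# A standalone sketch of the linear warmup-then-decay schedule that the
# decay_steps/warmup_steps/min_lr arguments above describe (my reading of the
# parameters, not the library's exact update rule):
def warmup_decay_lr(step, lr=1e-3, min_lr=1e-4,
                    warmup_steps=5000, decay_steps=10000):
    if step < warmup_steps:
        # Climb linearly from 0 to lr over the first warmup_steps steps.
        return lr * step / warmup_steps
    # Then fall linearly from lr toward min_lr over the next decay_steps steps.
    progress = min(1.0, (step - warmup_steps) / decay_steps)
    return min_lr + (lr - min_lr) * (1.0 - progress)


assert warmup_decay_lr(2500) == 5e-4    # halfway through warmup
assert warmup_decay_lr(15000) == 1e-4   # fully decayed to min_lr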
def train_ml_all_set():
    train_config = get_config()
    bert_config = get_bert_config(train_config)

    import pickle
    with open('tok_text_uncased.pkl', 'rb') as h:
        text = pickle.load(h)
    with open('y_train.pkl', 'rb') as h:
        label = pickle.load(h)
    with open('y_aux.pkl', 'rb') as h:
        aux = pickle.load(h)

    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)
    del iden_df

    train_text, _, train_label, _, train_aux, _, train_weights, _ = train_test_split(
        text, label, aux, weights, test_size=0.055, random_state=59)

    train_seg = [[0 for _ in t] for t in train_text]
    train_gen = GeneralDataGenerator(
        inputs=[train_text, train_seg],
        outputs=[train_label, train_aux],
        sample_weights=[train_weights, np.ones_like(train_weights)],
        pad_fn=[
            lambda x: seq_padding(x, truncate=False),
            lambda x: seq_padding(x, truncate=False)
        ],
        batch_size=64)

    with tf.device('/cpu:0'):
        model = get_bert_multi_model(bert_config)
    # optimizer = Adam(lr=2e-5)

    # Optimizer parameters: decay over one epoch, warm up over the first 10% of it.
    lr = 2e-5
    weight_decay = 0.01
    decay_steps = 1 * len(train_gen)
    warmup_steps = int(0.1 * decay_steps)
    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
        weight_decay_pattern=[
            'embeddings', 'kernel', 'W1', 'W2', 'Wk', 'Wq', 'Wv', 'Wo'
        ],
    )

    parallel_model = multi_gpu_model(model, gpus=4)
    # parallel_model.compile(loss=[focal_loss(gamma=2., alpha=.25), 'binary_crossentropy'],
    #                        optimizer=optimizer)
    parallel_model.compile(loss='binary_crossentropy', optimizer=optimizer)
    parallel_model.fit_generator(
        train_gen.__iter__(),
        steps_per_epoch=len(train_gen),
        epochs=1,
        max_queue_size=20,
    )
    model.save('save_models/bert.weights-large-nw.h5')
    # print('SAVED')
    # parallel_model.fit_generator(train_gen.__iter__(),
    #                              steps_per_epoch=len(train_gen),
    #                              epochs=1,
    #                              max_queue_size=20,
    #                              )
    # model.save('save_models/bert.weights-uncased-ml2-e2.h5')
    print("DONE")
def train_gpt():
    bpe = get_bpe_from_files(encoder_path, vocab_path)

    import pickle
    # with open('tok_text_uncased.pkl', 'rb') as h:
    #     text = pickle.load(h)
    with open('y_train.pkl', 'rb') as h:
        label = pickle.load(h)
    with open('y_aux.pkl', 'rb') as h:
        aux = pickle.load(h)

    iden_df = pd.read_csv('processed_data/train_tok_iden.csv')
    weights = get_weights_new(iden_df)
    del iden_df

    df = pd.read_csv('new_processed_data/train.csv')
    text = df['comment_text'].values
    del df

    train_text, _, train_label, _, train_aux, _, train_weights, _ = train_test_split(
        text, label, aux, weights, test_size=0.055, random_state=59)

    def pad_fn(ts):
        # BPE-encode each text, truncate to 512 tokens, then pad the batch.
        ts = [bpe.encode(t)[:512] for t in ts]
        return seq_padding(ts, truncate=False)

    train_gen = GeneralDataGenerator(
        inputs=[train_text],
        outputs=[train_label, train_aux],
        sample_weights=[train_weights, np.ones_like(train_weights)],
        batch_size=16,
        pad_fn=[pad_fn])

    with tf.device('/cpu:0'):
        model = get_gpt_model(config_path, checkpoint_path)

    # Optimizer parameters: decay over two epochs, warm up over the first 5%.
    lr = 2e-5
    weight_decay = 0.01
    decay_steps = 2 * len(train_gen)
    warmup_steps = int(0.05 * decay_steps)
    optimizer = AdamWarmup(
        decay_steps=decay_steps,
        warmup_steps=warmup_steps,
        lr=lr,
        weight_decay=weight_decay,
    )

    lw = 1 / np.mean(train_weights)

    # Resume from the first epoch's checkpoint and train the second epoch only.
    model.load_weights('save_models/gpt.weights-new_weight.h5')
    parallel_model = multi_gpu_model(model, gpus=2)
    parallel_model.compile(loss='binary_crossentropy',
                           optimizer=optimizer,
                           loss_weights=[lw, 1.])
    parallel_model.fit_generator(train_gen.__iter__(),
                                 steps_per_epoch=len(train_gen),
                                 epochs=2,
                                 max_queue_size=100,
                                 initial_epoch=1)
    model.save('save_models/gpt.weights-new_weight-2.h5')
    print("DONE")
def train_bert_on_tpu():
    df = pd.read_csv('data/train.csv')
    weight_df = pd.read_csv('data/train_weight.csv')
    texts = df['comment_text'].values
    # texts = convert_lines(texts, 512, tokenizer, prunc='ei')
    label = df['target'].values
    aux_label = df[AUX_COLUMNS].values

    import pickle
    ids = pickle.load(open('data/ids.pkl', 'rb'))
    weight = get_weights_new_array(ids, label)
    # weight = weight_df['weight'].values
    del df
    del weight_df

    train_text, _, train_label, _, train_aux, _, train_weights, _ = train_test_split(
        texts, label, aux_label, weight, test_size=0.055, random_state=59)
    # Tokenized inputs were cached to disk; rebuild with convert_lines if absent.
    # train_text = convert_lines(train_text, 512, tokenizer, prunc='ei')
    train_text = pickle.load(open('train_text_ei.pkl', 'rb'))
    # pickle.dump(train_text, open('train_text_ei.pkl', 'wb'))

    lw = 1 / np.mean(train_weights)
    train_gen = AllDataGenerator(train_text, train_label, train_aux,
                                 train_weights, batch_size=64)

    model = get_bert_multi_model(bert_config)
    # optimizer = keras.optimizers.Adam(2e-5)

    strategy = tf.contrib.tpu.TPUDistributionStrategy(
        tf.contrib.cluster_resolver.TPUClusterResolver(
            "node-2", zone="us-central1-b", project='studied-acronym-235702'))

    # The custom layers and optimizer must be in scope when the Keras model is
    # converted to its TPU counterpart.
    with tf.keras.utils.custom_object_scope(get_custom_objects()):
        lr = 2e-5
        weight_decay = 0.01
        decay_steps = 1 * len(train_gen)
        warmup_steps = int(0.1 * decay_steps)
        optimizer = AdamWarmup(
            decay_steps=decay_steps,
            warmup_steps=warmup_steps,
            lr=lr,
            weight_decay=weight_decay,
        )
        tpu_model = tf.contrib.tpu.keras_to_tpu_model(model, strategy=strategy)
        tpu_model.compile(loss='binary_crossentropy',
                          optimizer=optimizer,
                          loss_weights=[lw, 1.])
        tpu_model.fit_generator(
            train_gen.__iter__(),
            steps_per_epoch=len(train_gen),
            epochs=1,
            max_queue_size=100,
        )
        model.save('save_models/bert.weights-ei.h5')