def build_bert(self, verbose=True):
    """Build a BERT-based sequence-labeling model and store it on ``self.model``.

    Architecture: three BERT inputs (word / mask / segment ids) -> BertLayer
    -> optional concatenation with user-dictionary embeddings -> fully
    connected layer -> CRF output (marginal mode) or plain softmax output.
    All hyperparameters are read from ``self._params``.

    :param verbose: when True, log which optional components are enabled

    Side effects: compiles the model with AdamWeightDecayOptimizer, logs the
    model summary, and assigns the compiled model to ``self.model``.
    """
    # BERT inputs: token ids, attention mask, and segment ids, each padded
    # to a fixed length of max_sent_len.
    bert_word_ids = Input(batch_shape=(None, self._params.max_sent_len), dtype="int32", name="bert_word_input")
    bert_mask_ids = Input(batch_shape=(None, self._params.max_sent_len), dtype="int32", name='bert_mask_input')
    bert_segment_ids = Input(batch_shape=(None, self._params.max_sent_len), dtype="int32", name="bert_segment_input")
    inputs = [bert_word_ids, bert_mask_ids, bert_segment_ids]
    # BertLayer fine-tunes only the top n_fine_tune_layers of the pretrained model.
    bert_out = BertLayer(n_fine_tune_layers=self._params.n_fine_tune_layers,
                         bert_path=self._params.bert_path,
                         name="bert_layer")([bert_word_ids, bert_mask_ids, bert_segment_ids])
    features = bert_out
    if self._params.use_dict:
        # Optional per-token dictionary-feature channel: embed dictionary ids
        # and concatenate with the BERT features along the last axis.
        if verbose:
            logging.info("use user dict features")
        dict_ids = Input(batch_shape=(None, self._params.max_sent_len), dtype='int32', name='dict_input')
        inputs.append(dict_ids)
        dict_embeddings = Embedding(input_dim=self._params.dict_vocab_size,
                                    output_dim=self._params.dict_embedding_dim,
                                    mask_zero=True,
                                    name='dict_embedding')(dict_ids)
        features = Concatenate(name="bert_and_dict_features")([features, dict_embeddings])
    # Project features before decoding.
    z = Dense(self._params.fc_dim, activation='relu', name="fc_dense")(features)
    if self._params.use_crf:
        # CRF decoding in marginal mode, so targets/predictions stay dense
        # probability distributions; the matching loss comes from the layer.
        if verbose:
            logging.info('use crf decode layer')
        crf = CRF(self._params.num_labels, sparse_target=False,
                  learn_mode='marginal', test_mode='marginal', name='crf_out')
        loss = crf.loss_function
        pred = crf(z)
    else:
        # Per-token independent softmax classification.
        loss = 'categorical_crossentropy'
        pred = Dense(self._params.num_labels, activation='softmax', name='softmax_out')(z)
    model = Model(inputs=inputs, outputs=pred)
    model.summary(print_fn=lambda x: logging.info(x + '\n'))
    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=1e-5,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    model.compile(loss=loss, optimizer=optimizer)
    self.model = model
def get_optimizer( network_config, default_optimizer=train.AdadeltaOptimizer(learning_rate=1.0)): """ Return the optimizer given by the input network configuration, or a default optimizer. :param network_config: network configuration :param default_optimizer: default optimization algorithm :return: configured optimizer """ try: optimizer = network_config.optimizer except KeyError: logging.info("Using Adadelta as default optimizer.") return default_optimizer if isinstance(optimizer.lr, numbers.Number): lr = optimizer.lr else: optimizer.lr.num_train_steps = network_config.max_steps optimizer.lr.steps_per_epoch = network_config.steps_per_epoch lr = get_learning_rate(optimizer.lr, train.get_global_step()) name = optimizer.name params = optimizer.params if "Adadelta" == name: opt = train.AdadeltaOptimizer(lr, **params) elif "Adam" == name: opt = train.AdamOptimizer(lr, **params) elif "LazyAdam" == name: opt = LazyAdamOptimizer(lr, **params) elif "LazyNadam" == name: opt = LazyNadamOptimizer(lr, **params) elif "SGD" == name: opt = train.GradientDescentOptimizer(lr) elif "Momentum" == name: opt = train.MomentumOptimizer(lr, **params) elif "Nadam" == name: opt = NadamOptimizerSparse(lr, **params) elif "bert" == name: opt = AdamWeightDecayOptimizer( lr, weight_decay_rate=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) else: raise ValueError("Invalid optimizer name: {}".format(name)) return opt
def create_model(bert_module_path, learning_rate=2e-5, max_seq_length=256, n_tune_layers=3, n_classes=20, optimizer="adam"): adam = Adam(learning_rate=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-6, amsgrad=True) adamW = AdamWeightDecayOptimizer( learning_rate, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) input_ids = Input(shape=(max_seq_length, ), name="input_ids") input_mask = Input(shape=(max_seq_length, ), name="input_masks") input_segment = Input(shape=(max_seq_length, ), name="segment_ids") bert_inputs = [input_ids, input_mask, input_segment] bert = BertLayer( bert_module_path, seq_len=max_seq_length, pooling= 'cls', # pooling='cls' returns pooled output, otherwise returns seqs n_tune_layers=n_tune_layers, use_layers=12, trainable=True, verbose=True) dropout = Dropout(0.1) preds = Dense(n_classes, activation='softmax')(dropout(bert(bert_inputs))) model = Model(inputs=bert_inputs, outputs=preds) model.compile(loss='categorical_crossentropy', optimizer=adam if optimizer == "adam" else adamW, metrics=['acc']) model.summary() return model
def build_model(maxlen): bert_path = 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1' input_word_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32) input_mask = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32) segment_ids = tf.keras.layers.Input(shape=(maxlen,), dtype=tf.int32) bert_layer = hub.KerasLayer(bert_path, trainable=True) pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids]) hid = tf.keras.layers.Reshape((maxlen, 768))(sequence_output) hid = tf.keras.layers.Flatten()(hid) hid = tf.keras.layers.Dense(128)(hid) out_begin = tf.keras.layers.Dense(maxlen, name='Begin')(hid) out_end = tf.keras.layers.Dense(maxlen, name='End')(hid) model = tf.keras.models.Model([input_word_ids, input_mask, segment_ids], [out_begin, out_end]) opt = AdamWeightDecayOptimizer(learning_rate=3e-5) model.compile(opt, 'sparse_categorical_crossentropy', ['acc']) model.summary() return model
def create_custom_optimizer(tvars,
                            loss,
                            bert_init_lr,
                            task_init_lr,
                            num_train_steps,
                            num_warmup_steps,
                            use_tpu,
                            global_step=None,
                            freeze=-1,
                            task_opt='adam',
                            eps=1e-6):
    """Creates an optimizer training op with separate BERT / task learning rates.

    Fix: the warmup learning rate for the task variables
    (``task_warmup_learning_rate``) was computed but never applied — only the
    BERT learning rate received the warmup blend. The task learning rate now
    gets the same linear-warmup treatment, which the dead assignment clearly
    intended.

    :param tvars: trainable variables to partition into BERT vs task groups
    :param loss: scalar loss tensor to differentiate
    :param bert_init_lr: peak learning rate for BERT variables
    :param task_init_lr: peak learning rate for task variables
    :param num_train_steps: total steps, used for linear LR decay
    :param num_warmup_steps: linear warmup steps (0/None disables warmup)
    :param use_tpu: unused here; kept for signature compatibility
    :param global_step: global step variable; created if None
    :param freeze: train only BERT encoder layers >= freeze (-1 trains all)
    :param task_opt: 'adam' or 'adam_weight_decay' for the task variables
    :param eps: Adam epsilon
    :return: grouped training op
    :raises NotImplementedError: for an unknown ``task_opt``
    """
    if global_step is None:
        global_step = tf.train.get_or_create_global_step()

    bert_learning_rate = tf.constant(value=bert_init_lr, shape=[], dtype=tf.float32)
    task_learning_rate = tf.constant(value=task_init_lr, shape=[], dtype=tf.float32)

    # Implements linear decay of the learning rate.
    bert_learning_rate = tf.train.polynomial_decay(
        bert_learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)
    task_learning_rate = tf.train.polynomial_decay(
        task_learning_rate,
        global_step,
        num_train_steps,
        end_learning_rate=0.0,
        power=1.0,
        cycle=False)

    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
        warmup_percent_done = global_steps_float / warmup_steps_float
        bert_warmup_learning_rate = bert_init_lr * warmup_percent_done
        task_warmup_learning_rate = task_init_lr * warmup_percent_done
        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        bert_learning_rate = (
            (1.0 - is_warmup) * bert_learning_rate + is_warmup * bert_warmup_learning_rate)
        # Apply the warmup blend to the task LR as well (previously computed
        # but never used).
        task_learning_rate = (
            (1.0 - is_warmup) * task_learning_rate + is_warmup * task_warmup_learning_rate)

    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    bert_optimizer = AdamWeightDecayOptimizer(
        learning_rate=bert_learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=eps,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    if task_opt == 'adam_weight_decay':
        task_optimizer = AdamWeightDecayOptimizer(
            learning_rate=task_learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=eps)
    elif task_opt == 'adam':
        task_optimizer = tf.train.AdamOptimizer(
            learning_rate=task_learning_rate)
    else:
        raise NotImplementedError('Check optimizer. {} is invalid.'.format(task_opt))

    # Partition variables: names starting with 'bert' go to the BERT optimizer
    # (subject to layer freezing), everything else to the task optimizer.
    # Frozen BERT layers are excluded from training entirely.
    bert_vars, task_vars = [], []
    for var in tvars:
        if var.name.startswith('bert'):
            can_optimize = False
            if var.name.startswith('bert/encoder/layer_') and int(var.name.split('/')[2][len('layer_'):]) >= freeze:
                can_optimize = True
            if freeze == -1 or can_optimize:
                bert_vars.append(var)
        else:
            task_vars.append(var)
    print('bert:task', len(bert_vars), len(task_vars))

    grads = tf.gradients(loss, bert_vars + task_vars)
    bert_grads = grads[:len(bert_vars)]
    task_grads = grads[len(bert_vars):]

    # This is how the model was pre-trained.
    (bert_grads, _) = tf.clip_by_global_norm(bert_grads, clip_norm=1.0)
    (task_grads, _) = tf.clip_by_global_norm(task_grads, clip_norm=1.0)

    bert_train_op = bert_optimizer.apply_gradients(
        zip(bert_grads, bert_vars), global_step=global_step)
    task_train_op = task_optimizer.apply_gradients(
        zip(task_grads, task_vars), global_step=global_step)

    if task_opt == 'adam_weight_decay':
        # AdamWeightDecayOptimizer does not advance global_step itself, so
        # bump it manually; tf.train.AdamOptimizer already does in
        # apply_gradients.
        new_global_step = global_step + 1
        train_op = tf.group(bert_train_op, task_train_op,
                            [global_step.assign(new_global_step)])
    else:
        train_op = tf.group(bert_train_op, task_train_op)
    return train_op
def create_optimizer_bplayer(loss, init_lr, num_train_steps, num_warmup_steps,
                             use_tpu, bplayer):
    """Creates an optimizer training op.

    Variant of the standard BERT ``create_optimizer`` that takes its gradients
    from ``bplayer.backward_gradients()`` instead of ``tf.gradients``.

    :param loss: unused here; gradients come from bplayer — NOTE(review):
        kept for signature compatibility, confirm against callers
    :param init_lr: peak learning rate before decay/warmup
    :param num_train_steps: total steps for linear decay to 0
    :param num_warmup_steps: linear warmup steps (0/None disables warmup)
    :param use_tpu: wrap the optimizer in CrossShardOptimizer when True
    :param bplayer: object providing ``backward_gradients()`` -> (grad, var) pairs
    :return: grouped training op that also advances the global step
    """
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
    # Implements linear decay of the learning rate.
    learning_rate = tf.train.polynomial_decay(learning_rate,
                                              global_step,
                                              num_train_steps,
                                              end_learning_rate=0.0,
                                              power=1.0,
                                              cycle=False)
    # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
    # learning rate will be `global_step/num_warmup_steps * init_lr`.
    if num_warmup_steps:
        global_steps_int = tf.cast(global_step, tf.int32)
        warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
        global_steps_float = tf.cast(global_steps_int, tf.float32)
        warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
        warmup_percent_done = global_steps_float / warmup_steps_float
        warmup_learning_rate = init_lr * warmup_percent_done
        # Blend: warmup LR before warmup_steps, decayed LR afterwards.
        is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
        learning_rate = ((1.0 - is_warmup) * learning_rate +
                         is_warmup * warmup_learning_rate)
    # It is recommended that you use this optimizer for fine tuning, since this
    # is how the model was trained (note that the Adam m/v variables are NOT
    # loaded from init_checkpoint.)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=learning_rate,
        weight_decay_rate=0.01,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    if use_tpu:
        optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
    # tvars = tf.trainable_variables()
    # grads = tf.gradients(loss, tvars)
    # Gradients are supplied by the custom backprop layer rather than
    # computed from `loss` directly.
    with tf.variable_scope("backward_gradients"):
        grads_vals = bplayer.backward_gradients()
    grads, tvars = zip(*grads_vals)
    # This is how the model was pre-trained.
    (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
    train_op = optimizer.apply_gradients(zip(grads, tvars),
                                         global_step=global_step)
    # Normally the global step update is done inside of `apply_gradients`.
    # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
    # a different optimizer, you should probably take this line out.
    new_global_step = global_step + 1
    train_op = tf.group(train_op, [global_step.assign(new_global_step)])
    return train_op
def train(parameters, train_ds, val_ds, wordvec, class_weights):
    """Train a HAN model in eager mode, checkpointing on best validation loss.

    Fixes: (1) ``plt.savefig`` was called after ``plt.show()`` — interactive
    matplotlib backends clear the current figure on show, so the saved PNG was
    blank; the figure is now saved first. (2) Removed the dead commented-out
    early-stopping block and the unused ``patience``/``timestamp`` locals.

    :param parameters: dict of hyperparameters; also mutated — validation
        losses are appended to ``parameters['val_losses']``
    :param train_ds: training dataset
    :param val_ds: validation dataset
    :param wordvec: pretrained word vectors for the HAN model
    :param class_weights: per-class loss weights

    Side effects: saves checkpoints under ``parameters['model_dir']``, prints
    progress, and writes 'han_training_curve.png'.
    """
    tf.enable_eager_execution()
    tf.logging.set_verbosity(tf.logging.ERROR)
    random_seed.set_random_seed(parameters['seed'])

    (device, data_format) = ('/gpu:0', 'channels_first')
    if parameters['no_gpu'] > 0 or not tf.test.is_gpu_available():
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    model = HAN(wordvec, parameters)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=parameters['learning_rate'],
        weight_decay_rate=0.0,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(parameters['model_dir'], 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer,
                                     step_counter=step_counter)

    best_acc_ep = (0.0, -1, float('inf'))  # acc, epoch, loss
    with tf.device(device):
        for ep in range(parameters['train_epochs']):
            start = time.time()
            train_step(model, optimizer, train_ds, step_counter, ep,
                       class_weights, parameters, parameters['log_interval'])
            val_acc, val_loss = test(model, val_ds, class_weights, ds_name='Val')
            end = time.time()
            print('\n Epoch: {} \tTime: {:.6f}'.format(ep + 1, end - start))
            parameters['val_losses'].append(val_loss)
            # Checkpoint only when validation loss improves.
            if val_loss.numpy() < best_acc_ep[2]:
                best_acc_ep = (val_acc.numpy(), ep, val_loss.numpy())
                print('Save checkpoint', checkpoint_prefix)
                checkpoint.save(checkpoint_prefix)

    print('Min loss {:.6f}, dev acc. {:.3f}%, ep {} \n'.format(
        best_acc_ep[2], best_acc_ep[0] * 100., best_acc_ep[1] + 1))
    model._name = "Hybrid Attention Network"
    model.summary()

    plt.ylabel('Training/Validation Loss')
    plt.xlabel('Number of Epochs')
    plt.plot(parameters['train_losses'], label="Train Loss")
    plt.plot(parameters['val_losses'], label="Validation Loss")
    plt.legend()
    # Save BEFORE show: interactive backends clear the figure on show().
    plt.savefig('han_training_curve.png')
    plt.show()
    plt.gcf().clear()
tf.logging.set_verbosity(tf.logging.ERROR) print('Load dataset..', params['data_path']) dataset = pickle.load(open(params['data_path'], 'rb')) train_ds, val_ds, test_ds = dataset.get_dataset(params['batch_size'], params['max_date_len'], params['max_news_len']) train(params, train_ds, val_ds, dataset.wordvec, dataset.class_weights) model = HAN(dataset.wordvec, params) optimizer = AdamWeightDecayOptimizer( learning_rate=params['learning_rate'], weight_decay_rate=0.0, beta_1=0.9, beta_2=0.999, epsilon=1e-6, exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) timestamp = datetime.now().strftime(' %d%m%y %H%M%S') checkpoint_prefix = os.path.join(params['model_dir'], 'ckpt') step_counter = tf.train.get_or_create_global_step() checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer, step_counter=step_counter) latest_checkpoint = tf.train.latest_checkpoint(params['model_dir']) print('Load the last checkpoint..', latest_checkpoint) checkpoint.restore(latest_checkpoint)
def run(flags_obj):
    """Train a HAN model with early stopping, then evaluate the best checkpoint.

    Fixes: (1) the best-loss sentinel was a magic ``9999.9``, which silently
    breaks if the initial dev loss exceeds it — it is now ``float('inf')``,
    consistent with ``train()``; (2) the dataset pickle file handle was leaked
    via ``pickle.load(open(...))`` — it is now opened with a context manager.

    :param flags_obj: parsed flags (seed, paths, batch/sequence sizes,
        train_epochs, patience, log_interval, output/model dirs, ...)
    :return: (test_acc, test_loss, best_epoch_1_indexed, num_trainable_params)

    Side effects: writes TensorBoard summaries, saves/restores checkpoints,
    prints progress.
    """
    tf.enable_eager_execution()
    random_seed.set_random_seed(flags_obj.seed)

    # Automatically determine device and data_format
    (device, data_format) = ('/gpu:0', 'channels_first')
    if flags_obj.no_gpu > 0 or not tf.test.is_gpu_available():
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    print('Load dataset..', flags_obj.pickle_path)
    # NOTE(review): pickle.load on untrusted input is unsafe; assumed trusted here.
    with open(flags_obj.pickle_path, 'rb') as dataset_file:
        dataset = pickle.load(dataset_file)
    train_ds, dev_ds, test_ds = dataset.get_dataset(flags_obj.batch_size,
                                                    flags_obj.max_date_len,
                                                    flags_obj.max_news_len)

    model = HAN(dataset.wordvec, flags_obj)
    optimizer = AdamWeightDecayOptimizer(
        learning_rate=flags_obj.learning_rate,
        weight_decay_rate=0.0,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-6,
        exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

    # Create file writers for writing TensorBoard summaries.
    timestamp = datetime.now().strftime(' %d%m%y %H%M%S')
    if flags_obj.output_dir:
        # Create directories to which summaries will be written
        # tensorboard --logdir=<output_dir>
        # can then be used to see the recorded summaries.
        train_dir = os.path.join(flags_obj.output_dir, 'han train' + timestamp)
        dev_dir = os.path.join(flags_obj.output_dir, 'han dev' + timestamp)
        test_dir = os.path.join(flags_obj.output_dir, 'han test' + timestamp)
        tf.gfile.MakeDirs(flags_obj.output_dir)
    else:
        train_dir = None
        dev_dir = None
        test_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(train_dir,
                                                           flush_millis=10000)
    dev_summary_writer = tf.contrib.summary.create_file_writer(
        dev_dir, flush_millis=10000, name='dev')
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(flags_obj.model_dir, 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer,
                                     step_counter=step_counter)

    best_acc_ep = (0.0, -1, float('inf'))  # acc, epoch, loss
    patience = 0
    with tf.device(device):
        for ep in range(flags_obj.train_epochs):
            start = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, step_counter, ep,
                      dataset.class_weights, flags_obj.log_interval)
            end = time.time()
            print('\nTrain time for epoch #%d (%d total steps): %.3f sec' %
                  (ep + 1, step_counter.numpy(), end - start))
            with dev_summary_writer.as_default():
                dev_acc, dev_loss = test(model, dev_ds, dataset.class_weights,
                                         ds_name='Dev')
            # Checkpoint on improvement, otherwise count towards early stopping.
            if dev_loss.numpy() < best_acc_ep[2]:
                best_acc_ep = (dev_acc.numpy(), ep, dev_loss.numpy())
                print('Save checkpoint', checkpoint_prefix)
                checkpoint.save(checkpoint_prefix)
            else:
                if patience == flags_obj.patience:
                    print('Apply early stopping')
                    break
                patience += 1
                print('patience {}/{}'.format(patience, flags_obj.patience))

    print('Min loss {:.6f}, dev acc. {:.3f}%, ep {} \n'.format(
        best_acc_ep[2], best_acc_ep[0] * 100., best_acc_ep[1] + 1))

    # Evaluate the best (last saved) checkpoint on the test set.
    latest_checkpoint = tf.train.latest_checkpoint(flags_obj.model_dir)
    print('Load the last checkpoint..', latest_checkpoint)
    checkpoint.restore(latest_checkpoint)
    with test_summary_writer.as_default():
        test_acc, test_loss = test(model, test_ds, dataset.class_weights,
                                   show_classification_report=True)
    return \
        test_acc, test_loss, best_acc_ep[1] + 1, \
        get_num_trainable_params(model)