def main(argv):
    """
    This is the main method for training the model.

    :param argv: training parameters
    :return:
    """
    # Get executor task type from TF_CONFIG
    task_type = executor_utils.get_executor_task_type()

    # Get hyper-parameters.
    hparams = get_hparams(argv)

    tf.logging.set_verbosity(tf.logging.INFO)

    # If num_epochs is set, overwrite the number of training steps.
    if hparams.num_epochs is not None:
        hparams.num_train_steps = misc_utils.estimate_train_steps(
            hparams.train_file,
            hparams.num_epochs,
            hparams.train_batch_size,
            hparams.metadata_path is None)

    # Create directory and launch tensorboard
    if task_type == executor_utils.CHIEF or task_type == executor_utils.LOCAL_MODE:
        # If not resuming training from checkpoints, delete the output directory.
        if not hparams.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(hparams.out_dir):
                tf.gfile.DeleteRecursively(hparams.out_dir)

        # If the output directory was deleted or does not exist, create it.
        if not tf.gfile.Exists(hparams.out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(hparams.out_dir))
            tf.gfile.MakeDirs(hparams.out_dir)

        misc_utils.save_hparams(hparams.out_dir, hparams)

        # Set up the logger.
        sys.stdout = logger.Logger(os.path.join(hparams.out_dir, 'logging.txt'))
    else:
        # TODO: move removal/creation to a hadoopShellJob s.t. it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Wait for the directory to be created by the chief.
        time.sleep(10)

    if task_type == executor_utils.EVALUATOR:
        # Set up the logger for the evaluator.
        sys.stdout = logger.Logger(os.path.join(hparams.out_dir, 'eval_log.txt'))

    hparams = misc_utils.extend_hparams(hparams)

    logging.info("***********DeText Training***********")

    # Train and evaluate the DeText model.
    train.train(hparams)
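# The fixed 10-second sleep in the non-chief branch above is a stand-in for real
# synchronization (the TODO in that branch acknowledges this). A more defensive
# variant would poll for the directory instead of sleeping once. A minimal sketch
# under that assumption (not DeText code; it reuses the module's tf/time imports):
def _wait_for_out_dir_sketch(out_dir, poll_secs=10, max_polls=30):
    """Polls until the chief has created out_dir, for up to max_polls attempts."""
    for _ in range(max_polls):
        if tf.gfile.Exists(out_dir):
            return True
        time.sleep(poll_secs)
    return False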
def run_detext(argument: DetextArg):
    logging.info(f"Args:\n {argument}")

    tf.logging.set_verbosity(tf.logging.INFO)

    # For data sharding when using horovod
    if argument.use_horovod:
        import horovod.tensorflow as hvd
        hvd.init()  # initialize horovod; required before hvd.rank() is queried below
    else:
        hvd = None

    # Get executor task type from TF_CONFIG
    task_type = get_executor_task_type()

    # Create directory and launch tensorboard
    master = not hvd or hvd.rank() == 0
    if (task_type == CHIEF or task_type == LOCAL_MODE) and master:
        # If not resuming training from checkpoints, delete the output directory.
        if not argument.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(argument.out_dir):
                tf.gfile.DeleteRecursively(argument.out_dir)

        # If the output directory was deleted or does not exist, create it.
        if not tf.gfile.Exists(argument.out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(argument.out_dir))
            tf.gfile.MakeDirs(argument.out_dir)

        misc_utils.save_hparams(argument.out_dir, HParams(**argument._asdict()))
    else:
        # TODO: move removal/creation to a hadoopShellJob s.t. it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Wait for the directory to be created by the chief.
        time.sleep(10)

    if (task_type == EVALUATOR or task_type == LOCAL_MODE) and master:
        # Set up the logger for the evaluator.
        sys.stdout = logger.Logger(os.path.join(argument.out_dir, 'eval_log.txt'))

    hparams = misc_utils.extend_hparams(HParams(**argument._asdict()))

    logging.info("***********DeText Training***********")

    # Train and evaluate the DeText model.
    train.train(hparams, input_fn)
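# `get_executor_task_type` is not shown here; it reads the TF_CONFIG environment
# variable that TensorFlow distributed training uses to describe the cluster. A
# minimal sketch of that convention (the 'local_mode' fallback for a missing
# TF_CONFIG is an assumption; CHIEF/EVALUATOR/LOCAL_MODE above are the
# corresponding constants):
def _get_task_type_sketch():
    """Returns the task type ('chief', 'worker', 'evaluator', ...) from TF_CONFIG."""
    import json
    import os
    tf_config = os.environ.get('TF_CONFIG')
    if not tf_config:
        return 'local_mode'  # assumption: local mode when TF_CONFIG is absent
    return json.loads(tf_config).get('task', {}).get('type')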
def main(_):
    """
    This is the main method for training the model.

    :param _: unused; hyper-parameters are read via get_hparams()
    :return:
    """
    # Get executor task type from TF_CONFIG
    task_type = executor_utils.get_executor_task_type()

    # Get hyper-parameters.
    hparams = get_hparams()

    tf.logging.set_verbosity(tf.logging.INFO)

    # If num_epochs is set, overwrite the number of training steps.
    if hparams.num_epochs is not None:
        hparams.num_train_steps = misc_utils.estimate_train_steps(
            hparams.train_file,
            hparams.num_epochs,
            hparams.train_batch_size,
            hparams.metadata_path is None)

    # If num_eval_rounds is set, override steps_per_eval.
    if hparams.num_eval_rounds is not None:
        hparams.steps_per_eval = max(1, int(hparams.num_train_steps / hparams.num_eval_rounds))

    # For data sharding when using horovod
    hparams.add_hparam("hvd_info", None)
    if hparams.use_horovod:
        import horovod.tensorflow as hvd
        hvd.init()
        # Scale the step-based schedule down by the number of workers.
        hparams.num_train_steps = hparams.num_train_steps // hvd.size()
        hparams.num_warmup_steps = hparams.num_warmup_steps // hvd.size()
        hparams.steps_per_eval = hparams.steps_per_eval // hvd.size()
        hparams.steps_per_stats = hparams.steps_per_stats // hvd.size()
        hparams.hvd_info = {'rank': hvd.rank(), 'size': hvd.size()}

    # Create directory and launch tensorboard
    if ((task_type == executor_utils.CHIEF or task_type == executor_utils.LOCAL_MODE)
            and (not hparams.use_horovod or (hparams.use_horovod and hvd.rank() == 0))):
        # If not resuming training from checkpoints, delete the output directory.
        if not hparams.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(hparams.out_dir):
                tf.gfile.DeleteRecursively(hparams.out_dir)

        # If the output directory was deleted or does not exist, create it.
        if not tf.gfile.Exists(hparams.out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(hparams.out_dir))
            tf.gfile.MakeDirs(hparams.out_dir)

        misc_utils.save_hparams(hparams.out_dir, hparams)
    else:
        # TODO: move removal/creation to a hadoopShellJob s.t. it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Wait for the directory to be created by the chief.
        time.sleep(10)

    if ((task_type == executor_utils.EVALUATOR or task_type == executor_utils.LOCAL_MODE)
            and (not hparams.use_horovod or (hparams.use_horovod and hvd.rank() == 0))):
        # Set up the logger for the evaluator.
        sys.stdout = logger.Logger(os.path.join(hparams.out_dir, 'eval_log.txt'))

    hparams = misc_utils.extend_hparams(hparams)

    logging.info("***********DeText Training***********")

    # Train and evaluate the DeText model.
    train.train(hparams, input_fn)
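# The step-count arithmetic used above, restated as a standalone sketch.
# `misc_utils.estimate_train_steps` is a DeText helper whose implementation is
# not shown; the formula below is an assumption inferred from its call site
# (training file, epochs, batch size):
def _estimate_train_steps_sketch(num_train_examples, num_epochs, train_batch_size):
    """Approximate optimizer steps for num_epochs passes over the training data."""
    steps_per_epoch = max(1, num_train_examples // train_batch_size)
    return steps_per_epoch * num_epochs

# Example: 100_000 examples, 2 epochs, batch size 256 -> 390 * 2 = 780 steps.
# Under horovod, each of hvd.size() workers takes a step in parallel, which is
# why main() divides num_train_steps (and the eval/stats intervals) by hvd.size().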
def train(hparams, input_fn):
    """
    Main function to train and evaluate the DeText ranking model.

    :param hparams: hparams
    :param input_fn: input function to create train/eval specs
    :return:
    """
    eval_log_file = None
    if hparams.use_horovod:
        import horovod.tensorflow as hvd
        eval_log_file = path_join(hparams.out_dir, 'eval_log.txt')

    train_strategy = tf.contrib.distribute.ParameterServerStrategy()
    estimator = get_estimator(hparams, strategy=train_strategy)

    # Set model export config for the evaluator or the primary (rank 0) horovod worker.
    exporter_list = None
    if not hparams.use_horovod or hvd.rank() == 0:
        best_model_name = 'best_' + hparams.pmetric
        # Exporter that keeps the best checkpoint (in terms of pmetric) in the folder
        # [best_model_name] and exports it to savedmodel for prediction.
        best_checkpoint_exporter = BestCheckpointCopier(
            name=best_model_name,
            serving_input_receiver_fn=lambda: serving_input_fn(hparams),
            checkpoints_to_keep=1,  # keep the best checkpoint
            exports_to_keep=1,  # keep the best savedmodel
            pmetric='metric/{}'.format(hparams.pmetric),
            compare_fn=lambda x, y: x.score > y.score,  # larger metric is better
            sort_reverse=True,
            eval_log_file=eval_log_file)
        exporter_list = [best_checkpoint_exporter]

    # Handle the sync distributed training case via use_horovod.
    if hparams.use_horovod:
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
        # rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights or
        # restored from a checkpoint.
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Create TrainSpec for model training.
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(
            input_pattern=hparams.train_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.train_batch_size,
            mode=tf.estimator.ModeKeys.TRAIN,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes) if hparams.ftr_ext == 'cnn' else 0,
            # Add horovod information if applicable.
            hvd_info=hparams.hvd_info if hparams.use_horovod else None),
        hooks=[bcast_hook] if hparams.use_horovod else None,  # ensure proper initialization with horovod
        max_steps=hparams.num_train_steps)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(
            input_pattern=hparams.dev_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.test_batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes) if hparams.ftr_ext == 'cnn' else 0),
        exporters=exporter_list,
        steps=None,
        # Set throttle_secs to 10 min to keep evaluation warnings from spamming the logs.
        # Set throttle to 0 for horovod: https://github.com/horovod/horovod/issues/182#issuecomment-533897757
        throttle_secs=0 if hparams.use_horovod else 600,
        start_delay_secs=10)

    # Training and evaluation with the dev set.
    tf.estimator.train_and_evaluate(estimator=estimator, train_spec=train_spec, eval_spec=eval_spec)
    print("***** Training finished. *****")

    # Evaluation with the test set: create an estimator with best_checkpoint_dir to load the best model.
    task_type = executor_utils.get_executor_task_type()
    do_evaluate = task_type == executor_utils.EVALUATOR or task_type == executor_utils.LOCAL_MODE
    if (not hparams.use_horovod and do_evaluate) or (hparams.use_horovod and hvd.rank() == 0):
        best_checkpoint_dir = path_join(hparams.out_dir, best_model_name)
        estimator_savedmodel = get_estimator(hparams, strategy=train_strategy, best_checkpoint=best_checkpoint_dir)
        result = estimator_savedmodel.evaluate(input_fn=lambda: input_fn(
            input_pattern=hparams.test_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.test_batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes) if hparams.ftr_ext == 'cnn' else 0))
        print("\n***** Evaluation on test set with best exported model: *****")
        for key in sorted(result.keys()):
            print("%s = %s" % (key, str(result[key])))
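# `train` constrains the injected `input_fn` only through the keyword arguments it
# passes above. A caller-side stub with that signature (parameter names are taken
# from the call sites above; the body is a placeholder, not DeText's input pipeline):
def input_fn_stub(input_pattern, metadata_path, batch_size, mode,
                  vocab_table, vocab_table_for_id_ftr, feature_names,
                  CLS, SEP, PAD, PAD_FOR_ID_FTR, max_len, min_len,
                  cnn_filter_window_size, hvd_info=None):
    """Must return a tf.data.Dataset of (features, labels) for the Estimator.

    hvd_info defaults to None because train() only passes it for the TRAIN spec.
    """
    raise NotImplementedError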
def create_optimizer(hparams, loss):
    """
    Creates an optimizer training op. If the parameter lr_bert is specified, the
    BERT parameters are trained by a separate Adam optimizer with that learning rate.
    """
    tvars = tf.trainable_variables()

    if hparams.use_horovod:
        import horovod.tensorflow as hvd

    # Log trainable variables (in local mode, on the chief for the ps strategy,
    # or on rank 0 for horovod training).
    task_type = executor_utils.get_executor_task_type()
    if (not hparams.use_horovod and task_type in [executor_utils.CHIEF, executor_utils.LOCAL_MODE]) or \
            (hparams.use_horovod and hvd.rank() == 0):
        with tf.gfile.Open(path_join(hparams.out_dir, 'network_structure.txt'), 'w') as fout:
            fout.write("# Trainable variables\n")
            total_deep_params = 0
            total_params = 0
            for param in tvars:
                psize = 1
                for s in param.get_shape():
                    psize *= s
                total_params += psize
                if param.name.startswith(hparams.ftr_ext):
                    total_deep_params += psize
                fout.write("  %s, %s, %s\n" % (param.name, str(param.get_shape()), param.op.device))
            fout.write('\n')
            fout.write('# Total trainable params: {}\n'.format(total_params))
            fout.write('# Out of the total trainable params, the total {} parameters: {}\n'
                       .format(hparams.ftr_ext, total_deep_params))

    # Define optimizer parameters.
    init_lr = hparams.learning_rate
    num_train_steps = hparams.num_train_steps
    num_warmup_steps = hparams.num_warmup_steps
    lr_bert = hparams.lr_bert

    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    if hparams.optimizer.startswith("bert_"):
        # Use an optimizer with bert's implementation.
        # Implements linear decay of the learning rate.
        learning_rate = tf.train.polynomial_decay(learning_rate,
                                                  global_step,
                                                  num_train_steps,
                                                  end_learning_rate=0.0,
                                                  power=1.0,
                                                  cycle=False)

        # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
        # learning rate will be `global_step / num_warmup_steps * init_lr`.
        if num_warmup_steps:
            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = init_lr * warmup_percent_done

            is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)

        name_2_optimizer = {
            'bert_adam': AdamWeightDecayOptimizer,
            'bert_lamb': LAMBOptimizer
        }
        OptimizerFunc = name_2_optimizer[hparams.optimizer]

        # It is recommended to use this optimizer for fine-tuning, since this is
        # how the model was trained (note that the Adam/Lamb m/v variables are
        # NOT loaded from init_checkpoint).
        optimizer = OptimizerFunc(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

        if hparams.use_horovod:
            # Horovod's distributed optimizer handles the allreduce calls (synchronous only).
            optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True)
            grads_and_vars = optimizer.compute_gradients(loss, tvars)
            grads = [grad for grad, var in grads_and_vars]
            tvars = [var for grad, var in grads_and_vars]
        else:
            grads = tf.gradients(loss, tvars)
        grads, grad_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)

        if lr_bert is None:
            # If no separate learning rate for bert (lr_bert) is specified,
            # all components use the same learning rate.
            train_op = optimizer.apply_gradients(zip(grads, tvars), global_step=global_step)

            # Normally the global step update is done inside of `apply_gradients`.
            # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
            # a different optimizer, you should probably take this line out.
            new_global_step = global_step + 1
            train_op = tf.group(train_op, [global_step.assign(new_global_step)])
        else:
            # The BERT components use a separate learning rate.
            optimizer_bert = OptimizerFunc(
                learning_rate=learning_rate * lr_bert / init_lr,
                weight_decay_rate=0.01,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-6,
                exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
            if hparams.use_horovod:
                # Treat the bert optimizer the same as the original optimizer: wrap it with horovod.
                optimizer_bert = hvd.DistributedOptimizer(optimizer_bert, sparse_as_dense=True)

            bert_grad, bert_tvars = [], []
            other_grad, other_tvars = [], []
            for grad, tvar in zip(grads, tvars):
                if tvar is not None and grad is not None:
                    if tvar.name.startswith('bert'):
                        bert_grad.append(grad)
                        bert_tvars.append(tvar)
                        print('****bert param:', tvar.name)
                    else:
                        other_grad.append(grad)
                        other_tvars.append(tvar)
                        print('****other param:', tvar.name)

            print('--------------\n', '# of bert', len(bert_grad), '# of other', len(other_grad), '\n--------------')
            bert_train_op = optimizer_bert.apply_gradients(zip(bert_grad, bert_tvars), global_step=global_step)
            other_train_op = optimizer.apply_gradients(zip(other_grad, other_tvars), global_step=global_step)

            new_global_step = global_step + 1
            train_op = tf.group(bert_train_op, other_train_op, [global_step.assign(new_global_step)])
        return train_op, grad_norm, learning_rate
    elif hparams.optimizer == "sgd":
        opt = tf.train.GradientDescentOptimizer(learning_rate)
    elif hparams.optimizer == "adam":
        opt = tf.train.AdamOptimizer(learning_rate)
    else:
        raise ValueError("Only support sgd/adam/bert_adam/bert_lamb as optimizer option")

    # Gradients
    gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)
    clipped_gradients, grad_norm = tf.clip_by_global_norm(gradients, hparams.max_gradient_norm)
    train_op = opt.apply_gradients(zip(clipped_gradients, tvars), global_step=global_step)
    return train_op, grad_norm, learning_rate
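# The BERT-style schedule assembled above, restated in plain Python: linear warmup
# from 0 to init_lr over num_warmup_steps, then linear decay to 0 at num_train_steps
# (tf.train.polynomial_decay with power=1.0, end_learning_rate=0.0). A sketch for
# clarity, not part of DeText:
def _bert_lr_schedule_sketch(step, init_lr, num_train_steps, num_warmup_steps):
    if num_warmup_steps and step < num_warmup_steps:
        return init_lr * step / num_warmup_steps
    step = min(step, num_train_steps)  # polynomial_decay clamps the step
    return init_lr * (1.0 - step / num_train_steps)

# e.g. init_lr=1e-3, num_train_steps=1000, num_warmup_steps=100:
# lr(50) = 5e-4 (warmup), lr(100) = 9e-4 (decay takes over), lr(550) = 4.5e-4.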