def main(argv):
    """
    Prepare the output directory and loggers, then launch DeText training.

    :param argv: training parameters, passed on to ``get_hparams``
    :return: None
    """
    # The executor role (taken from TF_CONFIG) decides who manages the output dir.
    task_type = executor_utils.get_executor_task_type()

    hparams = get_hparams(argv)

    tf.logging.set_verbosity(tf.logging.INFO)

    # An explicit epoch count overrides the configured number of training steps.
    if hparams.num_epochs is not None:
        hparams.num_train_steps = misc_utils.estimate_train_steps(
            hparams.train_file,
            hparams.num_epochs,
            hparams.train_batch_size,
            hparams.metadata_path is None)

    out_dir = hparams.out_dir
    manages_out_dir = task_type in (executor_utils.CHIEF, executor_utils.LOCAL_MODE)

    if not manages_out_dir:
        # TODO: move removal/creation to a hadoopShellJob so that it does not
        # reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Give the chief time to (re)create the output directory.
        time.sleep(10)
    else:
        # A fresh (non-resumed) run starts from an empty output directory.
        if not hparams.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(out_dir):
                tf.gfile.DeleteRecursively(out_dir)

        # Create the directory if it was just deleted or never existed.
        if not tf.gfile.Exists(out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(out_dir))
            tf.gfile.MakeDirs(out_dir)

        # NOTE(review): assumes hparams are saved on every chief/local run, not
        # only when the directory was just created — confirm intended nesting.
        misc_utils.save_hparams(out_dir, hparams)

        # Redirect stdout through the project logger.
        log_path = os.path.join(out_dir, 'logging.txt')
        sys.stdout = logger.Logger(log_path)

    if task_type == executor_utils.EVALUATOR:
        # The evaluator gets its own stdout log file.
        eval_log_path = os.path.join(out_dir, 'eval_log.txt')
        sys.stdout = logger.Logger(eval_log_path)

    hparams = misc_utils.extend_hparams(hparams)

    logging.info("***********DeText Training***********")

    # Train and evaluate the DeText model.
    train.train(hparams)
def run_detext(argument):
    """
    Launch the DeText training program.

    :param argument: argument object; converted with ``asdict`` into HParams
    :return: None
    """
    logging.set_verbosity(logging.INFO)
    logging.info(f"Args:\n {argument}")

    # Turn the argument object into the HParams container used downstream.
    argument_fields = asdict(argument)
    hparams = parsing_utils.HParams(**argument_fields)

    # Build the distribution strategy from the requested settings.
    strategy = distribution_utils.get_distribution_strategy(
        hparams.distribution_strategy,
        num_gpus=hparams.num_gpu,
        all_reduce_alg=hparams.all_reduce_alg,
    )
    logging.info(f"***********Num replica: {strategy.num_replicas_in_sync}***********")

    # Prepare the output directory, then persist the hyper-parameters into it.
    out_dir = hparams.out_dir
    create_output_dir(hparams.resume_training, out_dir, strategy)
    save_hparams(out_dir, hparams, strategy)

    logging.info("***********DeText Training***********")
    train.train(strategy, hparams)
def run_detext(argument: DetextArg):
    """
    Prepare the output directory and evaluator logger, then launch training.

    :param argument: DeText arguments; read for ``use_horovod``,
        ``resume_training`` and ``out_dir``, and converted into HParams
    :return: None
    """
    logging.info(f"Args:\n {argument}")
    tf.logging.set_verbosity(tf.logging.INFO)

    # Horovod is imported lazily so that it is only required when requested.
    hvd = None
    if argument.use_horovod:
        import horovod.tensorflow as hvd

    # The executor role is taken from TF_CONFIG.
    task_type = get_executor_task_type()

    # Without Horovod every process is the master; with it, only rank 0 is.
    # NOTE(review): hvd.rank() needs an initialized Horovod; no hvd.init() call
    # is visible in this function — confirm it happens before this point.
    master = hvd is None or hvd.rank() == 0

    out_dir = argument.out_dir
    is_chief_or_local = task_type in (CHIEF, LOCAL_MODE)

    if is_chief_or_local and master:
        # A fresh (non-resumed) run starts from an empty output directory.
        if not argument.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(out_dir):
                tf.gfile.DeleteRecursively(out_dir)

        # Create the directory if it was just deleted or never existed.
        if not tf.gfile.Exists(out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(
                out_dir))
            tf.gfile.MakeDirs(out_dir)

        # NOTE(review): assumes hparams are saved on every master run, not only
        # when the directory was just created — confirm intended nesting.
        misc_utils.save_hparams(out_dir, HParams(**argument._asdict()))
    else:
        # TODO: move removal/creation to a hadoopShellJob so that it does not
        # reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Give the chief time to (re)create the output directory.
        time.sleep(10)

    is_evaluator_or_local = task_type in (EVALUATOR, LOCAL_MODE)
    if is_evaluator_or_local and master:
        # Redirect stdout through the project logger for evaluation output.
        eval_log_path = os.path.join(out_dir, 'eval_log.txt')
        sys.stdout = logger.Logger(eval_log_path)

    hparams = misc_utils.extend_hparams(HParams(**argument._asdict()))

    logging.info("***********DeText Training***********")

    # Train and evaluate the DeText model.
    train.train(hparams, input_fn)
def main(_):
    """
    Prepare hyper-parameters, the output directory and loggers, then train.

    :param _: unused; hyper-parameters are obtained from ``get_hparams()``
    :return: None
    """
    # The executor role is taken from TF_CONFIG.
    task_type = executor_utils.get_executor_task_type()

    hparams = get_hparams()

    tf.logging.set_verbosity(tf.logging.INFO)

    # An explicit epoch count overrides the configured number of training steps.
    if hparams.num_epochs is not None:
        hparams.num_train_steps = misc_utils.estimate_train_steps(
            hparams.train_file,
            hparams.num_epochs,
            hparams.train_batch_size,
            hparams.metadata_path is None)

    # A requested number of evaluation rounds overrides steps_per_eval
    # (never less than one step between evaluations).
    if hparams.num_eval_rounds is not None:
        steps_between_evals = int(hparams.num_train_steps / hparams.num_eval_rounds)
        hparams.steps_per_eval = max(1, steps_between_evals)

    # For data sharding when using horovod.
    hparams.add_hparam("hvd_info", None)
    if hparams.use_horovod:
        import horovod.tensorflow as hvd
        hvd.init()
        world_size = hvd.size()
        # Step-based settings are divided across the Horovod processes.
        # NOTE(review): floor division yields 0 when a value is smaller than
        # hvd.size() — confirm downstream code tolerates zero steps.
        hparams.num_train_steps //= world_size
        hparams.num_warmup_steps //= world_size
        hparams.steps_per_eval //= world_size
        hparams.steps_per_stats //= world_size
        hparams.hvd_info = {'rank': hvd.rank(), 'size': world_size}

    # Without Horovod every process is the master; with it, only rank 0 is.
    is_master = not hparams.use_horovod or hvd.rank() == 0

    out_dir = hparams.out_dir
    is_chief_or_local = task_type in (executor_utils.CHIEF, executor_utils.LOCAL_MODE)

    if is_chief_or_local and is_master:
        # A fresh (non-resumed) run starts from an empty output directory.
        if not hparams.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(out_dir):
                tf.gfile.DeleteRecursively(out_dir)

        # Create the directory if it was just deleted or never existed.
        if not tf.gfile.Exists(out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(
                out_dir))
            tf.gfile.MakeDirs(out_dir)

        # NOTE(review): assumes hparams are saved on every master run, not only
        # when the directory was just created — confirm intended nesting.
        misc_utils.save_hparams(out_dir, hparams)
    else:
        # TODO: move removal/creation to a hadoopShellJob so that it does not
        # reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Give the chief time to (re)create the output directory.
        time.sleep(10)

    is_evaluator_or_local = task_type in (executor_utils.EVALUATOR, executor_utils.LOCAL_MODE)
    if is_evaluator_or_local and is_master:
        # Redirect stdout through the project logger for evaluation output.
        eval_log_path = os.path.join(out_dir, 'eval_log.txt')
        sys.stdout = logger.Logger(eval_log_path)

    hparams = misc_utils.extend_hparams(hparams)

    logging.info("***********DeText Training***********")

    # Train and evaluate the DeText model.
    train.train(hparams, input_fn)