Example #1
0
def main(argv):
    """
    Entry point for training the DeText model.

    Resolves the executor task type, builds the hyper-parameters, prepares the
    output directory (chief / local mode only), redirects ``sys.stdout`` to a
    log file inside the output directory and finally calls ``train.train``.

    :param argv: training parameters, forwarded unchanged to ``get_hparams``
    :return: None
    """
    # Task type of this executor, as reported by executor_utils
    task_type = executor_utils.get_executor_task_type()

    # Hyper-parameters built from the given arguments
    hparams = get_hparams(argv)

    tf.logging.set_verbosity(tf.logging.INFO)

    # An explicit epoch count takes precedence over the configured number of training steps
    if hparams.num_epochs is not None:
        hparams.num_train_steps = misc_utils.estimate_train_steps(
            hparams.train_file,
            hparams.num_epochs,
            hparams.train_batch_size,
            hparams.metadata_path is None)

    # Only the chief (or a local run) is allowed to touch the output directory
    if task_type in (executor_utils.CHIEF, executor_utils.LOCAL_MODE):
        out_dir = hparams.out_dir

        # A fresh (non-resumed) run starts from an empty output directory
        if not hparams.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(out_dir):
                tf.gfile.DeleteRecursively(out_dir)

        # (Re)create the directory if it was just removed or never existed
        if not tf.gfile.Exists(out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(out_dir))
            tf.gfile.MakeDirs(out_dir)

        misc_utils.save_hparams(out_dir, hparams)

        # Route stdout through the file-backed logger
        sys.stdout = logger.Logger(os.path.join(out_dir, 'logging.txt'))
    else:
        # TODO: move removal/creation to a hadoopShellJob st. it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Give the chief a fixed amount of time to finish preparing the directory
        time.sleep(10)

    # The evaluator writes its stdout to a dedicated log file
    if task_type == executor_utils.EVALUATOR:
        sys.stdout = logger.Logger(os.path.join(hparams.out_dir, 'eval_log.txt'))

    hparams = misc_utils.extend_hparams(hparams)

    logging.info("***********DeText Training***********")

    # Train and evaluate DeText model
    train.train(hparams)
Example #2
0
def run_detext(argument):
    """Launches DeText training program.

    Converts ``argument`` into an ``HParams`` object, obtains the distribution
    strategy, prepares the output directory, saves the hyper-parameters and
    starts training.

    :param argument: argument object; it is converted to a dict with ``asdict``
    :return: None
    """
    logging.set_verbosity(logging.INFO)
    logging.info(f"Args:\n {argument}")

    hparams = parsing_utils.HParams(**asdict(argument))

    # Distribution strategy selected by the hyper-parameters
    strategy = distribution_utils.get_distribution_strategy(
        hparams.distribution_strategy,
        num_gpus=hparams.num_gpu,
        all_reduce_alg=hparams.all_reduce_alg,
    )
    logging.info(f"***********Num replica: {strategy.num_replicas_in_sync}***********")

    # Prepare the output directory, then store the hyper-parameters in it
    out_dir = hparams.out_dir
    create_output_dir(hparams.resume_training, out_dir, strategy)
    save_hparams(out_dir, hparams, strategy)

    logging.info("***********DeText Training***********")
    train.train(strategy, hparams)
Example #3
0
def run_detext(argument: DetextArg):
    """
    Launches DeText training from a ``DetextArg`` object.

    The chief (or local-mode) process that is also the horovod master prepares
    the output directory and saves the hyper-parameters; every other process
    sleeps for a fixed time instead. The evaluator / local-mode master then
    redirects ``sys.stdout`` to ``eval_log.txt`` before training starts.

    :param argument: training arguments; converted to ``HParams`` via ``_asdict()``
    :return: None
    """
    logging.info(f"Args:\n {argument}")

    tf.logging.set_verbosity(tf.logging.INFO)

    # For data sharding when using horovod; hvd stays None when horovod is disabled
    hvd = None
    if argument.use_horovod:
        import horovod.tensorflow as hvd

    # Task type of this executor
    task_type = get_executor_task_type()

    # Without horovod every process counts as master; with horovod only rank 0 does
    is_master = hvd is None or hvd.rank() == 0

    if is_master and task_type in (CHIEF, LOCAL_MODE):
        out_dir = argument.out_dir

        # A fresh (non-resumed) run starts from an empty output directory
        if not argument.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(out_dir):
                tf.gfile.DeleteRecursively(out_dir)

        # (Re)create the directory if it was just removed or never existed
        if not tf.gfile.Exists(out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(out_dir))
            tf.gfile.MakeDirs(out_dir)

        misc_utils.save_hparams(out_dir, HParams(**argument._asdict()))
    else:
        # TODO: move removal/creation to a hadoopShellJob st. it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Give the chief a fixed amount of time to finish preparing the directory
        time.sleep(10)

    if is_master and task_type in (EVALUATOR, LOCAL_MODE):
        # Route stdout of the evaluating process through the file-backed logger
        eval_log_path = os.path.join(argument.out_dir, 'eval_log.txt')
        sys.stdout = logger.Logger(eval_log_path)

    hparams = misc_utils.extend_hparams(HParams(**argument._asdict()))

    logging.info("***********DeText Training***********")

    # Train and evaluate DeText model
    train.train(hparams, input_fn)
Example #4
0
def main(_):
    """
    This is the main method for training the model.

    Builds the hyper-parameters, derives step counts from epochs / evaluation
    rounds, rescales the step counts per horovod worker, prepares the output
    directory on the master process and starts training.

    :param _: unused; hyper-parameters are obtained from ``get_hparams()``
    :return: None
    """
    # Get executor task type from TF_CONFIG
    task_type = executor_utils.get_executor_task_type()

    # Get hyper-parameters.
    hparams = get_hparams()

    tf.logging.set_verbosity(tf.logging.INFO)

    # if epoch is set, overwrite training steps
    if hparams.num_epochs is not None:
        hparams.num_train_steps = misc_utils.estimate_train_steps(
            hparams.train_file, hparams.num_epochs, hparams.train_batch_size,
            hparams.metadata_path is None)

    # if num_eval_rounds is set, override steps_per_eval
    if hparams.num_eval_rounds is not None:
        hparams.steps_per_eval = max(
            1, int(hparams.num_train_steps / hparams.num_eval_rounds))

    # For data sharding when using horovod
    hparams.add_hparam("hvd_info", None)
    # Without horovod every process counts as master; with horovod only rank 0 does
    is_master = True
    if hparams.use_horovod:
        import horovod.tensorflow as hvd
        hvd.init()
        world_size = hvd.size()
        rank = hvd.rank()
        # Each worker only runs its share of the global step budget
        hparams.num_train_steps = hparams.num_train_steps // world_size
        hparams.num_warmup_steps = hparams.num_warmup_steps // world_size
        # Floor division can yield 0 when the interval is smaller than the
        # number of workers; keep both intervals at least 1, matching the
        # lower bound already applied to steps_per_eval above.
        hparams.steps_per_eval = max(1, hparams.steps_per_eval // world_size)
        hparams.steps_per_stats = max(1, hparams.steps_per_stats // world_size)
        hparams.hvd_info = {'rank': rank, 'size': world_size}
        is_master = rank == 0

    # Create directory and launch tensorboard
    if is_master and task_type in (executor_utils.CHIEF,
                                   executor_utils.LOCAL_MODE):
        # If not resume training from checkpoints, delete output directory.
        if not hparams.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(hparams.out_dir):
                tf.gfile.DeleteRecursively(hparams.out_dir)

        # If output directory deleted or does not exist, create the directory.
        if not tf.gfile.Exists(hparams.out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(
                hparams.out_dir))
            tf.gfile.MakeDirs(hparams.out_dir)

        misc_utils.save_hparams(hparams.out_dir, hparams)

    else:
        # TODO: move removal/creation to a hadoopShellJob st. it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Wait for the directory to be created by the chief
        time.sleep(10)

    if is_master and task_type in (executor_utils.EVALUATOR,
                                   executor_utils.LOCAL_MODE):
        # set up logger for evaluator
        sys.stdout = logger.Logger(
            os.path.join(hparams.out_dir, 'eval_log.txt'))

    hparams = misc_utils.extend_hparams(hparams)

    logging.info("***********DeText Training***********")

    # Train and evaluate DeText model
    train.train(hparams, input_fn)