Example 1
def main(argv):
    """
    This is the main method for training the model.
    :param argv: training parameters
    :return:
    """
    # Get executor task type from TF_CONFIG
    task_type = executor_utils.get_executor_task_type()

    # Get hyper-parameters.
    hparams = get_hparams(argv)

    tf.logging.set_verbosity(tf.logging.INFO)

    # If num_epochs is set, override num_train_steps
    if hparams.num_epochs is not None:
        hparams.num_train_steps = misc_utils.estimate_train_steps(
            hparams.train_file,
            hparams.num_epochs,
            hparams.train_batch_size,
            hparams.metadata_path is None)

    # Create directory and launch tensorboard
    if task_type == executor_utils.CHIEF or task_type == executor_utils.LOCAL_MODE:
        # If not resuming training from checkpoints, delete the output directory.
        if not hparams.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(hparams.out_dir):
                tf.gfile.DeleteRecursively(hparams.out_dir)

        # If output directory deleted or does not exist, create the directory.
        if not tf.gfile.Exists(hparams.out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(hparams.out_dir))
            tf.gfile.MakeDirs(hparams.out_dir)

        misc_utils.save_hparams(hparams.out_dir, hparams)

        # set up logger
        sys.stdout = logger.Logger(os.path.join(hparams.out_dir, 'logging.txt'))
    else:
        # TODO: move removal/creation to a hadoopShellJob so that it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Wait for the directory to be created by the chief
        time.sleep(10)

    if task_type == executor_utils.EVALUATOR:
        # set up logger for evaluator
        sys.stdout = logger.Logger(os.path.join(hparams.out_dir, 'eval_log.txt'))

    hparams = misc_utils.extend_hparams(hparams)

    logging.info("***********DeText Training***********")

    # Train and evaluate DeText model
    train.train(hparams)
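
The example above branches on an executor task type derived from the TF_CONFIG environment variable. As a rough illustration of what executor_utils.get_executor_task_type() might do, a minimal sketch follows; the constant values and the fallback to local mode are assumptions for illustration, not the actual DeText implementation.

import json
import os

# Hypothetical task-type constants mirroring executor_utils.CHIEF / EVALUATOR / LOCAL_MODE
CHIEF = 'chief'
EVALUATOR = 'evaluator'
LOCAL_MODE = 'local'

def get_executor_task_type_sketch():
    """Read the task type from TF_CONFIG ({"task": {"type": ..., "index": ...}});
    fall back to LOCAL_MODE when TF_CONFIG is not set (single-process training)."""
    tf_config = os.environ.get('TF_CONFIG')
    if not tf_config:
        return LOCAL_MODE
    return json.loads(tf_config).get('task', {}).get('type', LOCAL_MODE)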
Example 2
def run_detext(argument: DetextArg):
    logging.info(f"Args:\n {argument}")

    tf.logging.set_verbosity(tf.logging.INFO)

    # For data sharding when using horovod
    if argument.use_horovod:
        import horovod.tensorflow as hvd
    else:
        hvd = None

    # Get executor task type from TF_CONFIG
    task_type = get_executor_task_type()

    # Create directory and launch tensorboard
    master = not hvd or hvd.rank() == 0
    if (task_type == CHIEF or task_type == LOCAL_MODE) and master:
        # If not resuming training from checkpoints, delete the output directory.
        if not argument.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(argument.out_dir):
                tf.gfile.DeleteRecursively(argument.out_dir)

        # If output directory deleted or does not exist, create the directory.
        if not tf.gfile.Exists(argument.out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(
                argument.out_dir))
            tf.gfile.MakeDirs(argument.out_dir)

        misc_utils.save_hparams(argument.out_dir,
                                HParams(**argument._asdict()))

    else:
        # TODO: move removal/creation to a hadoopShellJob so that it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Wait for the directory to be created by the chief
        time.sleep(10)

    if (task_type == EVALUATOR or task_type == LOCAL_MODE) and master:
        # set up logger for evaluator
        sys.stdout = logger.Logger(
            os.path.join(argument.out_dir, 'eval_log.txt'))

    hparams = misc_utils.extend_hparams(HParams(**argument._asdict()))

    logging.info("***********DeText Training***********")

    # Train and evaluate DeText model
    train.train(hparams, input_fn)
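
Example 2 gates directory setup on both the TF_CONFIG task type and the Horovod rank. A minimal standalone sketch of the rank-0 ("master") gating, assuming Horovod is installed and one process is launched per worker, might look like this:

import horovod.tensorflow as hvd

hvd.init()                 # must be called once per process before hvd.rank()/hvd.size()
master = hvd.rank() == 0   # only the first worker touches the output directory
if master:
    print('rank 0: creating output directory and saving hparams')
else:
    print('rank {}: waiting for the chief to create directories'.format(hvd.rank()))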
Example 3
def main(_):
    """
    This is the main method for training the model.
    :param _: unused; hyper-parameters are read via get_hparams()
    :return:
    """
    # Get executor task type from TF_CONFIG
    task_type = executor_utils.get_executor_task_type()

    # Get hyper-parameters.
    hparams = get_hparams()

    tf.logging.set_verbosity(tf.logging.INFO)

    # If num_epochs is set, override num_train_steps
    if hparams.num_epochs is not None:
        hparams.num_train_steps = misc_utils.estimate_train_steps(
            hparams.train_file, hparams.num_epochs, hparams.train_batch_size,
            hparams.metadata_path is None)

    # if num_eval_rounds is set, override steps_per_eval
    if hparams.num_eval_rounds is not None:
        hparams.steps_per_eval = max(
            1, int(hparams.num_train_steps / hparams.num_eval_rounds))

    # For data sharding when using horovod
    hparams.add_hparam("hvd_info", None)
    if hparams.use_horovod:
        import horovod.tensorflow as hvd
        hvd.init()
        hparams.num_train_steps = hparams.num_train_steps // hvd.size()
        hparams.num_warmup_steps = hparams.num_warmup_steps // hvd.size()
        hparams.steps_per_eval = hparams.steps_per_eval // hvd.size()
        hparams.steps_per_stats = hparams.steps_per_stats // hvd.size()
        hparams.hvd_info = {'rank': hvd.rank(), 'size': hvd.size()}

    # Create directory and launch tensorboard
    if ((task_type == executor_utils.CHIEF
         or task_type == executor_utils.LOCAL_MODE)
            and (not hparams.use_horovod or
                 (hparams.use_horovod and hvd.rank() == 0))):
        # If not resuming training from checkpoints, delete the output directory.
        if not hparams.resume_training:
            logging.info("Removing previous output directory...")
            if tf.gfile.Exists(hparams.out_dir):
                tf.gfile.DeleteRecursively(hparams.out_dir)

        # If output directory deleted or does not exist, create the directory.
        if not tf.gfile.Exists(hparams.out_dir):
            logging.info('Creating dirs recursively at: {0}'.format(
                hparams.out_dir))
            tf.gfile.MakeDirs(hparams.out_dir)

        misc_utils.save_hparams(hparams.out_dir, hparams)

    else:
        # TODO: move removal/creation to a hadoopShellJob so that it does not reside in distributed training code.
        logging.info("Waiting for chief to remove/create directories.")
        # Wait for the directory to be created by the chief
        time.sleep(10)

    if ((task_type == executor_utils.EVALUATOR
         or task_type == executor_utils.LOCAL_MODE)
            and (not hparams.use_horovod or
                 (hparams.use_horovod and hvd.rank() == 0))):
        # set up logger for evaluator
        sys.stdout = logger.Logger(
            os.path.join(hparams.out_dir, 'eval_log.txt'))

    hparams = misc_utils.extend_hparams(hparams)

    logging.info("***********DeText Training***********")

    # Train and evaluate DeText model
    train.train(hparams, input_fn)
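
The Horovod branch in Example 3 shards the work by dividing every step count by hvd.size(), so each worker runs only its share of a global pass. A small arithmetic sketch with illustrative numbers (not DeText defaults):

num_train_steps = 100000
steps_per_eval = 5000
world_size = 4  # e.g. hvd.size() with 4 workers

print(num_train_steps // world_size)  # 25000 training steps per worker
print(steps_per_eval // world_size)   # 1250 steps between evaluations per worker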
Example 4
def train(hparams, input_fn):
    """
    Main function for train/evaluate DeText ranking model
    :param hparams: hparams
    :param input_fn: input function to create train/eval specs
    :return:
    """
    eval_log_file = None
    if hparams.use_horovod is True:
        import horovod.tensorflow as hvd
        eval_log_file = path_join(hparams.out_dir, 'eval_log.txt')
    train_strategy = tf.contrib.distribute.ParameterServerStrategy()
    estimator = get_estimator(hparams, strategy=train_strategy)

    # Set model export config for evaluator or primary worker of horovod
    exporter_list = None
    if hparams.use_horovod is False or (hparams.use_horovod is True
                                        and hvd.rank() == 0):
        best_model_name = 'best_' + hparams.pmetric
        # Exporter to save best (in terms of pmetric) checkpoint in the folder [best_model_name],
        # and export to savedmodel for prediction.
        best_checkpoint_exporter = BestCheckpointCopier(
            name=best_model_name,
            serving_input_receiver_fn=lambda: serving_input_fn(hparams),
            checkpoints_to_keep=1,  # keeping the best checkpoint
            exports_to_keep=1,  # keeping the best savedmodel
            pmetric='metric/{}'.format(hparams.pmetric),
            compare_fn=lambda x, y: x.score > y.score,  # larger metric better
            sort_reverse=True,
            eval_log_file=eval_log_file)
        exporter_list = [best_checkpoint_exporter]

    # Handle sync distributed training case via use_horovod
    if hparams.use_horovod:
        import horovod.tensorflow as hvd

        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states from
        # rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights or
        # restored from a checkpoint.
        bcast_hook = hvd.BroadcastGlobalVariablesHook(0)

    # Create TrainSpec for model training
    train_spec = tf.estimator.TrainSpec(
        input_fn=lambda: input_fn(
            input_pattern=hparams.train_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.train_batch_size,
            mode=tf.estimator.ModeKeys.TRAIN,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(
                hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes)
            if hparams.ftr_ext == 'cnn' else 0,
            # Add horovod information if applicable
            hvd_info=hparams.hvd_info if hparams.use_horovod else None),
        # Ensure proper initialization with Horovod
        hooks=[bcast_hook] if hparams.use_horovod else None,
        max_steps=hparams.num_train_steps)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: input_fn(
            input_pattern=hparams.dev_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.test_batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(
                hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes)
            if hparams.ftr_ext == 'cnn' else 0),
        exporters=exporter_list,
        steps=None,
        # Set throttle_secs to 10 min to avoid warning to spam logs
        # Set throttle to 0 for horovod: https://github.com/horovod/horovod/issues/182#issuecomment-533897757
        throttle_secs=0 if hparams.use_horovod else 600,
        start_delay_secs=10)

    # Training and evaluation with dev set
    tf.estimator.train_and_evaluate(estimator=estimator,
                                    train_spec=train_spec,
                                    eval_spec=eval_spec)
    print("***** Training finished. *****")

    # Evaluation with test set: create an estimator with the best_checkpoint_dir to load the best model
    task_type = executor_utils.get_executor_task_type()
    do_evaluate = task_type == executor_utils.EVALUATOR or task_type == executor_utils.LOCAL_MODE
    if (not hparams.use_horovod and do_evaluate) or (hparams.use_horovod
                                                     and hvd.rank() == 0):
        best_checkpoint_dir = path_join(hparams.out_dir, best_model_name)
        estimator_savedmodel = get_estimator(
            hparams,
            strategy=train_strategy,
            best_checkpoint=best_checkpoint_dir)
        result = estimator_savedmodel.evaluate(input_fn=lambda: input_fn(
            input_pattern=hparams.test_file,
            metadata_path=hparams.metadata_path,
            batch_size=hparams.test_batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            vocab_table=vocab_utils.read_tf_vocab(hparams.vocab_file, hparams.UNK),
            vocab_table_for_id_ftr=vocab_utils.read_tf_vocab(
                hparams.vocab_file_for_id_ftr, hparams.UNK_FOR_ID_FTR),
            feature_names=hparams.feature_names,
            CLS=hparams.CLS,
            SEP=hparams.SEP,
            PAD=hparams.PAD,
            PAD_FOR_ID_FTR=hparams.PAD_FOR_ID_FTR,
            max_len=hparams.max_len,
            min_len=hparams.min_len,
            cnn_filter_window_size=max(hparams.filter_window_sizes)
            if hparams.ftr_ext == 'cnn' else 0))
        print("\n***** Evaluation on test set with best exported model: *****")
        for key in sorted(result.keys()):
            print("%s = %s" % (key, str(result[key])))
Example 5
def create_optimizer(hparams, loss):
    """
    Creates an optimizer training op.
    If the parameter lr_bert is specified, then use another adam for this learning rate.
    """
    tvars = tf.trainable_variables()

    if hparams.use_horovod:
        import horovod.tensorflow as hvd

    # Log trainable variables (in local mode, as chief for PS strategy, or as rank 0 for Horovod training)
    task_type = executor_utils.get_executor_task_type()
    if (hparams.use_horovod is False and task_type in [executor_utils.CHIEF, executor_utils.LOCAL_MODE]) or \
            (hparams.use_horovod is True and hvd.rank() == 0):
        with tf.gfile.Open(path_join(hparams.out_dir, 'network_structure.txt'),
                           'w') as fout:
            fout.write("# Trainable variables\n")
            total_deep_params = 0
            total_params = 0
            for param in tvars:
                psize = 1
                for s in param.get_shape():
                    psize *= s
                total_params += psize
                if param.name.startswith(hparams.ftr_ext):
                    total_deep_params += psize
                fout.write(
                    "  %s, %s, %s\n" %
                    (param.name, str(param.get_shape()), param.op.device))
            fout.write('\n')
            fout.write('# Total trainable params: {}\n'.format(total_params))
            fout.write(
                '# Out of the total trainable params, the total {} parameters: {}\n'
                .format(hparams.ftr_ext, total_deep_params))

    # Define optimizer parameters
    init_lr = hparams.learning_rate
    num_train_steps = hparams.num_train_steps
    num_warmup_steps = hparams.num_warmup_steps
    lr_bert = hparams.lr_bert

    global_step = tf.train.get_or_create_global_step()

    learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

    if hparams.optimizer.startswith("bert_"):
        # Using optimizer with bert's implementation
        # Implements linear decay of the learning rate.
        learning_rate = tf.train.polynomial_decay(learning_rate,
                                                  global_step,
                                                  num_train_steps,
                                                  end_learning_rate=0.0,
                                                  power=1.0,
                                                  cycle=False)

        # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
        # learning rate will be `global_step/num_warmup_steps * init_lr`.
        if num_warmup_steps:
            global_steps_int = tf.cast(global_step, tf.int32)
            warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

            global_steps_float = tf.cast(global_steps_int, tf.float32)
            warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

            warmup_percent_done = global_steps_float / warmup_steps_float
            warmup_learning_rate = init_lr * warmup_percent_done

            is_warmup = tf.cast(global_steps_int < warmup_steps_int,
                                tf.float32)
            learning_rate = ((1.0 - is_warmup) * learning_rate +
                             is_warmup * warmup_learning_rate)

        name_2_optimizer = {
            'bert_adam': AdamWeightDecayOptimizer,
            'bert_lamb': LAMBOptimizer
        }
        OptimizerFunc = name_2_optimizer[hparams.optimizer]

        # It is recommended that you use this optimizer for fine tuning, since this
        # is how the model was trained (note that the Adam/Lamb m/v variables are NOT
        # loaded from init_checkpoint.)
        optimizer = OptimizerFunc(
            learning_rate=learning_rate,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

        if hparams.use_horovod:
            # Horovod's distributed optimizer handles allreduce calls, synchronous only
            optimizer = hvd.DistributedOptimizer(optimizer,
                                                 sparse_as_dense=True)
            grads_and_vars = optimizer.compute_gradients(loss, tvars)
            grads = [grad for grad, var in grads_and_vars]
            tvars = [var for grad, var in grads_and_vars]
        else:
            grads = tf.gradients(loss, tvars)

        grads, grad_norm = tf.clip_by_global_norm(grads, clip_norm=1.0)

        if lr_bert is None:
            # If a separate learning rate for BERT (lr_bert) is not specified,
            # all components use the same learning rate.
            train_op = optimizer.apply_gradients(zip(grads, tvars),
                                                 global_step=global_step)

            # Normally the global step update is done inside of `apply_gradients`.
            # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
            # a different optimizer, you should probably take this line out.
            new_global_step = global_step + 1
            train_op = tf.group(train_op,
                                [global_step.assign(new_global_step)])
        else:
            # The BERT components use a separate learning rate (lr_bert)
            optimizer_bert = OptimizerFunc(
                learning_rate=learning_rate * lr_bert / init_lr,
                weight_decay_rate=0.01,
                beta_1=0.9,
                beta_2=0.999,
                epsilon=1e-6,
                exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
            if hparams.use_horovod:
                # Treat the bert optimizer the same as the original optimizer: wrapped with horovod
                optimizer_bert = hvd.DistributedOptimizer(optimizer_bert,
                                                          sparse_as_dense=True)

            bert_grad, bert_tvars = [], []
            other_grad, other_tvars = [], []
            for grad, tvar in zip(grads, tvars):
                if tvar is not None and grad is not None:
                    if tvar.name.startswith('bert'):
                        bert_grad.append(grad)
                        bert_tvars.append(tvar)
                        print('****bert param:', tvar.name)
                    else:
                        other_grad.append(grad)
                        other_tvars.append(tvar)
                        print('****other param:', tvar.name)
            print('--------------\n', '# of bert', len(bert_grad),
                  '# of other', len(other_grad), '\n--------------')
            bert_train_op = optimizer_bert.apply_gradients(
                zip(bert_grad, bert_tvars), global_step=global_step)
            other_train_op = optimizer.apply_gradients(
                zip(other_grad, other_tvars), global_step=global_step)

            new_global_step = global_step + 1
            train_op = tf.group(bert_train_op, other_train_op,
                                [global_step.assign(new_global_step)])

        return train_op, grad_norm, learning_rate

    elif hparams.optimizer == "sgd":
        opt = tf.train.GradientDescentOptimizer(learning_rate)
    elif hparams.optimizer == "adam":
        opt = tf.train.AdamOptimizer(learning_rate)
    else:
        raise ValueError("Only sgd/adam/bert_adam/bert_lamb are supported as optimizer options")

    # Gradients
    gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=True)
    clipped_gradients, grad_norm = tf.clip_by_global_norm(
        gradients, hparams.max_gradient_norm)
    train_op = opt.apply_gradients(zip(clipped_gradients, tvars),
                                   global_step=global_step)

    return train_op, grad_norm, learning_rate
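
For the bert_* optimizers, the schedule built above is a linear warmup for num_warmup_steps followed by linear (polynomial, power=1.0) decay to zero at num_train_steps. A pure-Python sketch of the resulting learning rate, with illustrative numbers only:

def bert_lr(step, init_lr=1e-3, num_train_steps=10000, num_warmup_steps=1000):
    if step < num_warmup_steps:
        return init_lr * step / num_warmup_steps      # linear warmup
    return init_lr * (1.0 - step / num_train_steps)   # linear decay to 0.0

for step in (0, 500, 1000, 5000, 10000):
    print(step, bert_lr(step))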