Example 1
def main(_):
    learning_rate = FLAGS.learning_rate
    # is_mpi is True when the Horovod import succeeded (module-level
    # setup; see the sketch at the end of this example).
    if is_mpi:
        FLAGS.output_dir = FLAGS.output_dir if hvd.rank() == 0 else \
            os.path.join(FLAGS.output_dir, str(hvd.rank()))
        # Horovod: adjust number of steps based on number of CPUs.
        FLAGS.num_train_steps = FLAGS.num_train_steps // hvd.size()
        FLAGS.num_warmup_steps = FLAGS.num_warmup_steps // hvd.size()
        learning_rate = learning_rate * math.sqrt(hvd.size())

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    #tf.compat.v1.disable_eager_execution()

    #if FLAGS.train_batch_size >32 :
    #  if FLAGS.train_batch_size % 32 != 0 :
    #    raise ValueError("If Batch size is > 32 it should be divisible by 32")
    #  else :
    #    FLAGS.accum_steps = int(FLAGS.train_batch_size/32)
    #    FLAGS.train_batch_size  = 32

    logBatchSizeInfo(FLAGS)

    # Note: "bevior" is the spelling used where this flag is defined, so it
    # is preserved here.
    if FLAGS.disable_v2_bevior:
        tf.compat.v1.disable_v2_behavior()

    if FLAGS.profile:
        tf.compat.v1.disable_eager_execution()

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    bert_config.set_additional_options(FLAGS.precision,
                                       FLAGS.experimental_gelu)

    tf.io.gfile.makedirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.io.gfile.glob(input_pattern))

    tf.compat.v1.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.compat.v1.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    session_config = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
        allow_soft_placement=True)
    run_config = tf.compat.v1.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        session_config=session_config,
        tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host),
        log_step_count_steps=25)

    if bert_config.precision == "bfloat16":
        tf.compat.v1.logging.info("INFO: BERT bfloat16 training....!")
    else:
        tf.compat.v1.logging.info("INFO: BERT fp32 training....!")

    if bert_config.precision == "bfloat16":
        with tf.compat.v1.tpu.bfloat16_scope():
            bf.set_rprecision(tf.bfloat16)
            print("INFO: BERT bfloat16 training....!")
            model_fn = model_fn_builder(
                bert_config=bert_config,
                init_checkpoint=FLAGS.init_checkpoint,
                learning_rate=learning_rate,
                num_train_steps=FLAGS.num_train_steps,
                num_warmup_steps=FLAGS.num_warmup_steps,
                accum_steps=FLAGS.accum_steps,
                use_tpu=FLAGS.use_tpu,
                use_one_hot_embeddings=FLAGS.use_tpu,
                use_multi_cpu=is_mpi)
    else:
        print("INFO: BERT fp32 training....!")
        model_fn = model_fn_builder(bert_config=bert_config,
                                    init_checkpoint=FLAGS.init_checkpoint,
                                    learning_rate=learning_rate,
                                    num_train_steps=FLAGS.num_train_steps,
                                    num_warmup_steps=FLAGS.num_warmup_steps,
                                    accum_steps=FLAGS.accum_steps,
                                    use_tpu=FLAGS.use_tpu,
                                    use_one_hot_embeddings=FLAGS.use_tpu,
                                    use_multi_cpu=is_mpi)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size * FLAGS.accum_steps,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            batch_size=FLAGS.train_batch_size,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)
        # Horovod: in multi-CPU training with Horovod, the
        # hvd.BroadcastGlobalVariablesHook(0) hook broadcasts the initial
        # variable states from rank 0 to all other processes. This is
        # necessary to ensure consistent initialization of all workers when
        # training starts from random weights or restores from a checkpoint.
        if is_mpi:
            hooks = [hvd.BroadcastGlobalVariablesHook(0)]
        else:
            hooks = []

        if FLAGS.profile:
            tf.compat.v1.logging.info(
                "***** Running training with profiler*****")
            hooks.append(
                tf.compat.v1.train.ProfilerHook(save_steps=3,
                                                output_dir=FLAGS.output_dir,
                                                show_memory=False))

        estimator.train(input_fn=train_input_fn,
                        max_steps=FLAGS.num_train_steps,
                        hooks=hooks)

    if FLAGS.do_eval:
        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            max_seq_length=FLAGS.max_seq_length,
            batch_size=FLAGS.eval_batch_size,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
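
The is_mpi flag and hvd module that Example 1 uses are defined at module
level, outside this excerpt. A minimal sketch of that setup, assuming the
standard horovod.tensorflow API (the try/except guard is an assumption based
on the "Horovod import successful" comment):

# Module-level Horovod guard assumed by Example 1 (sketch, not original code).
try:
    import horovod.tensorflow as hvd
    hvd.init()       # one process per MPI rank
    is_mpi = True    # Horovod import successful
except ImportError:
    hvd = None
    is_mpi = False   # fall back to single-process training

With N = hvd.size() workers, Example 1 divides the step counts by N and
scales the learning rate by sqrt(N), a common heuristic for keeping
convergence stable as the effective batch size grows with the worker count.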
Example 2

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.compat.v1.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.compat.v1.logging.info("  name = %s, shape = %s" %
                                      (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = features["next_sentence_labels"]

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        opt_fine_tuning = True
        if bert_config.precision == "bfloat16":
            opt_fine_tuning = False

        if bert_config.precision == "bfloat16" and bert_config.new_bf16_scope:
            with tf.compat.v1.tpu.bfloat16_scope():
                bf.set_rprecision(tf.bfloat16)
                opt_fine_tuning = True
                tf.compat.v1.logging.info("*** New bfloat16 scope set***")
                model = modeling.BertModel(
                    config=bert_config,
                    is_training=is_training,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    token_type_ids=segment_ids,
                    use_one_hot_embeddings=use_one_hot_embeddings)
        else:
            model = modeling.BertModel(
                config=bert_config,
                is_training=is_training,
                input_ids=input_ids,
                input_mask=input_mask,
                token_type_ids=segment_ids,
                use_one_hot_embeddings=use_one_hot_embeddings)

        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_log_probs) = get_masked_lm_output(
             bert_config, model.get_sequence_output(),
             model.get_embedding_table(), masked_lm_positions, masked_lm_ids,
             masked_lm_weights)

        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs) = get_next_sentence_output(
             bert_config, model.get_pooled_output(), next_sentence_labels)

        total_loss = masked_lm_loss + next_sentence_loss

        tvars = tf.compat.v1.trainable_variables()

        initialized_variable_names = {}
        scaffold_fn = None
        if init_checkpoint:
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, init_checkpoint)
            if use_tpu:

                def tpu_scaffold():
                    tf.compat.v1.train.init_from_checkpoint(
                        init_checkpoint, assignment_map)
                    return tf.compat.v1.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                tf.compat.v1.train.init_from_checkpoint(
                    init_checkpoint, assignment_map)

        tf.compat.v1.logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            tf.compat.v1.logging.info("  name = %s, shape = %s%s", var.name,
                                      var.shape, init_string)

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                total_loss,
                learning_rate,
                num_train_steps,
                num_warmup_steps,
                accum_steps,
                use_tpu,
                fine_tuning=opt_fine_tuning)

            log_hook = bf.logTheLossHook(total_loss, FLAGS.accum_steps * 3)
            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                training_hooks=[log_hook],
                scaffold_fn=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                          masked_lm_ids, masked_lm_weights,
                          next_sentence_example_loss, next_sentence_log_probs,
                          next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                masked_lm_log_probs = tf.reshape(
                    masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
                masked_lm_predictions = tf.argmax(input=masked_lm_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss,
                                                    [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.compat.v1.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                next_sentence_log_probs = tf.reshape(
                    next_sentence_log_probs,
                    [-1, next_sentence_log_probs.shape[-1]])
                next_sentence_predictions = tf.argmax(
                    input=next_sentence_log_probs,
                    axis=-1,
                    output_type=tf.int32)
                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
                next_sentence_accuracy = tf.compat.v1.metrics.accuracy(
                    labels=next_sentence_labels,
                    predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.compat.v1.metrics.mean(
                    values=next_sentence_example_loss)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            eval_metrics = (metric_fn, [
                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                masked_lm_weights, next_sentence_example_loss,
                next_sentence_log_probs, next_sentence_labels
            ])
            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                eval_metrics=eval_metrics,
                scaffold_fn=scaffold_fn)
        else:
            raise ValueError("Only TRAIN and EVAL modes are supported: %s" %
                             (mode))

        return output_spec
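
model_fn above is a closure: bert_config, init_checkpoint, learning_rate,
num_train_steps, num_warmup_steps, accum_steps, use_tpu, and
use_one_hot_embeddings are arguments of an enclosing model_fn_builder that
this excerpt omits. A minimal sketch of that wrapper, inferred from the calls
in Examples 1 and 3 (the use_multi_cpu default is an assumption):

def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, accum_steps,
                     use_tpu, use_one_hot_embeddings, use_multi_cpu=False):
    """Returns the `model_fn` closure handed to TPUEstimator."""

    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        # ... body as shown in Example 2, reading the builder's arguments
        # from the enclosing scope ...
        raise NotImplementedError

    return model_fn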
Example 3

def main(_):
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    #if FLAGS.train_batch_size >32 :
    #  if FLAGS.train_batch_size % 32 != 0 :
    #    raise ValueError("If Batch size is > 32 it should be divisible by 32")
    #  else :
    #    FLAGS.accum_steps = int(FLAGS.train_batch_size/32)
    #    FLAGS.train_batch_size  = 32

    logBatchSizeInfo(FLAGS)
    if FLAGS.profile:
        tf.compat.v1.disable_eager_execution()

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.precision:
        bert_config.precision = FLAGS.precision

    bert_config.new_bf16_scope = FLAGS.new_bf16_scope

    tf.io.gfile.makedirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.io.gfile.glob(input_pattern))

    tf.compat.v1.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.compat.v1.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    session_config = tf.compat.v1.ConfigProto(
        inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads,
        intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads,
        allow_soft_placement=True)
    #print ("INTRA OPS :", FLAGS.intra_op_parallelism_threads)
    #print ("INTER OPS :", FLAGS.inter_op_parallelism_threads)

    is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.compat.v1.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        log_step_count_steps=100,
        session_config=session_config,
        tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    if bert_config.precision == "bfloat16":
        tf.compat.v1.logging.info("INFO: BERT bfloat16 training....!")
    else:
        tf.compat.v1.logging.info("INFO: BERT fp32 training....!")

    if bert_config.precision == "bfloat16" and not bert_config.new_bf16_scope:
        with tf.compat.v1.tpu.bfloat16_scope():
            bf.set_rprecision(tf.bfloat16)
            tf.compat.v1.logging.info("*** Old bfloat16 scope set***")
            model_fn = model_fn_builder(
                bert_config=bert_config,
                init_checkpoint=FLAGS.init_checkpoint,
                learning_rate=FLAGS.learning_rate,
                num_train_steps=FLAGS.num_train_steps,
                num_warmup_steps=FLAGS.num_warmup_steps,
                accum_steps=FLAGS.accum_steps,
                use_tpu=FLAGS.use_tpu,
                use_one_hot_embeddings=FLAGS.use_tpu)
    else:
        model_fn = model_fn_builder(bert_config=bert_config,
                                    init_checkpoint=FLAGS.init_checkpoint,
                                    learning_rate=FLAGS.learning_rate,
                                    num_train_steps=FLAGS.num_train_steps,
                                    num_warmup_steps=FLAGS.num_warmup_steps,
                                    accum_steps=FLAGS.accum_steps,
                                    use_tpu=FLAGS.use_tpu,
                                    use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size * FLAGS.accum_steps,
        eval_batch_size=FLAGS.eval_batch_size)

    if FLAGS.do_train:
        tf.compat.v1.logging.info("***** Running training *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        train_input_fn = input_fn_builder(
            input_files=input_files,
            batch_size=FLAGS.train_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=True)

        if FLAGS.profile:
            tf.compat.v1.logging.info(
                "***** Running training with profiler*****")
            hooks = [
                tf.compat.v1.train.ProfilerHook(save_steps=3,
                                                output_dir=FLAGS.output_dir,
                                                show_memory=True)
            ]
            estimator.train(input_fn=train_input_fn,
                            max_steps=FLAGS.num_train_steps,
                            hooks=hooks)
        else:
            estimator.train(input_fn=train_input_fn,
                            max_steps=FLAGS.num_train_steps)

    if FLAGS.do_eval:
        tf.compat.v1.logging.info("***** Running evaluation *****")
        tf.compat.v1.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_input_fn = input_fn_builder(
            input_files=input_files,
            batch_size=FLAGS.eval_batch_size,
            max_seq_length=FLAGS.max_seq_length,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
            is_training=False)

        result = estimator.evaluate(input_fn=eval_input_fn,
                                    steps=FLAGS.max_eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.io.gfile.GFile(output_eval_file, "w") as writer:
            tf.compat.v1.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.compat.v1.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
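
The input_fn_builder used by both main variants takes the batch size
explicitly rather than reading it from the TPUEstimator params. A sketch of
its interface, modeled on the reference BERT pretraining pipeline (the
shuffle buffer size and other details are assumptions):

def input_fn_builder(input_files, batch_size, max_seq_length,
                     max_predictions_per_seq, is_training):
    """Builds an `input_fn` streaming pretraining examples from TFRecords."""
    name_to_features = {
        "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        "masked_lm_positions":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_ids":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.int64),
        "masked_lm_weights":
            tf.io.FixedLenFeature([max_predictions_per_seq], tf.float32),
        "next_sentence_labels": tf.io.FixedLenFeature([1], tf.int64),
    }

    def _decode(record):
        example = tf.io.parse_single_example(
            serialized=record, features=name_to_features)
        # The reference pipeline casts int64 features to int32 for the model.
        return {name: tf.cast(t, tf.int32) if t.dtype == tf.int64 else t
                for name, t in example.items()}

    def input_fn(params):  # pylint: disable=unused-argument
        d = tf.data.TFRecordDataset(input_files)
        if is_training:
            d = d.repeat().shuffle(buffer_size=1000)
        return d.map(_decode).batch(batch_size, drop_remainder=True)

    return input_fn

Note the batch-size split: the estimator is constructed with
train_batch_size=FLAGS.train_batch_size * FLAGS.accum_steps, while the input
pipeline delivers FLAGS.train_batch_size examples per step, so the effective
global batch is the micro-batch times the gradient-accumulation steps.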
Example 4
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
        if config.precision == "bfloat16":
            bf.set_rprecision(tf.bfloat16)

        bf.set_global_flags(config.experimental_gelu)

        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        with tf.compat.v1.variable_scope(scope, default_name="bert"):
            with tf.compat.v1.variable_scope("embeddings"):
                # Perform embedding lookup on the word ids.
                (self.embedding_output,
                 self.embedding_table) = embedding_lookup(
                     input_ids=input_ids,
                     vocab_size=config.vocab_size,
                     embedding_size=config.hidden_size,
                     initializer_range=config.initializer_range,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=use_one_hot_embeddings)

                # Add positional embeddings and token type embeddings, then layer
                # normalize and perform dropout.
                self.embedding_output = embedding_postprocessor(
                    input_tensor=self.embedding_output,
                    use_token_type=True,
                    token_type_ids=token_type_ids,
                    token_type_vocab_size=config.type_vocab_size,
                    token_type_embedding_name="token_type_embeddings",
                    use_position_embeddings=True,
                    position_embedding_name="position_embeddings",
                    initializer_range=config.initializer_range,
                    max_position_embeddings=config.max_position_embeddings,
                    dropout_prob=config.hidden_dropout_prob)

            with tf.compat.v1.variable_scope("encoder"):
                # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
                # mask of shape [batch_size, seq_length, seq_length] which is used
                # for the attention scores.
                attention_mask = create_attention_mask_from_input_mask(
                    input_ids, input_mask)

                # Run the stacked transformer.
                # `sequence_output` shape = [batch_size, seq_length, hidden_size].
                # Cast to the registered precision to cover the bfloat16 path.
                input_tensor = bf.r_cast(self.embedding_output)
                self.all_encoder_layers = transformer_model(
                    input_tensor=input_tensor,
                    attention_mask=attention_mask,
                    hidden_size=config.hidden_size,
                    num_hidden_layers=config.num_hidden_layers,
                    num_attention_heads=config.num_attention_heads,
                    intermediate_size=config.intermediate_size,
                    intermediate_act_fn=get_activation(config.hidden_act),
                    hidden_dropout_prob=config.hidden_dropout_prob,
                    attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                    initializer_range=config.initializer_range,
                    do_return_all_layers=True)

            self.sequence_output = self.all_encoder_layers[-1]
            # The "pooler" converts the encoded sequence tensor of shape
            # [batch_size, seq_length, hidden_size] to a tensor of shape
            # [batch_size, hidden_size]. This is necessary for segment-level
            # (or segment-pair-level) classification tasks where we need a fixed
            # dimensional representation of the segment.
            with tf.compat.v1.variable_scope("pooler"):
                # We "pool" the model by simply taking the hidden state corresponding
                # to the first token. We assume that this has been pre-trained
                first_token_tensor = tf.squeeze(self.sequence_output[:,
                                                                     0:1, :],
                                                axis=1)
                self.pooled_output = tf.compat.v1.layers.dense(
                    first_token_tensor,
                    config.hidden_size,
                    activation=bf.tanh,
                    kernel_initializer=create_initializer(
                        config.initializer_range))
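
For context, a usage sketch in the spirit of the upstream BertModel
docstring, run under the same compat.v1/graph setup as the examples above.
The config values are illustrative, and the two extra fields this variant
reads (precision, experimental_gelu) are set explicitly since a hand-built
BertConfig may not carry them:

# Inputs already converted to WordPiece token ids (toy values).
input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.constant([[0, 0, 1], [0, 1, 0]])

config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
                             num_hidden_layers=8, num_attention_heads=8,
                             intermediate_size=1024)
config.precision = "fp32"          # assumed default for this variant
config.experimental_gelu = False   # consumed via bf.set_global_flags above

model = modeling.BertModel(config=config, is_training=True,
                           input_ids=input_ids, input_mask=input_mask,
                           token_type_ids=token_type_ids)

pooled_output = model.get_pooled_output()      # [batch_size, hidden_size]
sequence_output = model.get_sequence_output()  # [batch_size, seq_length, hidden_size]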