Example 1
def run_model(total_gpu_num):
    """Train model."""
    with epl.replicate(total_gpu_num):
        iterator = get_mock_iterator()
        images, labels = iterator.get_next()
        features = resnet_v1.resnet_v1_50(images,
                                          num_classes=None,
                                          is_training=True)[0]
        features = tf.squeeze(features, [1, 2])

    with epl.split(total_gpu_num):
        logits = tf.layers.dense(features, class_num)
        loss = tf.losses.sparse_softmax_cross_entropy(labels=labels,
                                                      logits=logits)

    global_step = tf.train.get_or_create_global_step()
    optimizer = tf.train.AdamOptimizer(learning_rate=0.9)
    train_op = optimizer.minimize(loss, global_step=global_step)

    hooks = [tf.train.StopAtStepHook(last_step=20)]
    with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
        while not sess.should_stop():
            starttime = time.time()
            _, _, step = sess.run([loss, train_op, global_step])
            endtime = time.time()
            tf.logging.info("[Iteration {} ], Time: {:.4} .".format(
                step, endtime - starttime))
    tf.logging.info("[Finished]")
Example 2
def main(argv):
    config = epl.Config({"cluster.colocate_split_and_replicate": True})
    epl.init(config)
    FLAGS.worker_id = epl.Env.get().cluster.worker_index
    FLAGS.worker_gpu = epl.Env.get().cluster.total_gpu_num
    epl.set_default_strategy(epl.replicate(FLAGS.worker_gpu))

    # Create HParams.
    if argv:
        set_hparams_from_args(argv[1:])
    if FLAGS.schedule != "run_std_server":
        hparams = create_hparams()

    if FLAGS.schedule == "train":
        mlperf_log.transformer_print(key=mlperf_log.RUN_START)
    else:
        raise RuntimeError(
            "Only the 'train' schedule is supported for now; tasks for other "
            "schedules must be defined separately.")
    trainer_lib.set_random_seed(FLAGS.random_seed)

    hparams.add_hparam("data_dir", FLAGS.data_dir)
    hparams.add_hparam("schedule", FLAGS.schedule)
    hparams.add_hparam("train_steps", FLAGS.train_steps)
    hparams.add_hparam("warm_start_from", None)
    trainer_lib.add_problem_hparams(hparams, FLAGS.problem)

    # Dataset generation.
    if FLAGS.generate_data:
        generate_data()

    def model_fn_replicate(features, labels, mode):
        model_fn = t2t_model.T2TModel.make_estimator_model_fn(
            FLAGS.model, hparams)
        return model_fn(features, labels, mode)

    if is_chief():
        save_metadata(hparams)

    estimator = tf.estimator.Estimator(model_fn=model_fn_replicate,
                                       config=create_run_config())
    hooks = []
    hooks.append(
        tf.train.StepCounterHook(every_n_steps=FLAGS.log_step_count_steps))

    optimize.log_variable_sizes(verbose=True)

    problem = hparams.problem
    train_input_fn = problem.make_estimator_input_fn(
        tf.estimator.ModeKeys.TRAIN, hparams)

    estimator.train(train_input_fn, max_steps=hparams.train_steps, hooks=hooks)
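
Example 2 leans on tensor2tensor helpers (create_hparams, create_run_config, generate_data, is_chief, save_metadata) defined elsewhere in the training script. As a rough sketch of the kind of object create_run_config is expected to return (the real helper configures much more, such as session options and checkpoint policy; the flag names and defaults below are assumptions):

def create_run_config():
    # Minimal hypothetical stand-in for the real helper: a plain
    # tf.estimator.RunConfig with a model directory and logging cadence.
    return tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=1000,
        log_step_count_steps=FLAGS.log_step_count_steps)
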
Example 3
    hooks = [tf.train.StopAtStepHook(last_step=20)]
    with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
        while not sess.should_stop():
            starttime = time.time()
            _, _, step = sess.run([loss, train_op, global_step])
            endtime = time.time()
            tf.logging.info("[Iteration {} ], Time: {:.4} .".format(
                step, endtime - starttime))
    tf.logging.info("[Finished]")


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    config_json = {}
    if FLAGS.gc:
        config_json["gradient_checkpoint.type"] = "auto"
    if FLAGS.amp:
        config_json["amp.level"] = "o1"
        config_json["amp.loss_scale"] = 10000
        config_json["amp.debug_log"] = True
    if FLAGS.zero:
        config_json["zero.level"] = "v1"
    epl.init(epl.Config(config_json))
    if epl.Env.get().cluster.gpu_num_per_worker > 1:
        # Avoid NCCL hang.
        os.environ["NCCL_LAUNCH_MODE"] = "GROUP"
    epl.set_default_strategy(epl.replicate(device_count=1))
    run_model()
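
The entry point above toggles EPL memory optimizations (gradient checkpointing, automatic mixed precision, ZeRO) through command-line flags. A minimal sketch of how those flags might be declared with tf.app.flags; the flag names follow the snippet, while the descriptions and defaults are assumptions:

import tensorflow as tf

flags = tf.app.flags
flags.DEFINE_boolean("gc", False, "Enable automatic gradient checkpointing.")
flags.DEFINE_boolean("amp", False, "Enable O1 automatic mixed precision.")
flags.DEFINE_boolean("zero", False, "Enable ZeRO v1 optimizer-state sharding.")
FLAGS = flags.FLAGS
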
Example 4
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            "The width of the input tensor (%d) != hidden size (%d)" %
            (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    print(FLAGS.num_pipe_stages)
    num_layer_per_stage = num_hidden_layers // FLAGS.num_pipe_stages

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        # Open a new pipeline stage every `num_layer_per_stage` layers, unless
        # EPL auto parallelism is left to decide the placement itself.
        if (FLAGS.num_pipe_stages > 1
                and layer_idx % num_layer_per_stage == 0
                and not epl.Env.get().config.auto.auto_parallel):
            epl.set_default_strategy(epl.replicate(1))
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            with tf.variable_scope("attention"):
                attention_heads = []
                with tf.variable_scope("self"):
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=
                        attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just concatenate
                    # them to the self-attention head before the projection.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.variable_scope("output"):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(
                            initializer_range))
                    attention_output = dropout(attention_output,
                                               hidden_dropout_prob)
                    attention_output = layer_norm(attention_output +
                                                  layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
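
A brief usage sketch for transformer_model, assuming the BERT-style helpers it depends on (get_shape_list, attention_layer, create_initializer, dropout, layer_norm, reshape_to_matrix, reshape_from_matrix, gelu) and the FLAGS.num_pipe_stages flag are available from the surrounding module; the batch size and sequence length are illustrative:

# Embedded inputs: batch of 8, sequence length 128, hidden size 768.
input_tensor = tf.placeholder(tf.float32, shape=[8, 128, 768])
# 1 marks positions that may be attended to, 0 marks padding.
attention_mask = tf.ones([8, 128, 128], dtype=tf.int32)

sequence_output = transformer_model(
    input_tensor=input_tensor,
    attention_mask=attention_mask,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072)
# sequence_output has shape [8, 128, 768]: the final encoder layer.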