def run_model(total_gpu_num):
  """Train model."""
  # Data parallelism: replicate the input pipeline and the ResNet-50
  # feature extractor across all GPUs.
  with epl.replicate(total_gpu_num):
    iterator = get_mock_iterator()
    images, labels = iterator.get_next()
    features = resnet_v1.resnet_v1_50(images,
                                      num_classes=None,
                                      is_training=True)[0]
    features = tf.squeeze(features, [1, 2])
  # Tensor model parallelism: split the large classification layer and the
  # loss across all GPUs.
  with epl.split(total_gpu_num):
    logits = tf.layers.dense(features, class_num)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  global_step = tf.train.get_or_create_global_step()
  optimizer = tf.train.AdamOptimizer(learning_rate=0.9)
  train_op = optimizer.minimize(loss, global_step=global_step)

  hooks = [tf.train.StopAtStepHook(last_step=20)]
  with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
    while not sess.should_stop():
      starttime = time.time()
      _, _, step = sess.run([loss, train_op, global_step])
      endtime = time.time()
      tf.logging.info("[Iteration {} ], Time: {:.4} .".format(
          step, endtime - starttime))
  tf.logging.info("[Finished]")
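# The example above calls get_mock_iterator(), whose body is not shown here.
# A minimal sketch of what it might look like, assuming random images and
# labels with the input shapes ResNet-50 expects; batch_size and class_num
# are illustrative values, not taken from the original script.
import numpy as np

def get_mock_iterator(batch_size=32, class_num=1000):
  images = np.random.rand(batch_size, 224, 224, 3).astype(np.float32)
  labels = np.random.randint(0, class_num, size=(batch_size,)).astype(np.int32)
  dataset = tf.data.Dataset.from_tensors((images, labels)).repeat()
  return dataset.make_one_shot_iterator()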
def main(argv):
  config = epl.Config({"cluster.colocate_split_and_replicate": True})
  epl.init(config)
  FLAGS.worker_id = epl.Env.get().cluster.worker_index
  FLAGS.worker_gpu = epl.Env.get().cluster.total_gpu_num
  epl.set_default_strategy(epl.replicate(FLAGS.worker_gpu))

  # Create HParams.
  if argv:
    set_hparams_from_args(argv[1:])
  if FLAGS.schedule != "run_std_server":
    hparams = create_hparams()

  if FLAGS.schedule == "train":
    mlperf_log.transformer_print(key=mlperf_log.RUN_START)
  else:
    raise RuntimeError("Only training is supported for now; "
                       "define tasks for other schedules yourself.")

  trainer_lib.set_random_seed(FLAGS.random_seed)

  hparams.add_hparam("data_dir", FLAGS.data_dir)
  hparams.add_hparam("schedule", FLAGS.schedule)
  hparams.add_hparam("train_steps", FLAGS.train_steps)
  hparams.add_hparam("warm_start_from", None)
  trainer_lib.add_problem_hparams(hparams, FLAGS.problem)

  # Dataset generation.
  if FLAGS.generate_data:
    generate_data()

  def model_fn_replicate(features, labels, mode):
    model_fn = t2t_model.T2TModel.make_estimator_model_fn(FLAGS.model, hparams)
    return model_fn(features, labels, mode)

  if is_chief():
    save_metadata(hparams)

  estimator = tf.estimator.Estimator(model_fn=model_fn_replicate,
                                     config=create_run_config())
  hooks = []
  hooks.append(
      tf.train.StepCounterHook(every_n_steps=FLAGS.log_step_count_steps))
  optimize.log_variable_sizes(verbose=True)

  problem = hparams.problem
  train_input_fn = problem.make_estimator_input_fn(
      tf.estimator.ModeKeys.TRAIN, hparams)
  estimator.train(train_input_fn, max_steps=hparams.train_steps, hooks=hooks)
hooks = [tf.train.StopAtStepHook(last_step=20)]
with tf.train.MonitoredTrainingSession(hooks=hooks) as sess:
  while not sess.should_stop():
    starttime = time.time()
    _, _, step = sess.run([loss, train_op, global_step])
    endtime = time.time()
    tf.logging.info("[Iteration {} ], Time: {:.4} .".format(
        step, endtime - starttime))
tf.logging.info("[Finished]")


if __name__ == '__main__':
  tf.logging.set_verbosity(tf.logging.INFO)
  config_json = {}
  if FLAGS.gc:
    # Enable automatic gradient checkpointing.
    config_json["gradient_checkpoint.type"] = "auto"
  if FLAGS.amp:
    # Enable automatic mixed precision with a fixed loss scale.
    config_json["amp.level"] = "o1"
    config_json["amp.loss_scale"] = 10000
    config_json["amp.debug_log"] = True
  if FLAGS.zero:
    # Enable ZeRO memory optimization.
    config_json["zero.level"] = "v1"
  epl.init(epl.Config(config_json))
  if epl.Env.get().cluster.gpu_num_per_worker > 1:
    # Avoid NCCL hang.
    os.environ["NCCL_LAUNCH_MODE"] = "GROUP"
  epl.set_default_strategy(epl.replicate(device_count=1))
  run_model()
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from "Attention is All You Need".

  This is almost an exact implementation of the original Transformer encoder.

  See the original paper:
  https://arxiv.org/abs/1706.03762

  Also see:
  https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 in
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_probs_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.

  Raises:
    ValueError: A Tensor shape or parameter is invalid.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
        "The hidden size (%d) is not a multiple of the number of attention "
        "heads (%d)" % (hidden_size, num_attention_heads))

  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  # The Transformer performs sum residuals on all layers so the input needs
  # to be the same as the hidden size.
  if input_width != hidden_size:
    raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                     (input_width, hidden_size))

  # We keep the representation as a 2D tensor to avoid re-shaping it back and
  # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
  # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
  # help the optimizer.
  prev_output = reshape_to_matrix(input_tensor)

  print(FLAGS.num_pipe_stages)
  num_layer_per_stage = num_hidden_layers // FLAGS.num_pipe_stages
  all_layer_outputs = []
  for layer_idx in range(num_hidden_layers):
    # Every `num_layer_per_stage` layers, enter a new epl.replicate scope to
    # mark the beginning of the next pipeline stage (skipped when EPL's auto
    # parallelism is enabled).
    if (FLAGS.num_pipe_stages > 1 and layer_idx % num_layer_per_stage == 0 and
        not epl.Env.get().config.auto.auto_parallel):
      epl.set_default_strategy(epl.replicate(1))
    with tf.variable_scope("layer_%d" % layer_idx):
      layer_input = prev_output

      with tf.variable_scope("attention"):
        attention_heads = []
        with tf.variable_scope("self"):
          attention_head = attention_layer(
              from_tensor=layer_input,
              to_tensor=layer_input,
              attention_mask=attention_mask,
              num_attention_heads=num_attention_heads,
              size_per_head=attention_head_size,
              attention_probs_dropout_prob=attention_probs_dropout_prob,
              initializer_range=initializer_range,
              do_return_2d_tensor=True,
              batch_size=batch_size,
              from_seq_length=seq_length,
              to_seq_length=seq_length)
          attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              hidden_size,
              kernel_initializer=create_initializer(initializer_range))
          attention_output = dropout(attention_output, hidden_dropout_prob)
          attention_output = layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=create_initializer(initializer_range))
        layer_output = dropout(layer_output, hidden_dropout_prob)
        layer_output = layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = reshape_from_matrix(prev_output, input_shape)
    return final_output
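# A minimal usage sketch (not part of the original example): running a random
# batch through the encoder above. It assumes FLAGS.num_pipe_stages is already
# defined (e.g. 1, i.e. no pipeline parallelism); with more than one stage,
# epl must also have been initialized, since the layer loop then reads the
# EPL environment. The shapes follow the docstring of transformer_model.
batch_size, seq_length = 8, 128
dummy_input = tf.random_uniform([batch_size, seq_length, 768],
                                dtype=tf.float32)
dummy_mask = tf.ones([batch_size, seq_length, seq_length], dtype=tf.int32)
sequence_output = transformer_model(dummy_input,
                                    attention_mask=dummy_mask,
                                    hidden_size=768,
                                    num_hidden_layers=12,
                                    num_attention_heads=12)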