def get_train_op_and_metrics(loss, params):
  """Generate training op and metrics to save in TensorBoard."""
  with tf.variable_scope("get_train_op"):
    learning_rate = get_learning_rate(
        learning_rate=params["learning_rate"],
        hidden_size=params["hidden_size"],
        learning_rate_warmup_steps=params["learning_rate_warmup_steps"])

    # Create optimizer. Use LazyAdamOptimizer from TF contrib, which is faster
    # than the TF core Adam optimizer.
    from tensorflow.contrib import opt as contrib_opt  # pylint: disable=g-import-not-at-top
    optimizer = contrib_opt.LazyAdamOptimizer(
        learning_rate,
        beta1=params["optimizer_adam_beta1"],
        beta2=params["optimizer_adam_beta2"],
        epsilon=params["optimizer_adam_epsilon"])

    if params["use_tpu"] and params["tpu"] != tpu_util.LOCAL:
      optimizer = tf.compat.v1.tpu.CrossShardOptimizer(optimizer)

    if params["distribution_strategy"] == "horovod":
      import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
      optimizer = hvd.DistributedOptimizer(optimizer)

    if params["distribution_strategy"] == "byteps":
      import byteps.tensorflow as bps  # pylint: disable=g-import-not-at-top
      optimizer = bps.DistributedOptimizer(optimizer)

    # Uses automatic mixed precision FP16 training if on GPU.
    if params["dtype"] == "fp16":
      optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
          optimizer)

    # Calculate and apply gradients using LazyAdamOptimizer.
    global_step = tf.train.get_global_step()
    tvars = tf.trainable_variables()
    gradients = optimizer.compute_gradients(
        loss, tvars, colocate_gradients_with_ops=True)
    minimize_op = optimizer.apply_gradients(
        gradients, global_step=global_step, name="train")
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(minimize_op, update_ops)

    train_metrics = {"learning_rate": learning_rate}

    if not params["use_tpu"]:
      # gradient norm is not included as a summary when running on TPU, as
      # it can cause instability between the TPU and the host controller.
      gradient_norm = tf.global_norm(list(zip(*gradients))[0])
      train_metrics["global_norm/gradient_norm"] = gradient_norm

    return train_op, train_metrics
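# The function above calls get_learning_rate(), which is defined elsewhere in
# the project. The sketch below is an assumption, not the project's exact
# code: it implements the standard Transformer schedule (linear warmup, then
# inverse-square-root decay, scaled by hidden_size ** -0.5) and assumes the
# TF 1.x `tf` module already imported by the surrounding script.
def get_learning_rate(learning_rate, hidden_size, learning_rate_warmup_steps):
  """Sketch: Transformer learning rate with linear warmup and rsqrt decay."""
  with tf.name_scope("learning_rate"):
    warmup_steps = tf.cast(learning_rate_warmup_steps, tf.float32)
    step = tf.cast(tf.train.get_or_create_global_step(), tf.float32)

    # Scale the base rate by the model dimension, as in the Transformer paper.
    learning_rate *= (hidden_size ** -0.5)
    # Linear warmup for the first learning_rate_warmup_steps steps.
    learning_rate *= tf.minimum(1.0, step / warmup_steps)
    # Inverse-square-root decay afterwards.
    learning_rate *= tf.math.rsqrt(tf.maximum(step, warmup_steps))
    return learning_rate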
def main(_):
    # BytePS: initialize BytePS.
    bps.init()

    # Keras automatically creates a cache directory in ~/.keras/datasets for
    # storing the downloaded MNIST data. This creates a race
    # condition among the workers that share the same filesystem. If the
    # directory already exists by the time this worker gets around to creating
    # it, ignore the resulting exception and continue.
    cache_dir = os.path.join(os.path.expanduser('~'), '.keras', 'datasets')
    if not os.path.exists(cache_dir):
        try:
            os.mkdir(cache_dir)
        except OSError as e:
            if e.errno == errno.EEXIST and os.path.isdir(cache_dir):
                pass
            else:
                raise

    # Download and load MNIST dataset.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('MNIST-data-%d' % bps.rank())

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape it
    # into (-1, 784) to feed into our network. Also, need to normalize the
    # features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # BytePS: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * bps.size())

    # BytePS: add BytePS Distributed Optimizer.
    opt = bps.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # BytePS: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        bps.BroadcastGlobalVariablesHook(0),

        # BytePS: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=200000 // bps.size()),

        tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                                   every_n_iter=10),
    ]

    # BytePS: pin GPU to be used to process local rank (one GPU per process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(bps.local_rank())

    # BytePS: save checkpoints only on worker 0 to prevent other workers from
    # corrupting them.
    checkpoint_dir = './checkpoints' if bps.rank() == 0 else None
    training_batch_generator = train_input_generator(x_train, y_train,
                                                     batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})
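# main() above depends on two helpers defined elsewhere in the script:
# conv_model() and train_input_generator(). The generator sketch below is an
# assumption rather than the exact upstream code: it reshuffles the training
# set on each pass and yields fixed-size (features, labels) batches forever,
# which is what the `next(training_batch_generator)` call in the training
# loop expects. It assumes `np` is NumPy, as imported by the script.
def train_input_generator(x_train, y_train, batch_size=64):
    """Yield shuffled (features, labels) mini-batches indefinitely."""
    assert len(x_train) == len(y_train)
    while True:
        # Reshuffle once per pass over the data.
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield (x_train[index:index + batch_size],
                   y_train[index:index + batch_size])
            index += batch_size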
config.gpu_options.visible_device_list = ''

if args.eager:
    tf.enable_eager_execution(config)

# Set up standard model.
# Check https://github.com/keras-team/keras-applications for all supported
# models, e.g., ResNet50, VGG16.
model = getattr(applications, args.model)(weights=None)
opt = tf.train.GradientDescentOptimizer(0.01)

# BytePS: (optional) compression algorithm.
compression = bps.Compression.fp16 if args.fp16_pushpull else bps.Compression.none

# BytePS: wrap optimizer with DistributedOptimizer.
opt = bps.DistributedOptimizer(opt, compression=compression)

init = tf.global_variables_initializer()
bcast_op = bps.broadcast_global_variables(0)

data = tf.random_uniform([args.batch_size, 224, 224, 3])
target = tf.random_uniform([args.batch_size, 1],
                           minval=0, maxval=999,
                           dtype=tf.int64)


def loss_function():
    logits = model(data, training=True)
    return tf.losses.sparse_softmax_cross_entropy(target, logits)


def log(s, nl=True):
    # Only rank 0 prints, so multi-worker runs produce a single log stream.
    if bps.rank() != 0:
        return
    print(s, end='\n' if nl else '')
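# The fragment above builds the model, loss, and broadcast op but stops before
# the measurement loop. The driver below is a sketch of how such a synthetic
# benchmark is typically run, not the original script's code: the argument
# names (num_warmup_batches, num_batches_per_iter, num_iters) and the
# benchmark_step callable (e.g. lambda: session.run(train_op) in graph mode)
# are assumptions for illustration.
import timeit


def run_benchmark(benchmark_step, batch_size, num_warmup_batches=10,
                  num_batches_per_iter=10, num_iters=10):
    # Warm-up steps are excluded from timing so one-off graph construction
    # and allocation costs do not skew the result.
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=num_warmup_batches)

    # Time fixed-size groups of steps and report throughput in images/sec.
    log('Running benchmark...')
    img_secs = []
    for i in range(num_iters):
        elapsed = timeit.timeit(benchmark_step, number=num_batches_per_iter)
        img_sec = batch_size * num_batches_per_iter / elapsed
        log('Iter #%d: %.1f img/sec per GPU' % (i, img_sec))
        img_secs.append(img_sec)
    log('Avg img/sec per GPU: %.1f' % (sum(img_secs) / len(img_secs)))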
def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
  """Creates an optimizer training op."""
  global_step = tf.train.get_or_create_global_step()

  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)

  # Implements linear decay of the learning rate.
  learning_rate = tf.train.polynomial_decay(
      learning_rate,
      global_step,
      num_train_steps,
      end_learning_rate=0.0,
      power=1.0,
      cycle=False)

  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
  # learning rate will be `global_step/num_warmup_steps * init_lr`.
  if num_warmup_steps:
    global_steps_int = tf.cast(global_step, tf.int32)
    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)

    global_steps_float = tf.cast(global_steps_int, tf.float32)
    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)

    warmup_percent_done = global_steps_float / warmup_steps_float
    warmup_learning_rate = init_lr * warmup_percent_done

    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
    learning_rate = ((1.0 - is_warmup) * learning_rate +
                     is_warmup * warmup_learning_rate)

  # It is recommended that you use this optimizer for fine-tuning, since this
  # is how the model was trained (note that the Adam m/v variables are NOT
  # loaded from init_checkpoint).
  optimizer = AdamWeightDecayOptimizer(
      learning_rate=learning_rate,
      weight_decay_rate=0.01,
      beta_1=0.9,
      beta_2=0.999,
      epsilon=1e-6,
      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])

  if os.environ.get("USE_BYTEPS") and os.environ.get(
      "USE_BYTEPS").upper() in ["1", "TRUE", "Y"]:
    print("=================USING DISTRIBUTED OPTIMIZER=================")
    optimizer = bps.DistributedOptimizer(optimizer)

  tvars = tf.trainable_variables()
  # compute_gradients returns (gradient, variable) pairs; extract the
  # gradients so clip_by_global_norm receives plain tensors.
  grads_and_vars = optimizer.compute_gradients(loss, tvars)
  grads = [grad for grad, _ in grads_and_vars]

  # This is how the model was pre-trained.
  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)

  trace_dir = os.path.join(os.environ.get("BYTEPS_TRACE_DIR", "."),
                           str(bps.local_rank()))
  dump_computation_graph(trace_dir)

  train_op = optimizer.apply_gradients(
      zip(grads, tvars), global_step=global_step)

  # Normally the global step update is done inside of `apply_gradients`.
  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
  # a different optimizer, you should probably take this line out.
  new_global_step = global_step + 1
  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
  return train_op
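# AdamWeightDecayOptimizer (defined elsewhere in this file) applies decoupled
# weight decay after the Adam update direction is computed, skipping any
# variable whose name matches an entry in exclude_from_weight_decay (here
# LayerNorm and bias parameters). The helper below is a sketch of that name
# check in the style of the BERT reference implementation; it is included for
# illustration and is not guaranteed to match the project's exact code.
import re


def _do_use_weight_decay(param_name, weight_decay_rate,
                         exclude_from_weight_decay):
  """Return True if L2-style weight decay should be applied to `param_name`."""
  if not weight_decay_rate:
    return False
  if exclude_from_weight_decay:
    for pattern in exclude_from_weight_decay:
      # A substring/regex match on the variable name disables decay for it.
      if re.search(pattern, param_name) is not None:
        return False
  return True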