def _step(self, samples, labels, first_batch):
    self._lr_scheduler()

    with tf.GradientTape() as tape:
        probs = self._model(samples, training=True)
        loss = self._loss_fn(labels, probs)
        if self._amp:
            loss = self._embedding_optimizer.get_scaled_loss(loss)

    embedding_vars, dense_vars = sok.split_embedding_variable_from_others(self._model.trainable_variables)
    embedding_grads, dense_grads = tape.gradient(loss, [embedding_vars, dense_vars])
    if self._amp:
        embedding_grads = self._embedding_optimizer.get_unscaled_gradients(embedding_grads)
        dense_grads = self._embedding_optimizer.get_unscaled_gradients(dense_grads)

    # embedding_grads = [scale_grad(g, hvd.size()) for g in embedding_grads]
    with sok.OptimizerScope(embedding_vars):
        self._embedding_optimizer.apply_gradients(zip(embedding_grads, embedding_vars),
                                                  experimental_aggregate_gradients=False)

    # with tf.control_dependencies(embedding_grads):
    dense_grads = [hvd.allreduce(grad, op=hvd.Average, compression=hvd.compression.NoneCompressor)
                   for grad in dense_grads]
    self._dense_optimizer.apply_gradients(zip(dense_grads, dense_vars),
                                          experimental_aggregate_gradients=False)

    if first_batch:
        hvd.broadcast_variables(dense_vars, root_rank=0)
        hvd.broadcast_variables(self._dense_optimizer.variables(), root_rank=0)

    return loss

def train_step(features, labels, warmup_batch=False):
    with tf.GradientTape() as tape:
        output_map = model(features)
        crossentropy_loss, dice_loss = partial_losses(output_map, labels)
        added_losses = tf.add(crossentropy_loss, dice_loss, name="total_loss_ref")
        loss = added_losses + params.weight_decay * tf.add_n(
            [tf.nn.l2_loss(v) for v in model.trainable_variables
             if 'batch_normalization' not in v.name])
        if params.use_amp:
            loss = optimizer.get_scaled_loss(loss)

    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss, model.trainable_variables)
    if params.use_amp:
        gradients = optimizer.get_unscaled_gradients(gradients)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if warmup_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    ce_loss(crossentropy_loss)
    f1_loss(dice_loss)
    return loss

def train_step(images, masks, first_batch=False):
    with tf.GradientTape() as tape:
        predicted = model(images)
        predicted = predicted[:, clip_offset:-clip_offset, clip_offset:-clip_offset]
        masks = masks[:, clip_offset:-clip_offset, clip_offset:-clip_offset]
        loss = bce(masks, predicted)

    train_loss_metric.update_state(loss)
    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    return predicted, masks

def train_step(self, data):
    """Perform a single training step."""
    x, beta = data
    start = time.time()
    with tf.GradientTape() as tape:
        states, accept_prob, sumlogdet = self((x, beta), training=True)
        loss = self.calc_losses(states, accept_prob)

        if self.aux_weight > 0:
            z = tf.random.normal(x.shape, dtype=x.dtype)
            states_, accept_prob_, _ = self((z, beta), training=True)
            loss_ = self.calc_losses(states_, accept_prob_)
            loss += loss_

    if NUM_RANKS > 1:
        tape = hvd.DistributedGradientTape(tape)

    grads = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(grads, self.trainable_variables))

    metrics = AttrDict({
        'dt': time.time() - start,
        'loss': loss,
        'accept_prob': accept_prob,
        'eps': self.eps,
        'beta': states.init.beta,
        'sumlogdet': sumlogdet.out,
    })

    # optimizer.iterations equals 1 right after the first apply_gradients call,
    # so this broadcast runs exactly once, after the first training step.
    if self.optimizer.iterations == 1 and NUM_RANKS > 1:
        hvd.broadcast_variables(self.variables, root_rank=0)
        hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

    return states.out.x, metrics

def train_step(inputs_tr, targets_tr, first_batch):
    print("Tracing update_step")
    print("inputs nodes", inputs_tr.nodes.shape)
    print("inputs edges", inputs_tr.edges.shape)
    print("input n_node", inputs_tr.n_node.shape)
    print(inputs_tr.nodes)

    with tf.GradientTape() as tape:
        outputs_tr = model(inputs_tr, num_processing_steps_tr, is_training=True)
        loss_ops_tr = loss_fcn(targets_tr, outputs_tr)
        loss_op_tr = tf.math.reduce_sum(loss_ops_tr) / tf.constant(
            num_processing_steps_tr, dtype=tf.float32)

    # Horovod: add Horovod Distributed GradientTape.
    if args.distributed:
        tape = hvd.DistributedGradientTape(tape)

    gradients = tape.gradient(loss_op_tr, model.trainable_variables)
    optimizer.apply(gradients, model.trainable_variables)

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if args.distributed and first_batch:
        hvd.broadcast_variables(model.trainable_variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables, root_rank=0)

    return loss_op_tr

def train_step(inputs, first_batch):
    images, labels = inputs
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_func(labels, predictions)
        loss += tf.reduce_sum(model.losses)
        loss_copy = loss
        # Scale the losses
        if precision == 'fp16':
            loss = loss * tf.cast(loss_scale, loss.dtype)

    tape = hvd.DistributedGradientTape(tape)
    old_grads = tape.gradient(loss, model.trainable_variables)

    # Unscale the grads
    if precision == 'fp16':
        loss_scale_reciprocal = 1. / loss_scale
        grads = [g * tf.cast(loss_scale_reciprocal, g.dtype) if g is not None else None
                 for g in old_grads]
    else:
        grads = old_grads

    opt.apply_gradients(zip(grads, model.trainable_variables))
    train_top1.update_state(labels, predictions)
    train_top5.update_state(labels, predictions)

    if hvd.size() > 1 and first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_copy

def training_step(images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = cifar10_model(images, training=True)
        loss_value = loss(labels, probs)

    # Horovod: add Horovod Distributed GradientTape.
    try:
        tape = hvd.DistributedGradientTape(tape)
    except Exception:
        print("no horovod")

    grads = tape.gradient(loss_value, cifar10_model.trainable_variables)
    opt.apply_gradients(zip(grads, cifar10_model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        try:
            hvd.broadcast_variables(cifar10_model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)
        except Exception:
            print("no horovod")

    return loss_value

def train_step(model, opt, loss_func, images, labels, first_batch, batch_size,
               mixup_alpha=0.0, fp32=False):
    images, labels = mixup(batch_size, mixup_alpha, images, labels)
    with tf.GradientTape() as tape:
        logits = model(images, training=True)
        loss_value = loss_func(labels, tf.cast(logits, tf.float32))
        loss_value += tf.add_n(model.losses)
        if not fp32:
            scaled_loss_value = opt.get_scaled_loss(loss_value)

    tape = hvd.DistributedGradientTape(tape, compression=hvd.Compression.fp16)
    if not fp32:
        grads = tape.gradient(scaled_loss_value, model.trainable_variables)
        grads = opt.get_unscaled_gradients(grads)
    else:
        grads = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))

    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    probs = layers.Activation('softmax', dtype='float32')(logits)
    top_1_pred = tf.squeeze(tf.math.top_k(probs, k=1)[1])
    sparse_labels = tf.cast(tf.math.argmax(labels, axis=1), tf.int32)
    top_1_accuracy = tf.math.reduce_sum(tf.cast(tf.equal(top_1_pred, sparse_labels), tf.int32))
    return loss_value, top_1_accuracy

def train(self, dataset, total_batches=-1):
    """Update the model for one epoch."""
    train_step = self.train_step
    if self.hparams.enable_tf_function:
        logging.info("enabling tf.function; the first trace may take a while, please be patient ...")
        train_step = tf.function(train_step, input_signature=self.sample_signature)

    for batch, samples in enumerate(dataset.take(total_batches)):
        # train 1 step
        samples = self.model.prepare_samples(samples)
        loss, metrics = train_step(samples)

        # Horovod: broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        #
        # Note: broadcast should be done after the first gradient step to ensure optimizer
        # initialization.
        if batch == 0:
            hvd.broadcast_variables(self.model.trainable_variables, root_rank=0)
            hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

        if batch % self.hparams.log_interval == 0 and hvd.rank() == 0:
            logging.info(self.metric_checker(loss, metrics))
            self.model.reset_metrics()

def _train_step(inputs, labels, first_batch):
    with tf.GradientTape() as tape:
        logit, all_vectors = model(inputs, training=True)
        replica_loss = _replica_loss(labels, logit)
        if args.mixed_precision:
            _loss = emb_opt.get_scaled_loss(replica_loss)
        else:
            _loss = replica_loss

    emb_var, other_var = sok.split_embedding_variable_from_others(model.trainable_variables)
    emb_grads, grads = tape.gradient(_loss, [emb_var, other_var])
    if args.mixed_precision:
        emb_grads = emb_opt.get_unscaled_gradients(emb_grads)
        grads = emb_opt.get_unscaled_gradients(grads)

    if "plugin" not in args.optimizer:
        with sok.OptimizerScope(emb_var):
            emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                    experimental_aggregate_gradients=False)
    else:
        emb_opt.apply_gradients(zip(emb_grads, emb_var),
                                experimental_aggregate_gradients=False)

    with tf.control_dependencies(emb_grads):
        grads = [hvd.allreduce(grad) for grad in grads]

    dense_opt.apply_gradients(zip(grads, other_var))

    if first_batch:
        hvd.broadcast_variables(other_var, root_rank=0)
        hvd.broadcast_variables(dense_opt.variables(), root_rank=0)

    total_loss = hvd.allreduce(replica_loss)
    return total_loss, all_vectors

def benchmark_step(first_batch):
    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: use DistributedGradientTape
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.sparse_categorical_crossentropy(target, probs)

        if args.use_amp:
            loss = opt.get_scaled_loss(loss)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, compression=compression)

    gradients = tape.gradient(loss, model.trainable_variables)
    if args.use_amp:
        gradients = opt.get_unscaled_gradients(gradients)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

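# Not part of the snippet above: a minimal sketch of how a benchmark_step like this is
# typically driven, in the spirit of Horovod's synthetic benchmarks. The argument names
# (args.num_iters, args.num_batches_per_iter, args.batch_size) are placeholder assumptions.
import timeit

def run_benchmark():
    benchmark_step(first_batch=True)  # first call also performs the rank-0 broadcast
    for _ in range(args.num_iters):
        dt = timeit.timeit(lambda: benchmark_step(first_batch=False),
                           number=args.num_batches_per_iter)
        if hvd.rank() == 0:
            img_sec = args.batch_size * args.num_batches_per_iter / dt
            print('%.1f img/sec per rank on %d ranks' % (img_sec, hvd.size()))
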
def train_step(model, inputs, loss, amp, opt, init):
    with tf.GradientTape() as tape:
        [input_ids, input_mask, segment_ids, label_ids] = inputs
        # print(input_ids, input_ids.shape)
        outputs = model(
            input_ids,
            # input_ids=input_ids,
            attention_mask=input_mask,
            token_type_ids=segment_ids,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            training=True,
        )
        loss_value = loss(y_true=label_ids, y_pred=outputs[0])
        unscaled_loss = tf.stop_gradient(loss_value)
        if amp:
            loss_value = opt.get_scaled_loss(loss_value)

    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    if amp:
        gradients = opt.get_unscaled_gradients(gradients)
    opt.apply_gradients(zip(gradients, model.trainable_variables))  # , clip_norm=1.0)

    if init:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return unscaled_loss, outputs  # , tape.gradient(loss_value, model.trainable_variables)

def train_step(model, inputs, loss, amp, opt, init, v2=False, loss_class=None, fp16=False, clip_norm=1.0):
    with tf.GradientTape() as tape:
        [input_ids, input_mask, segment_ids, start_positions, end_positions,
         cls_index, p_mask, is_impossible] = inputs

        if not v2:
            is_impossible = None

        start_logits, end_logits, cls_logits = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=segment_ids,
            start_positions=start_positions,
            end_positions=end_positions,
            cls_index=cls_index,
            p_mask=p_mask,
            is_impossible=is_impossible,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            training=True,
        )[0:3]

        # If we are on multi-GPU, squeeze the extra dimension.
        if len(start_positions.shape) > 1:
            start_positions = tf.squeeze(start_positions, axis=-1, name="squeeze_start_positions")
        if len(end_positions.shape) > 1:
            end_positions = tf.squeeze(end_positions, axis=-1, name="squeeze_end_positions")
        if is_impossible is not None and len(is_impossible.shape) > 1 and v2 and cls_logits is not None:
            is_impossible = tf.squeeze(is_impossible, axis=-1, name="squeeze_is_impossible")

        # Sometimes the start/end positions are outside our model inputs; we ignore these terms.
        ignored_index = start_logits.shape[1]
        start_positions = tf.clip_by_value(start_positions, 0, ignored_index, name="clip_start_positions")
        end_positions = tf.clip_by_value(end_positions, 0, ignored_index, name="clip_end_positions")

        start_loss = loss(y_true=start_positions, y_pred=tf.cast(start_logits, tf.float32))
        end_loss = loss(y_true=end_positions, y_pred=tf.cast(end_logits, tf.float32))
        loss_value = (start_loss + end_loss) / 2

        if v2:
            cls_loss_value = loss_class(y_true=is_impossible, y_pred=tf.cast(cls_logits, tf.float32))
            loss_value += cls_loss_value * 0.5

        unscaled_loss = tf.stop_gradient(loss_value)
        if amp:
            loss_value = opt.get_scaled_loss(loss_value)

    tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True,
                                       compression=Compression.fp16 if fp16 else Compression.none)
    gradients = tape.gradient(loss_value, model.trainable_variables)
    if amp:
        gradients = opt.get_unscaled_gradients(gradients)
    (gradients, _) = tf.clip_by_global_norm(gradients, clip_norm=clip_norm)
    opt.apply_gradients(zip(gradients, model.trainable_variables))  # , clip_norm=1.0)

    if init:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return unscaled_loss  # , outputs#, tape.gradient(loss_value, model.trainable_variables)

def main(_):
    # Horovod: initialize Horovod.
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    config = tf.ConfigProto()
    config.gpu_options.visible_device_list = str(hvd.local_rank())
    tf.enable_eager_execution(config=config)

    mnist_model = tf.keras.Sequential([
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.Conv2D(16, [3, 3], activation='relu'),
        tf.keras.layers.GlobalAveragePooling2D(),
        tf.keras.layers.Dense(10)
    ])

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    (mnist_images, mnist_labels), _ = tf.keras.datasets.mnist.load_data()

    dataset = tf.data.Dataset.from_tensor_slices(
        (tf.cast(mnist_images[..., tf.newaxis] / 255, tf.float32),
         tf.cast(mnist_labels, tf.int64)))
    dataset = dataset.shuffle(1000).batch(32)

    # Horovod: save checkpoints only on worker 0 to prevent other workers from
    # corrupting it.
    checkpoint_dir = './checkpoints'
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tf.train.Checkpoint(model=mnist_model, optimizer=opt, step_counter=step_counter)

    # Horovod: adjust number of steps based on number of GPUs.
    for (batch, (images, labels)) in enumerate(dataset.take(20000 // hvd.size())):
        with tf.GradientTape() as tape:
            logits = mnist_model(images, training=True)
            loss_value = tf.losses.sparse_softmax_cross_entropy(labels, logits)

        # Horovod: broadcast initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        if batch == 0:
            hvd.broadcast_variables(mnist_model.variables, root_rank=0)

        # Horovod: add Horovod Distributed GradientTape.
        tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, mnist_model.variables)
        opt.apply_gradients(zip(grads, mnist_model.variables),
                            global_step=tf.train.get_or_create_global_step())

        if batch % 10 == 0 and hvd.local_rank() == 0:
            print('Step #%d\tLoss: %.6f' % (batch, loss_value))

    if hvd.rank() == 0:
        checkpoint.save(checkpoint_dir)

def train_first_step(inputs):
    images, labels = inputs
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss(labels, probs)

    tape = hvd_tf.DistributedGradientTape(tape, compression=compression)
    grads = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))

    hvd_tf.broadcast_variables(model.variables, root_rank=0)
    hvd_tf.broadcast_variables(opt.variables(), root_rank=0)

def train_step(model, opt, loss_func, images, labels, first_batch):
    with tf.GradientTape() as tape:
        probs = model(images, training=True)
        loss_value = loss_func(labels, probs)

    tape = hvd.DistributedGradientTape(tape, compression=hvd.Compression.fp16)
    grads = tape.gradient(loss_value, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))

    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    return loss_value

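# Not from the original sources: a minimal sketch of how a step function like the one
# above is usually wired up under Horovod. build_model() and build_dataset() are
# placeholder assumptions; only the hvd.* and tf.* calls are standard API.
import tensorflow as tf
import horovod.tensorflow as hvd

hvd.init()

# Pin each process to a single GPU, keyed by local rank.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')

model = build_model()                 # placeholder
dataset = build_dataset().batch(32)   # placeholder
loss_func = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Horovod: scale the learning rate by the number of workers.
opt = tf.keras.optimizers.Adam(1e-3 * hvd.size())

step = tf.function(train_step)
for batch, (images, labels) in enumerate(dataset.take(1000 // hvd.size())):
    loss_value = step(model, opt, loss_func, images, labels, batch == 0)
    if batch % 10 == 0 and hvd.rank() == 0:
        print('step %d, loss %.4f' % (batch, loss_value))
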
def initialize(self, io_only=False):
    tf_trainer.initialize(self, io_only)

    # Here, we broadcast parameters from rank 0.
    # If the model was restored, this is correct. If not,
    # this syncs everything up.
    # print(bcast)
    hvd.broadcast_variables(self._net.variables, root_rank=0)
    hvd.broadcast_variables(self._opt.variables(), root_rank=0)

def train_one_step(config, model, optimizer, features, accumulator, first_step, take_step, clip_norm=1.0):
    # Forward and backward pass
    with tf.GradientTape() as tape:
        total_loss, eval_fn_inputs = model(features, is_training=True)
        unscaled_loss = tf.stop_gradient(total_loss)
        if config.amp:
            total_loss = optimizer.get_scaled_loss(total_loss)

    # Backpropagate gradients
    # tape = hvd.DistributedGradientTape(
    #     tape, sparse_as_dense=True,
    #     compression=Compression.fp16 if config.amp and config.fp16_compression else Compression.none)
    gradients = tape.gradient(total_loss, model.trainable_variables)

    # Get unscaled gradients if AMP
    if config.amp:
        gradients = optimizer.get_unscaled_gradients(gradients)

    # Accumulate gradients
    accumulator(gradients)

    # Need to call apply_gradients on the very first step irrespective of gradient accumulation.
    # This is required for the optimizer to build its states.
    if first_step or take_step:
        # All-reduce and clip the accumulated gradients
        allreduced_accumulated_gradients = [
            None if g is None else hvd.allreduce(
                g / tf.cast(config.gradient_accumulation_steps, g.dtype),
                compression=Compression.fp16 if config.amp and config.fp16_compression else Compression.none)
            for g in accumulator.gradients
        ]
        (clipped_accumulated_gradients, _) = tf.clip_by_global_norm(
            allreduced_accumulated_gradients, clip_norm=clip_norm)
        # Weight update
        optimizer.apply_gradients(zip(clipped_accumulated_gradients, model.trainable_variables))
        accumulator.reset()

    # Broadcast model weights after the first train step
    if first_step:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    return unscaled_loss, eval_fn_inputs

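# Not from the original sources: an illustrative, eager-mode sketch of the accumulator
# interface assumed by train_one_step above (__call__ to add gradients, .gradients to
# read them, .reset() to clear). The real object is presumably a more careful
# implementation (e.g. a GradientAccumulator utility); this is only a sketch.
import tensorflow as tf

class MinimalGradientAccumulator:
    def __init__(self):
        self._grads = None  # lazily created tf.Variables mirroring the gradient shapes

    @property
    def gradients(self):
        return list(self._grads) if self._grads is not None else []

    def __call__(self, gradients):
        if self._grads is None:
            self._grads = [
                tf.Variable(tf.zeros_like(g), trainable=False) if g is not None else None
                for g in gradients
            ]
        for acc, g in zip(self._grads, gradients):
            if acc is not None and g is not None:
                acc.assign_add(g)

    def reset(self):
        for acc in self._grads or []:
            if acc is not None:
                acc.assign(tf.zeros_like(acc))
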
def train_step(images, labels, first_batch):
    gradients, predictions, loss = get_grads(images, labels, first_batch)
    gradients = [hvd.allreduce(g.reduce_mean()) for g in gradients]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    train_loss(loss.reduce_mean())
    train_accuracy(labels, predictions.merge())
    return loss.reduce_mean()

def train_step(self, first_epoch):
    epoch_global_norm = tf.TensorArray(
        tf.float32,
        size=self.params['dataloader']["number_of_elements"],
        dynamic_size=False,
        clear_after_read=False,
    )
    epoch_loss_avg = tf.TensorArray(
        tf.float32,
        size=self.params['dataloader']["number_of_elements"],
        dynamic_size=False,
        clear_after_read=False,
    )

    for element in self.train_dataset.enumerate():
        index = tf.dtypes.cast(element[0], tf.int32)
        sample = element[1]
        shape = [
            self.params['dataloader']['batch_size'],
            self.pixel_num,
            self.params['dataloader']['tomographic_bin_number']
        ]
        kappa_data = tf.boolean_mask(
            tf.transpose(sample[0], perm=[0, 2, 1]), self.bool_mask, axis=1)
        kappa_data = tf.ensure_shape(kappa_data, shape)
        labels = sample[1]

        # Add noise
        noise = tf.ensure_shape(self._make_noise(), shape)
        kappa_data = tf.math.add(kappa_data, noise)

        # Optimize the model
        with tf.GradientTape() as tape:
            loss_object = tf.keras.losses.MeanAbsoluteError()
            y_ = self.model(kappa_data, training=True)
            loss_value = loss_object(y_true=labels, y_pred=y_)

        if self.params['training']['distributed']:
            tape = hvd.DistributedGradientTape(tape)

        grads = tape.gradient(loss_value, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))

        if self.params['training']['distributed'] and index == 0 and first_epoch:
            hvd.broadcast_variables(self.model.variables, root_rank=0)
            hvd.broadcast_variables(self.optimizer.variables(), root_rank=0)

        epoch_loss_avg = epoch_loss_avg.write(index, loss_value)
        epoch_global_norm = epoch_global_norm.write(index, tf.linalg.global_norm(grads))

    return epoch_loss_avg.stack(), epoch_global_norm.stack()

def join_and_broadcast(self):
    hvd.join()
    if not self.args.benchmark:
        hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
        hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
        hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
        hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)

def train_GAN(_batch_size, _training_epochs, global_size):
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

    generator = Generator()
    random_input = numpy.random.uniform(-1, 1, [1, 100]).astype(numpy.float16)
    generated_image = generator(random_input)

    discriminator = Discriminator()
    classification = discriminator(generated_image)

    models = {
        "generator": generator,
        "discriminator": discriminator
    }

    opts = {
        "generator": tf.keras.optimizers.Adam(0.001),
        "discriminator": tf.keras.optimizers.RMSprop(0.0001)
    }

    if global_size != 1:
        hvd.broadcast_variables(generator.variables, root_rank=0)
        hvd.broadcast_variables(discriminator.variables, root_rank=0)
        hvd.broadcast_variables(opts['generator'].variables(), root_rank=0)
        hvd.broadcast_variables(opts['discriminator'].variables(), root_rank=0)

    train_loop(_batch_size, _training_epochs, models, opts, global_size)

def train_step(x, y, first_batch):
    with tf.GradientTape(persistent=True) as tape:
        y_pred = model(x, training=True)
        loss = compiled_loss(y, y_pred)
        linear_loss = wide_optimizer.get_scaled_loss(loss) if args.amp else loss
        deep_loss = deep_optimizer.get_scaled_loss(loss) if args.amp else loss

    if not args.cpu:
        tape = hvd.DistributedGradientTape(tape)

    for metric in metrics:
        metric.update_state(y, y_pred)

    linear_vars = model.linear_model.trainable_variables
    dnn_vars = model.dnn_model.trainable_variables
    linear_grads = tape.gradient(linear_loss, linear_vars)
    dnn_grads = tape.gradient(deep_loss, dnn_vars)
    if args.amp:
        linear_grads = wide_optimizer.get_unscaled_gradients(linear_grads)
        dnn_grads = deep_optimizer.get_unscaled_gradients(dnn_grads)

    wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
    deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))

    if first_batch and not args.cpu:
        hvd.broadcast_variables(model.linear_model.variables, root_rank=0)
        hvd.broadcast_variables(model.dnn_model.variables, root_rank=0)
        hvd.broadcast_variables(wide_optimizer.variables(), root_rank=0)
        hvd.broadcast_variables(deep_optimizer.variables(), root_rank=0)

    return loss

def __call__(self, x, y):
    with tf.GradientTape(persistent=True) as tape:
        y_pred = self.model(x, training=True)
        loss = self.compiled_loss(y, y_pred)
        linear_loss = (
            self.wide_optimizer.get_scaled_loss(loss) if self.args.amp else loss
        )
        deep_loss = (
            self.deep_optimizer.get_scaled_loss(loss) if self.args.amp else loss
        )

    if not self.args.cpu:
        tape = hvd.DistributedGradientTape(tape, sparse_as_dense=True)

    linear_vars = self.model.linear_model.trainable_variables
    dnn_vars = self.model.dnn_model.trainable_variables
    linear_grads = tape.gradient(linear_loss, linear_vars)
    dnn_grads = tape.gradient(deep_loss, dnn_vars)
    if self.args.amp:
        linear_grads = self.wide_optimizer.get_unscaled_gradients(linear_grads)
        dnn_grads = self.deep_optimizer.get_unscaled_gradients(dnn_grads)

    self.wide_optimizer.apply_gradients(zip(linear_grads, linear_vars))
    self.deep_optimizer.apply_gradients(zip(dnn_grads, dnn_vars))

    if self.current_step_var == 0:
        hvd.broadcast_variables(self.model.linear_model.variables, root_rank=0)
        hvd.broadcast_variables(self.model.dnn_model.variables, root_rank=0)
        hvd.broadcast_variables(self.wide_optimizer.variables(), root_rank=0)
        hvd.broadcast_variables(self.deep_optimizer.variables(), root_rank=0)

    return loss

def train_one_step(model, opt, x, y, step, loss_func, compression, opts):
    preprocess = PreProcess(opts)

    with tf.GradientTape(persistent=True) as tape:
        logits = model(x, training=True)
        loss = loss_func(y, logits)
        # scaled_loss = opt.get_scaled_loss(loss)

    # Horovod: add Horovod Distributed GradientTape.
    tape = hvd.DistributedGradientTape(tape, compression=compression, op=hvd.Average)  # ,device_sparse='/gpu:2', device_dense='/gpu:2')

    # scaled_gradients = tape.gradient(scaled_loss, model.trainable_variables)
    # grads = opt.get_unscaled_gradients(scaled_gradients)

    if opts.lr_scheduler == 'constant':
        lr = opts.base_lr
    elif opts.lr_scheduler == 'cosine':
        lr = cosine_decay_with_warmup(global_step=step,
                                      learning_rate_base=opts.base_lr,
                                      total_steps=opts.steps_per_epoch // 2,
                                      warmup_learning_rate=opts.warmup_learning_rate,
                                      warmup_steps=2 * hvd.size())
    elif opts.lr_scheduler == 'cyclic':
        lr = cyclic_learning_rate(global_step=step,
                                  base_lr=opts.min_lr,
                                  max_lr=opts.max_lr,
                                  step_size=opts.step_size,
                                  gamma=opts.gamma)
    else:
        raise NotImplementedError('Unsupported learning rate scheduling type')

    tf.keras.backend.set_value(opt.lr, lr)
    grads = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(grads, model.trainable_variables))

    if step == 0:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

    if not opts.evaluate:
        lr = cosine_decay_with_warmup(global_step=step,
                                      learning_rate_base=0.001,
                                      warmup_learning_rate=0.00001,
                                      total_steps=opts.steps_per_epoch // 1,
                                      warmup_steps=2 * hvd.size())
        opt = tf.keras.optimizers.SGD(learning_rate=lr * hvd.size(), momentum=0.9, nesterov=True)
        grads = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

    pred = tf.argmax(logits, axis=-1)
    del tape
    return loss, pred, opt

def train_step(self, y_sketch_gt, y_sketch_teacher, x_image, first_step):
    with tf.GradientTape() as tape:
        params = self.forward(y_sketch_teacher, x_image, training=True)[:-1]
        total_loss, pen_loss, offset_loss, pixel_loss, kl_loss = self.compute_loss(
            params, y_sketch_gt, x_image)

    if self._distributed:
        tape = hvd.DistributedGradientTape(tape)

    grads = tape.gradient(
        total_loss, self._encoder.trainable_variables + self._decoder.trainable_variables)
    self._optimizer.apply_gradients(
        zip(grads, self._encoder.trainable_variables + self._decoder.trainable_variables))

    if self._distributed and first_step:
        hvd.broadcast_variables(
            self._encoder.trainable_variables + self._decoder.trainable_variables, root_rank=0)
        hvd.broadcast_variables(self._optimizer.variables(), root_rank=0)

    return total_loss, pen_loss, offset_loss, pixel_loss, kl_loss

def on_batch_end(self, batch, logs=None):
    if self.broadcast_done:
        return

    with tf.device(self.device):
        if hvd._executing_eagerly() and hasattr(self.model, 'variables'):
            # TensorFlow 2.0 or TensorFlow eager
            hvd.broadcast_variables(self.model.variables, root_rank=self.root_rank)
            hvd.broadcast_variables(self.model.optimizer.variables(), root_rank=self.root_rank)
        else:
            bcast_op = hvd.broadcast_global_variables(self.root_rank)
            self.backend.get_session().run(bcast_op)

    self.broadcast_done = True

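# Context, not part of the snippet above: the same rank-0 broadcast is available out of
# the box through Horovod's stock Keras callback, which this method mirrors. A minimal
# usage sketch (model and dataset are placeholders):
import horovod.tensorflow.keras as hvd

hvd.init()
callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(root_rank=0)]
# model.fit(dataset, epochs=..., callbacks=callbacks,
#           verbose=1 if hvd.rank() == 0 else 0)
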
def benchmark_step(first_batch):
    with tf.GradientTape() as tape:
        probs = model(data, training=True)
        loss = tf.losses.categorical_crossentropy(target, probs)

    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure optimizer
    # initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

def benchmark_step(dataset_inputs, first_batch=False):
    x_input, y_label = dataset_inputs
    # Reshape the labels to a (batch_size, 1) column vector.
    y_label = tf.reshape(y_label, (-1, 1))

    with tf.GradientTape() as tape:
        prediction = model(x_input, training=True)
        loss = tf.losses.sparse_categorical_crossentropy(y_label, prediction)

    # Horovod: add Horovod Distributed GradientTape for reduction.
    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(loss, model.trainable_variables)
    opt.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: broadcast initial variable states from rank 0 to all other processes.
    # This is necessary to ensure consistent initialization of all workers when
    # training is started with random weights or restored from a checkpoint.
    #
    # Note: broadcast should be done after the first gradient step to ensure
    # optimizer initialization.
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(opt.variables(), root_rank=0)

def train_step(images, labels, first_batch):
    gradients, loss, predictions = get_grads(images, labels)

    # Rubik: Accumulate the gradients across microbatches
    # Horovod: Allreduce the accumulated gradients
    gradients = [hvd.allreduce(g.accumulate()) for g in gradients]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # Horovod: Broadcast the variables after first batch
    if first_batch:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    # Rubik: Average the loss across microbatches
    train_loss(loss.reduce_mean())

    # Rubik: Merge predictions across microbatches
    train_accuracy(labels, predictions.merge())

    return loss.reduce_mean()

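# Not from the original sources: a hedged sketch of what get_grads is assumed to look
# like in the two microbatched snippets above -- a step function decorated with
# SageMaker model parallelism's smp.step, whose returned values are StepOutput objects
# combined later via accumulate()/reduce_mean()/merge(). The import path and decorator
# are assumptions; model and loss_object are placeholders.
import tensorflow as tf
import smdistributed.modelparallel.tensorflow as smp  # assumed to be available

@smp.step
def get_grads(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images, training=True)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    return gradients, loss, predictions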