def _clip_gradients(self, grads): if not self._gradients_clip_option: return grads clipped_grads = [] if self._gradients_clip_option.clipnorm: for g in grads: if g is None: clipped_grads.append(g) else: clipped_grads.append( tf.clip_by_norm(g, self._gradients_clip_option.clipnorm)) return clipped_grads if self._gradients_clip_option.global_clipnorm: return tf.clip_by_global_norm( grads, self._gradients_clip_option.global_clipnorm)[0] if self._gradients_clip_option.clipvalue: for g in grads: if g is None: clipped_grads.append(g) else: clipped_grads.append( tf.clip_by_value( g, clip_value_min=( -self._gradients_clip_option.clipvalue), clip_value_max=self._gradients_clip_option. clipvalue)) return clipped_grads return grads
def _clip_gradients(self, grads): clipped_grads = [] if self.clipnorm and self.clipnorm > 0: for g in grads: if g is None: clipped_grads.append(g) else: clipped_grads.append(tf.clip_by_norm(g, self.clipnorm)) return clipped_grads if self.global_clipnorm and self.global_clipnorm > 0: return tf.clip_by_global_norm(grads, self.global_clipnorm)[0] if self.clipvalue and self.clipvalue > 0: for g in grads: if g is None: clipped_grads.append(g) else: clipped_grads.append( tf.clip_by_value( g, clip_value_min=-self.clipvalue, clip_value_max=self.clipvalue, )) return clipped_grads return grads
def _clip_gradients(self, grads): clipped_grads = [] if self.clipnorm and self.clipnorm > 0: for g in grads: if g is None: clipped_grads.append(g) else: clipped_grads.append(tf.clip_by_norm(g, self.clipnorm)) return clipped_grads if self.global_clipnorm and self.global_clipnorm > 0: return tf.clip_by_global_norm(grads, self.global_clipnorm)[0] if self.clipvalue and self.clipvalue > 0: for g in grads: if g is None: clipped_grads.append(g) else: clipped_grads.append( tf.clip_by_value( g, clip_value_min=-self.clipvalue, # pylint: disable=invalid-unary-operand-type clip_value_max=self.clipvalue)) return clipped_grads return grads
def step_fn(self, batch):
    """Per-Replica training step.

    Runs `self.model` on `batch` under a gradient tape, clips the gradients
    of `losses['total_loss']` by global norm (`self.grad_clip_norm`) and
    applies them with `self.optimizer`.

    Args:
        batch: Input forwarded unchanged to `self.model`.

    Returns:
        The losses object returned as the second element of the model call.
    """
    with tf.GradientTape() as tape:
        _, losses = self.model(batch, return_losses=True, training=True)

    # Clip and apply gradients.
    variables = self.model.trainable_variables
    raw_grads = tape.gradient(losses['total_loss'], variables)
    clipped_grads = tf.clip_by_global_norm(raw_grads, self.grad_clip_norm)[0]
    self.optimizer.apply_gradients(zip(clipped_grads, variables))
    return losses
def step_fn(batch):
    """Per-Replica training step.

    Calls `self.model` (from the enclosing scope) on `batch` in training
    mode, sums `self.model.losses` into a scalar, and applies the
    global-norm-clipped gradients of that scalar with `self.optimizer`.

    Args:
        batch: Input forwarded unchanged to `self.model`.

    Returns:
        `self.model.losses_dict` as it stands after the update.
    """
    with tf.GradientTape() as tape:
        # The forward pass is run for its effect on `self.model.losses`;
        # its return value is not used.
        self.model(batch, training=True)
        total_loss = tf.reduce_sum(self.model.losses)

    raw_grads = tape.gradient(total_loss, self.model.trainable_variables)
    clipped_grads = tf.clip_by_global_norm(raw_grads, self.grad_clip_norm)[0]
    self.optimizer.apply_gradients(
        zip(clipped_grads, self.model.trainable_variables))
    return self.model.losses_dict
def gradient_clipnorm_fn(grads_and_vars):
    """Clip gradients by their global norm, keeping each variable pairing.

    The clip threshold is `clipnorm`, taken from the enclosing scope.

    Args:
        grads_and_vars: Iterable of `(gradient, variable)` pairs.

    Returns:
        A list of `(clipped_gradient, variable)` pairs, in input order.
        An empty input yields an empty list.

    Raises:
        ValueError: If the current distribution strategy is a
            `CentralStorageStrategy` (either the v2 or the compat.v1
            experimental class).
    """
    if isinstance(tf.distribute.get_strategy(),
                  (tf.distribute.experimental.CentralStorageStrategy,
                   tf.compat.v1.distribute.experimental.CentralStorageStrategy)):
        raise ValueError(
            "`global_clipnorm` is not supported with `CentralStorageStrategy`")

    # Materialize once so that one-shot iterables (e.g. a `zip` object) can
    # be checked for emptiness and then unpacked.
    grads_and_vars = list(grads_and_vars)
    if not grads_and_vars:
        # `zip(*[])` produces nothing, so the two-name unpacking below would
        # raise ValueError on empty input.
        return []

    grads, variables = zip(*grads_and_vars)
    clipped_grads, _ = tf.clip_by_global_norm(grads, clipnorm)
    return list(zip(clipped_grads, variables))
def _get_gradients(self):
    """Calculate and apply gradients for this step.

    Differentiates the value of `self._normalized_nll()` with respect to
    `self.trainables`, optionally clips the gradients by global norm when
    `self.grad_clip` is truthy, and applies them with `self.optimizer`.

    Returns:
        A 3-tuple of the `_normalized_nll()` value, the (possibly clipped)
        gradients, and `abs(reduce_max(gradients))`.
    """
    with tf.GradientTape() as tape:
        normalized_nll = self._normalized_nll()
    gradients = tape.gradient(normalized_nll, self.trainables)

    if self.grad_clip:
        gradients = tf.clip_by_global_norm(gradients, self.grad_clip)[0]

    self.optimizer.apply_gradients(zip(gradients, self.trainables))

    # NOTE(review): this is the absolute value of the largest entry, not the
    # largest absolute value; confirm which one callers expect.
    largest_entry = tf.reduce_max(gradients)
    return normalized_nll, gradients, tf.math.abs(largest_entry)
def train_step(self, target_audio, f0, amp):
    """Run one optimizer update on `self.trainable_variables`.

    The objective `J` is the squared difference between the loss computed
    from `self.synth` and the loss computed from `self.estimate_spec`, plus
    a spectrogram MSE term weighted by `50 * 0.9 ** iterations`.

    Args:
        target_audio: Input passed to `self.spec_layer` for the target.
        f0: Value wrapped in a float32 `tf.Variable` and fed to both
            `self.synth` and `self.estimate_spec`.
        amp: Value wrapped in a float32 `tf.Variable`, used like `f0`.

    Returns:
        None. Gradients are clipped to a global norm of 0.1 and applied.
    """
    f0 = tf.Variable(f0, dtype=tf.float32)
    amp = tf.Variable(amp, dtype=tf.float32)

    # Reference loss from `self.synth`, computed outside the tape.
    synth_audio = self.synth(f0, amp)
    target_mag = self.spec_layer(target_audio)[:, :-100, :, :]
    true_synth_mag = self.spec_layer(synth_audio)[:, :-100, :, :]
    true_loss = self.get_loss(target_mag, true_synth_mag)

    with tf.GradientTape() as tape:
        estimated_synth_mag = self.estimate_spec(f0, amp)[:, :-100, :, :]
        estimated_loss = self.get_loss(target_mag, estimated_synth_mag)

        loss_gap = true_loss - estimated_loss
        end2end = tf.reduce_mean(tf.square(loss_gap))
        spec_gap = target_mag - estimated_synth_mag
        spec_mse = tf.reduce_mean(tf.square(spec_gap))

        # `.numpy()` presumably requires eager execution; confirm this is
        # never traced into a graph.
        step = self.optimizer.iterations.numpy()
        J = end2end + spec_mse * 50 * 0.9 ** step

    raw_grads = tape.gradient(J, self.trainable_variables)
    clipped_grads = tf.clip_by_global_norm(raw_grads, 0.1)[0]
    self.optimizer.apply_gradients(
        zip(clipped_grads, self.trainable_variables))
def update_model(self, x_pos, x_neg, t):
    """Update the model parameters in one iteration.

    Computes `self.diffusion.training_losses(...)`, differentiates the loss
    with respect to the diffusion model's trainable variables, optionally
    clips the gradients (when `self.hps.grad_clip` is truthy) to a global
    norm of `1 / num_device()[0]`, applies them with `self.opt`, and then
    calls `self.ema.apply(self.diffusion)`.

    Args:
        x_pos: Forwarded to `training_losses`.
        x_neg: Forwarded to `training_losses`.
        t: Forwarded to `training_losses`.

    Returns:
        `(loss, grads_mean, grads_max, loss_ts, f_ts)`, where `grads_mean`
        is the mean over variables of each gradient's mean absolute value
        and `grads_max` is the largest absolute gradient entry. Both are
        measured after clipping, if clipping ran.
    """
    with tf.GradientTape() as tape:
        tape.watch(self.diffusion.trainable_variables)
        loss, loss_ts, f_ts = self.diffusion.training_losses(
            x_pos, x_neg, t, dropout=self.hps.dropout)

    variables = self.diffusion.trainable_variables
    grads = tape.gradient(loss, variables)
    if self.hps.grad_clip:
        max_norm = 1. / float(num_device()[0])
        grads, _ = tf.clip_by_global_norm(grads, max_norm)
    grads_and_vars = list(zip(grads, variables))

    # Gradient statistics, one scalar per variable, stacked then reduced.
    per_var_mean = [tf.reduce_mean(tf.abs(grad)) for grad in grads]
    grads_mean = tf.reduce_mean(tf.stack(per_var_mean, axis=0))
    per_var_max = [tf.reduce_max(tf.abs(grad)) for grad in grads]
    grads_max = tf.reduce_max(tf.stack(per_var_max, axis=0))

    self.opt.apply_gradients(grads_and_vars)
    self.ema.apply(self.diffusion)

    return loss, grads_mean, grads_max, loss_ts, f_ts
def _step(self, trajectory: sequence.Trajectory):
    """Do a batch of SGD on actor + critic loss on a sequence of experience.

    Args:
        trajectory: Unpacked as `(observations, actions, rewards,
            discounts)`. The last observation is split off and used only
            for the bootstrap value.

    Returns:
        The network state returned by the final (bootstrap) network call.
    """
    observations, actions, rewards, discounts = trajectory

    # Add dummy batch dimensions.
    actions = tf.expand_dims(actions, axis=-1)  # [T, 1]
    rewards = tf.expand_dims(rewards, axis=-1)  # [T, 1]
    discounts = tf.expand_dims(discounts, axis=-1)  # [T, 1]
    observations = tf.expand_dims(observations, axis=1)  # [T+1, 1, ...]

    # Extract final observation for bootstrapping.
    final_observation = observations[-1]
    observations = observations[:-1]

    with tf.GradientTape() as tape:
        # Unroll the network over the sequence, then evaluate the final
        # observation from the resulting state.
        (logits, values), state = snt.dynamic_unroll(
            self._network, observations, self._rollout_initial_state)
        (_, bootstrap_value), state = self._network(final_observation, state)

        values = tf.squeeze(values, axis=-1)
        bootstrap_value = tf.squeeze(bootstrap_value, axis=-1)

        # Build actor and critic losses.
        critic_loss, (advantages, _) = trfl.td_lambda(
            state_values=values,
            rewards=rewards,
            pcontinues=self._discount * discounts,
            bootstrap_value=bootstrap_value,
            lambda_=self._td_lambda)
        actor_loss = trfl.discrete_policy_gradient_loss(
            logits, actions, advantages)
        entropy_loss = trfl.discrete_policy_entropy_loss(logits).loss

        loss = tf.reduce_mean(
            actor_loss + critic_loss + self._entropy_cost * entropy_loss)

    variables = self._network.trainable_variables
    gradients = tape.gradient(loss, variables)
    gradients = tf.clip_by_global_norm(gradients, 5.)[0]
    self._optimizer.apply(gradients, variables)

    return state
def _step(self, sequence: Sequence[tf.Tensor]):
    """Do a batch of SGD on actor + critic loss on a sequence of experience.

    Args:
        sequence: Unpacked as `(observations, actions, rewards, discounts,
            masks, final_obs, final_mask)`.

    Returns:
        The network state after the last of the `self._sequence_length`
        unrolled steps (the state produced by the bootstrap call is
        discarded).
    """
    (observations, actions, rewards, discounts, masks, final_obs,
     final_mask) = sequence
    masks = tf.expand_dims(masks, axis=-1)

    with tf.GradientTape() as tape:
        # Step the network through the sequence, threading the state.
        state = self._rollout_initial_state
        per_step_logits = []
        per_step_values = []
        for step in range(self._sequence_length):
            step_input = (observations[step], masks[step])
            (step_logits, step_value), state = self._network(
                step_input, state)
            per_step_logits.append(step_logits)
            per_step_values.append(step_value)
        (_, bootstrap_value), _ = self._network((final_obs, final_mask), state)

        values = tf.squeeze(tf.stack(per_step_values, axis=0), axis=-1)
        logits = tf.stack(per_step_logits, axis=0)
        bootstrap_value = tf.squeeze(bootstrap_value, axis=-1)

        # Build actor and critic losses.
        critic_loss, (advantages, _) = trfl.td_lambda(
            state_values=values,
            rewards=rewards,
            pcontinues=self._discount * discounts,
            bootstrap_value=bootstrap_value,
            lambda_=self._td_lambda)
        actor_loss = trfl.discrete_policy_gradient_loss(
            logits, actions, advantages)
        loss = tf.reduce_mean(actor_loss + critic_loss)

    variables = self._network.trainable_variables
    gradients = tape.gradient(loss, variables)
    gradients = tf.clip_by_global_norm(gradients, 5.)[0]
    self._optimizer.apply(gradients, variables)

    return state
def _compiled_local_step(inputs, labels, training_vars, accum_vars):
    """Replicated training step.

    Computes the loss of `model` on `inputs`/`labels`, takes gradients with
    respect to `training_vars` (going through loss scaling when `optimizer`
    is a `LossScaleOptimizer`), and clips them to a global norm of 1.0.
    `model`, `loss_fn`, `optimizer` and `num_accumulation_steps` come from
    the enclosing scope.

    Args:
        inputs: Model inputs.
        labels: Targets passed to `loss_fn`.
        training_vars: Variables to differentiate with respect to.
        accum_vars: Either `None`, or an indexable collection of running
            accumulators aligned with the gradients.

    Returns:
        `(grads_or_accum, loss, model_outputs, metric_outputs)`. The first
        element is the clipped gradients when `accum_vars` is `None`;
        otherwise it is a new list of
        `accum_vars[i] + grad_i / num_accumulation_steps`.
    """
    with tf.GradientTape() as tape:
        model_outputs, metric_outputs = model(inputs, training=True)
        loss = loss_fn(labels, model_outputs)

    uses_loss_scaling = isinstance(
        optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer)
    if not uses_loss_scaling:
        grads = tape.gradient(loss, training_vars)
    else:
        # Re-enter the tape so the loss scaling itself is recorded.
        with tape:
            scaled_loss = optimizer.get_scaled_loss(loss)
        scaled_grads = tape.gradient(scaled_loss, training_vars)
        grads = optimizer.get_unscaled_gradients(scaled_grads)
    grads = tf.clip_by_global_norm(grads, clip_norm=1.0)[0]

    if accum_vars is None:
        return grads, loss, model_outputs, metric_outputs

    new_accum_vars = [
        accum_vars[i] + tf.math.scalar_mul(1.0 / num_accumulation_steps, grad)
        for i, grad in enumerate(grads)
    ]
    return new_accum_vars, loss, model_outputs, metric_outputs
def grad(dy):
    """Return the global-norm-clipped `dy` plus a zero second gradient.

    `dy` is clipped on its own (as a one-element list) against `clip_norm`
    from the enclosing scope.
    """
    clipped, _ = tf.clip_by_global_norm([dy], clip_norm)
    # NOTE: Must return a gradient for all inputs to `clip_gradient`.
    return clipped[0], tf.constant(0.)
def eager_train_step(detection_model,
                     features,
                     labels,
                     unpad_groundtruth_tensors,
                     optimizer,
                     learning_rate,
                     add_regularization_loss=True,
                     clip_gradients_value=None,
                     global_step=None,
                     num_replicas=1.0):
    """Process a single training batch.

    This method computes the loss for the model on a single training batch,
    while tracking the gradients with a gradient tape. It then updates the
    model variables with the optimizer, clipping the gradients if
    clip_gradients_value is present.

    This method can run eagerly or inside a tf.function.

    Args:
        detection_model: A DetectionModel (based on Keras) to train.
        features: Dictionary of feature tensors from the input dataset.
            Should be in the format output by `inputs.train_input`.
            features[fields.InputDataFields.image] is a
                [batch_size, H, W, C] float32 tensor with preprocessed
                images.
            features[HASH_KEY] is a [batch_size] int32 tensor representing
                unique identifiers for the images.
            features[fields.InputDataFields.true_image_shape] is a
                [batch_size, 3] int32 tensor representing the true image
                shapes, as preprocessed images could be padded.
            features[fields.InputDataFields.original_image] (optional, not
                used during training) is a [batch_size, H, W, C] float32
                tensor with original images.
        labels: A dictionary of groundtruth tensors. This method unstacks
            these labels using model_lib.unstack_batch. The stacked labels
            are of the form returned by `inputs.train_input` and
            `inputs.eval_input`.
            labels[fields.InputDataFields.num_groundtruth_boxes] is a
                [batch_size] int32 tensor indicating the number of valid
                groundtruth boxes per image.
            labels[fields.InputDataFields.groundtruth_boxes] is a
                [batch_size, num_boxes, 4] float32 tensor containing the
                corners of the groundtruth boxes.
            labels[fields.InputDataFields.groundtruth_classes] is a
                [batch_size, num_boxes, num_classes] float32 one-hot tensor
                of classes. num_classes includes the background class.
            labels[fields.InputDataFields.groundtruth_weights] is a
                [batch_size, num_boxes] float32 tensor containing
                groundtruth weights for the boxes.
            -- Optional --
            labels[fields.InputDataFields.groundtruth_instance_masks] is a
                [batch_size, num_boxes, H, W] float32 tensor containing
                only binary values, which represent instance masks for
                objects.
            labels[fields.InputDataFields.groundtruth_keypoints] is a
                [batch_size, num_boxes, num_keypoints, 2] float32 tensor
                containing keypoints for each box.
            labels[fields.InputDataFields.groundtruth_dp_num_points] is a
                [batch_size, num_boxes] int32 tensor with the number of
                DensePose sampled points per instance.
            labels[fields.InputDataFields.groundtruth_dp_part_ids] is a
                [batch_size, num_boxes, max_sampled_points] int32 tensor
                with the part ids (0-indexed) for each instance.
            labels[fields.InputDataFields.groundtruth_dp_surface_coords] is
                a [batch_size, num_boxes, max_sampled_points, 4] float32
                tensor with the surface coordinates for each point. Each
                surface coordinate is of the form (y, x, v, u) where (y, x)
                are normalized image locations and (v, u) are part-relative
                normalized surface coordinates.
            labels[fields.InputDataFields.groundtruth_labeled_classes] is a
                float32 k-hot tensor of classes.
            labels[fields.InputDataFields.groundtruth_track_ids] is a int32
                tensor of track IDs.
            labels[fields.InputDataFields.groundtruth_keypoint_depths] is a
                float32 tensor containing keypoint depths information.
            labels[fields.InputDataFields.groundtruth_keypoint_depth_weights]
                is a float32 tensor containing the weights of the keypoint
                depth feature.
        unpad_groundtruth_tensors: A parameter passed to unstack_batch.
        optimizer: The training optimizer that will update the variables.
        learning_rate: The learning rate tensor for the current training
            step. This is used only for TensorBoard logging purposes, it
            does not affect model training.
        add_regularization_loss: Whether or not to include the model's
            regularization loss in the losses dictionary.
        clip_gradients_value: If this is present, clip the gradients global
            norm at this value using `tf.clip_by_global_norm`.
        global_step: The current training step. Used for TensorBoard logging
            purposes. This step is not updated by this function and must be
            incremented separately.
        num_replicas: The number of replicas in the current distribution
            strategy. This is used to scale the total loss so that training
            in a distribution strategy works correctly.

    Returns:
        The total loss observed at this training step, after division by
        `num_replicas`.
    """
    # Execute a single training step in the TF v2 style loop.
    is_training = True

    detection_model._is_training = is_training  # pylint: disable=protected-access
    tf.keras.backend.set_learning_phase(is_training)

    labels = model_lib.unstack_batch(
        labels, unpad_groundtruth_tensors=unpad_groundtruth_tensors)

    with tf.GradientTape() as tape:
        losses_dict, _ = _compute_losses_and_predictions_dicts(
            detection_model, features, labels, add_regularization_loss)

        # Normalize loss for num replicas. This happens inside the tape so
        # the gradients are taken of the normalized loss.
        replica_count = tf.constant(num_replicas, dtype=tf.float32)
        total_loss = tf.math.divide(
            losses_dict['Loss/total_loss'], replica_count)
        losses_dict['Loss/normalized_total_loss'] = total_loss

    for loss_type, loss_value in losses_dict.items():
        tf.compat.v2.summary.scalar(loss_type, loss_value, step=global_step)

    trainable_variables = detection_model.trainable_variables
    gradients = tape.gradient(total_loss, trainable_variables)
    if clip_gradients_value:
        gradients = tf.clip_by_global_norm(
            gradients, clip_gradients_value)[0]
    optimizer.apply_gradients(zip(gradients, trainable_variables))

    tf.compat.v2.summary.scalar(
        'learning_rate', learning_rate, step=global_step)
    tf.compat.v2.summary.image(
        name='train_input_images',
        step=global_step,
        data=features[fields.InputDataFields.image],
        max_outputs=3)
    return total_loss
def _train_op_fn(loss,
                 optimizer_fn,
                 l2_regularization=-1,
                 gradient_max_norm=-1,
                 use_synchronous_optimizer=False):
    """Returns the op to optimize the loss.

    Supports l2 regularization, gradient clipping by global norm, and
    wrapping the optimizer in a `SyncReplicasOptimizer`.

    Args:
        loss: The training loss before regularization.
        optimizer_fn: Zero-argument callable returning the optimizer.
        l2_regularization: A float that multiplies the l2 norm of every
            trainable variable in the loss. Ignored unless > 0.
        gradient_max_norm: A float, the global norm to clip gradients to.
            Ignored unless > 0.
        use_synchronous_optimizer: A bool, whether to use synchronous
            optimization.

    Returns:
        A tuple `(train_op, train_hooks)`: the grouped apply-gradients op,
        created under a control dependency on the graph's `UPDATE_OPS`
        collection, and a list of session-run hooks (empty unless the
        synchronous optimizer is used).
    """
    total_loss = loss
    if l2_regularization > 0:
        weight_losses = []
        for weight in tf.compat.v1.trainable_variables():
            weight_losses.append(
                tf.multiply(tf.nn.l2_loss(weight), l2_regularization,
                            name="l2_weight_loss"))
        total_loss = tf.add_n(weight_losses + [loss], name="total_loss")

    global_step = tf.compat.v1.train.get_or_create_global_step()
    opt = optimizer_fn()
    train_hooks = []

    if use_synchronous_optimizer:
        config = tf.estimator.RunConfig()
        workers = config.num_worker_replicas + 1
        to_aggregate = workers - _compute_tolerance(workers)
        opt = tf.compat.v1.train.SyncReplicasOptimizer(
            opt, replicas_to_aggregate=to_aggregate,
            total_num_replicas=workers)
        train_hooks.append(opt.make_session_run_hook(config.is_chief))

    tvars = tf.compat.v1.trainable_variables()
    grads_and_vars = opt.compute_gradients(loss=total_loss, var_list=tvars)

    # TODO(b/172564129): switch to tf.contrib.estimator.clip_gradients_by_norm
    if gradient_max_norm > 0.0:
        grads = [grad for grad, _ in grads_and_vars]
        tvars = [var for _, var in grads_and_vars]
        grads = tf.clip_by_global_norm(grads, gradient_max_norm)[0]
        grads_and_vars = list(zip(grads, tvars))

    # Only the synchronous optimizer is handed the global step.
    apply_gradients_op = (
        opt.apply_gradients(grads_and_vars, global_step)
        if use_synchronous_optimizer
        else opt.apply_gradients(grads_and_vars))

    update_ops = tf.compat.v1.get_collection(
        tf.compat.v1.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        return tf.group(apply_gradients_op), train_hooks
def _step(self) -> Dict[str, tf.Tensor]:
    """Run one critic update and return the values to log.

    Pulls one sample from `self._iterator`, computes the critic loss against
    the target critic (evaluated at the policy's action for the next
    observation), applies the optionally clipped gradients to the critic
    network, copies the critic variables into the target critic every
    `self._target_update_period` steps, and increments `self._num_steps`.

    Returns:
        A dict with `'critic_loss'` and `'q_s0'`. `'q_s0'` is the mean of
        `self._critic_mean` on `self._init_observations` every 100th step
        when those observations are set, and a constant 0 otherwise.
    """
    # Get data from replay (dropping extras if any). Note there is no
    # extra data here because we do not insert any into Reverb.
    transition = next(self._iterator)
    o_tm1, a_tm1, r_t, d_t, o_t = transition.data[:5]

    # Cast the additional discount to match the environment discount dtype.
    discount = tf.cast(self._discount, dtype=d_t.dtype)

    # Target value, computed outside the gradient tape.
    next_action = self._policy_network(o_t)
    q_t = self._target_critic_network(o_t, next_action)
    if not self._distributional and self._vmin is not None:
        q_t = tf.clip_by_value(q_t, self._vmin, self._vmax)
        logging.info('Clip target critic network output with [%f, %f]',
                     self._vmin, self._vmax)

    with tf.GradientTape() as tape:
        # Critic learning.
        q_tm1 = self._critic_network(o_tm1, a_tm1)

        # Critic loss.
        if self._distributional:
            critic_loss = losses.categorical(q_tm1, r_t, discount * d_t, q_t)
        else:
            # Squeeze into the shape expected by the td_learning
            # implementation.
            q_tm1 = tf.squeeze(q_tm1, axis=-1)  # [B]
            q_t = tf.squeeze(q_t, axis=-1)  # [B]
            critic_loss = trfl.td_learning(
                q_tm1, r_t, discount * d_t, q_t).loss

        critic_loss = tf.reduce_mean(critic_loss, axis=[0])

    # Get trainable variables and compute gradients.
    critic_variables = self._critic_network.trainable_variables
    critic_gradients = tape.gradient(critic_loss, critic_variables)

    # Maybe clip gradients.
    if self._clipping:
        critic_gradients, _ = tf.clip_by_global_norm(critic_gradients, 40.)

    # Apply gradients.
    self._critic_optimizer.apply(critic_gradients, critic_variables)

    online_variables = self._critic_network.variables
    target_variables = self._target_critic_network.variables

    # Periodically copy the online critic into the target critic.
    if tf.math.mod(self._num_steps, self._target_update_period) == 0:
        for online_var, target_var in zip(online_variables, target_variables):
            target_var.assign(online_var)

    if self._init_observations is not None:
        if tf.math.mod(self._num_steps, 100) == 0:
            init_obs = tree.map_structure(tf.convert_to_tensor,
                                          self._init_observations)
            init_actions = self._policy_network(init_obs)
            init_critic = tf.reduce_mean(
                self._critic_mean(init_obs, init_actions))
        else:
            init_critic = tf.constant(0.)
    else:
        init_critic = tf.constant(0.)

    self._num_steps.assign_add(1)

    # Losses to track.
    return {
        'critic_loss': critic_loss,
        'q_s0': init_critic,
    }
def step_fn_d(self, batch):
    """Run one discriminator update.

    Feeds the model's outputs for `batch` to
    `self.model.discriminator_step_fn`, clips the returned gradients by
    global norm (`self.grad_clip_norm`) and applies them to
    `self.model.discriminator_variables` with `self.d_optimizer`.

    Args:
        batch: Input forwarded unchanged to `self.model`.

    Returns:
        The discriminator losses returned by `discriminator_step_fn`.
    """
    model_outputs = self.model(batch)
    d_losses, raw_grads = self.model.discriminator_step_fn(model_outputs)
    clipped_grads = tf.clip_by_global_norm(raw_grads, self.grad_clip_norm)[0]
    self.d_optimizer.apply_gradients(
        zip(clipped_grads, self.model.discriminator_variables))
    return d_losses
def step_fn_g(self, batch):
    """Per-Replica training step.

    Takes losses and gradients from `self.model.step_fn(batch)`, clips the
    gradients by global norm (`self.grad_clip_norm`) and applies them to
    `self.model.generator_variables` with `self.optimizer`.

    Args:
        batch: Input forwarded unchanged to `self.model.step_fn`.

    Returns:
        The losses returned by `self.model.step_fn`.
    """
    # The first element returned by `step_fn` is not used by this step.
    _, losses, raw_grads = self.model.step_fn(batch)
    clipped_grads = tf.clip_by_global_norm(raw_grads, self.grad_clip_norm)[0]
    self.optimizer.apply_gradients(
        zip(clipped_grads, self.model.generator_variables))
    return losses