def _compute_preconditioned_raw_grad(self, var, partitioned_grads):
  """Returns preconditioned gradient.

  Args:
    var: tf.Variable associated with the gradient.
    partitioned_grads: Partitioned gradient tensor.

  Returns:
    A preconditioned gradient tensor.
  """
  partitioned_preconditioned_grads = []
  num_partitions = len(partitioned_grads)
  for pt_idx, pt_grad in enumerate(partitioned_grads):
    pt_shape = pt_grad.get_shape()
    rank = len(pt_shape)
    preconditioner_exists_for_dim = (
        self._preconditioner_available_for_dims(pt_shape))
    preconditioner_indices = self._preconditioner_indices(pt_shape)
    mat_preconditioner_list = []
    for i in range(rank):
      if preconditioner_exists_for_dim[i]:
        mat_preconditioner_list.append(
            self.get_slot(
                var,
                self._preconditioner_key_for_partition_and_dim(
                    i, pt_idx, num_partitions)))
    precond_grad = pt_grad
    if rank == 2 and all(preconditioner_exists_for_dim):
      # Fast path for speedup.
      precond_grad = tf.matmul(
          tf.matmul(mat_preconditioner_list[0], precond_grad),
          mat_preconditioner_list[1])
    else:
      for i in range(rank):
        if preconditioner_exists_for_dim[i]:
          precond_grad = tf.tensordot(
              precond_grad,
              mat_preconditioner_list[preconditioner_indices[i]],
              axes=([0], [0]))
        else:
          # If no preconditioner is available for this dimension, transpose
          # to rotate the axes so the next preconditioner contracts against
          # the correct dimension.
          precond_grad = tf.transpose(
              precond_grad, perm=list(range(1, rank)) + [0])
    partitioned_preconditioned_grads.append(precond_grad)
  return TensorPartitioner.reform_tensor(
      partitioned_preconditioned_grads,
      self._partitioner_metadata[var].num_splits_per_dim)
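# A minimal, self-contained sketch (hypothetical helper, not part of the
# optimizer above) showing why the tensordot/transpose loop applies one
# preconditioner matrix per tensor dimension: each
# tf.tensordot(..., axes=([0], [0])) contracts the leading axis and appends
# the result as the trailing axis, and tf.transpose performs the same axis
# rotation when a dimension has no preconditioner, so after `rank` steps the
# original axis order is restored.
import tensorflow as tf


def apply_preconditioners(grad, preconditioners):
  """Applies preconditioners[i] (or None) along dimension i of `grad`."""
  rank = len(grad.shape)
  out = grad
  for i in range(rank):
    if preconditioners[i] is not None:
      out = tf.tensordot(out, preconditioners[i], axes=([0], [0]))
    else:
      # No preconditioner for this axis: rotate axes so the next matrix still
      # lines up with the dimension it preconditions.
      out = tf.transpose(out, perm=list(range(1, rank)) + [0])
  return out


g = tf.reshape(tf.range(24, dtype=tf.float32), [4, 6])
p0, p1 = tf.eye(4), tf.eye(6)
# With symmetric preconditioners the loop matches the rank-2 fast path
# tf.matmul(tf.matmul(p0, g), p1); with identities it returns `g` unchanged.
result = apply_preconditioners(g, [p0, p1])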
def _BPropForVariables(self, vmap):
  """Constructs the backward graph."""
  bprop_variable_filters = self.input_generator.GetBpropVariableFilters()
  # Only compute the mask if the variable filters are not empty.
  if bprop_variable_filters != [''] * len(bprop_variable_filters):
    self._ComputeGradientMask(bprop_variable_filters)
  train_ops = {}  # mapping from op name to op.
  gradient_mask = None
  if self._per_input_gradient_mask:
    # TODO(neerajgaur): Change this to use source_selected from input_batch.
    onehot = self.input_generator.GetInputSourceOneHot()
    gradient_mask = {
        k: tf.tensordot(v, onehot, 1)
        for k, v in six.iteritems(self._per_input_gradient_mask)
    }
  all_losses = []
  for optimization in self.learners:
    loss_name = optimization.params.name
    metric = self._metrics.get(loss_name, None)
    if metric is None:
      raise ValueError('Loss %s not found in metrics %s' %
                       (loss_name, list(self._metrics.keys())))
    loss = metric[0]
    all_losses.append(loss)
    train_ops['train/%s' % loss_name], eval_metrics = optimization.Apply(
        loss,
        vmap,
        gradient_mask=gradient_mask,
        gradient_adjuster=self.AdjustGradients)
    for key, (value, weight) in six.iteritems(eval_metrics):
      self.AddEvalMetric(key + '/' + loss_name, value, weight)

  relevant_bn_updates, _ = py_utils.FindRelevantBatchNormUpdates(
      all_losses, tf.get_collection(py_utils.BATCH_NORM_UPDATES))
  train_ops['bn_updates'] = relevant_bn_updates

  # Get the op to update the weight masks and thresholds.
  train_ops['mask_updates'] = self._GetMaskUpdateOp()

  # Post training step update.
  train_ops['post_step'] = self.PostTrainingStepUpdate(self.global_step)

  with tf.control_dependencies(tf.nest.flatten(train_ops)):
    true_global_step = py_utils.GetOrCreateGlobalStepVar()
    with tf.colocate_with(true_global_step):
      increment_global_steps = tf.assign_add(true_global_step, 1)
    if self._global_step_var != true_global_step:
      with tf.colocate_with(self._global_step_var):
        increment_global_steps = tf.group(
            increment_global_steps, tf.assign_add(self._global_step_var, 1))
  train_ops['global_step'] = increment_global_steps

  # If we are using Tpu Embeddings, generate the monolithic send
  # gradient op.
  tpu_embedding_activations = tf.get_collection(
      py_utils.TPU_EMBEDDING_ACTIVATIONS)
  if tpu_embedding_activations:
    tpu_embedding_activations_dict = tpu_embedding_activations[0]
    tpu_embedding = tf.get_collection(py_utils.TPU_EMBEDDING)[0]
    tpu_embedding_send_gradient_op = py_utils.ComputeTpuEmbeddingGradients(
        self.loss, tpu_embedding_activations_dict, tpu_embedding)
    train_ops['tpu_embedding'] = tpu_embedding_send_gradient_op

  for op_name, op in six.iteritems(train_ops):
    assert op is not None, op_name

  # TODO(rpang): try to structure _train_op as:
  #   tf.cond(skip_step, <only update skip stats>, <all updates>)
  # so that we skip all other updates when a step is skipped.
  self._train_op = tf.group(*tf.nest.flatten(train_ops), name='bprop')
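# A minimal sketch (hypothetical variables, not Lingvo code) of the pattern
# used above to assemble the final bprop op: collect every update op in
# `train_ops`, make the global-step increment depend on all of them, and group
# everything into a single op, so fetching `train_op` always runs the full
# training step before the step counter moves.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

w = tf.get_variable('w', shape=[], initializer=tf.zeros_initializer())
global_step = tf.train.get_or_create_global_step()

# Stand-in for the per-learner variable-update ops.
train_ops = {'train/loss': tf.assign_add(w, 1.0)}

with tf.control_dependencies(tf.nest.flatten(train_ops)):
  with tf.colocate_with(global_step):
    train_ops['global_step'] = tf.assign_add(global_step, 1)

train_op = tf.group(*tf.nest.flatten(train_ops), name='bprop')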
def _BPropGenTrainOps(self, vmap, metrics=None, add_summary=True):
  """Populates the train_ops dictionary in a backwards pass."""
  metrics = metrics or self._metrics

  bprop_variable_filters = self.input_generator.GetBpropVariableFilters()
  # Only compute the mask if the variable filters are not empty.
  if bprop_variable_filters != [''] * len(bprop_variable_filters):
    self._ComputeGradientMask(bprop_variable_filters)

  train_ops = {}  # mapping from op name to op.
  gradient_mask = None
  if self._per_input_gradient_mask:
    # TODO(neerajgaur): Change this to use source_selected from input_batch.
    onehot = self.input_generator.GetInputSourceOneHot()
    gradient_mask = {
        k: tf.tensordot(v, onehot, 1)
        for k, v in self._per_input_gradient_mask.items()
    }
  all_losses = []
  for optimization in self.learners:
    learner_name = optimization.params.name
    loss_name = optimization.params.loss_name or learner_name
    metric = metrics.get(loss_name, None)
    if metric is None:
      raise ValueError('Loss %s not found in metrics %s' %
                       (loss_name, list(metrics.keys())))
    loss = metric[0]
    all_losses.append(loss)
    train_ops['train/%s' % learner_name], eval_metrics = optimization.Apply(
        loss,
        vmap,
        gradient_mask=gradient_mask,
        gradient_adjuster=self.AdjustGradients)
    if add_summary:
      for key, (value, weight) in eval_metrics.items():
        self.AddEvalMetric(key + '/' + learner_name, value, weight)

  relevant_bn_updates, _ = py_utils.FindRelevantBatchNormUpdates(
      all_losses, tf.get_collection(py_utils.BATCH_NORM_UPDATES))
  train_ops['bn_updates'] = relevant_bn_updates

  var_update_ops = [
      tf.group(*tf.nest.flatten(train_ops), name='var_update_ops')
  ]
  # Post training step update.
  with tf.control_dependencies(var_update_ops):
    post_step_op = self.PostTrainingStepUpdate(self.global_step)

  train_ops = {}
  with tf.control_dependencies([post_step_op]):
    # Get the op to update the weight masks and thresholds.
    mask_update_op = self._GetMaskUpdateOp()
    train_ops['mask_updates'] = mask_update_op
    with tf.control_dependencies([mask_update_op]):
      true_global_step = py_utils.GetOrCreateGlobalStepVar()
      with tf.ops.colocate_with(true_global_step):
        increment_global_steps = tf.assign_add(true_global_step, 1)
      if self._global_step_var != true_global_step:
        with tf.ops.colocate_with(self._global_step_var):
          increment_global_steps = tf.group(
              increment_global_steps,
              tf.assign_add(self._global_step_var, 1))
      train_ops['global_step'] = increment_global_steps

      # If we are using Tpu Embeddings, generate the monolithic send
      # gradient op.
      tpu_embedding_activations = tf.get_collection(
          py_utils.TPU_EMBEDDING_ACTIVATIONS)
      if tpu_embedding_activations:
        tpu_embedding_activations_dict = tpu_embedding_activations[0]
        tpu_embedding = tf.get_collection(py_utils.TPU_EMBEDDING)[0]
        tpu_embedding_send_gradient_op = py_utils.ComputeTpuEmbeddingGradients(
            self.loss, tpu_embedding_activations_dict, tpu_embedding)
        train_ops['tpu_embedding'] = tpu_embedding_send_gradient_op

  for op_name, op in train_ops.items():
    assert op is not None, op_name
  return train_ops
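# A minimal, hypothetical sketch (stand-in ops, not Lingvo code) of the
# sequencing used above. Unlike the flat grouping in _BPropForVariables, this
# version chains control dependencies so the ops run in a fixed order:
# variable updates, then the post-training-step update, then the mask update,
# and only then the global-step increment.
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

w = tf.get_variable('w', shape=[], initializer=tf.zeros_initializer())
counter = tf.get_variable(
    'post_steps', shape=[], initializer=tf.zeros_initializer())
mask = tf.get_variable('mask', shape=[], initializer=tf.ones_initializer())
global_step = tf.train.get_or_create_global_step()

update_w = tf.assign_add(w, 1.0)  # stand-in for the per-learner updates
var_update_ops = [tf.group(update_w, name='var_update_ops')]

with tf.control_dependencies(var_update_ops):
  post_step = tf.assign_add(counter, 1.0)  # stand-in post-training-step update

train_ops = {}
with tf.control_dependencies([post_step]):
  mask_update = tf.assign(mask, 0.5)  # stand-in weight-mask/threshold update
  train_ops['mask_updates'] = mask_update
  with tf.control_dependencies([mask_update]):
    train_ops['global_step'] = tf.assign_add(global_step, 1)

train_op = tf.group(*tf.nest.flatten(train_ops), name='bprop')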
def _BPropGenTrainOps(self, vmap, metrics=None, add_summary=True):
  """Populates the train_ops dictionary in a backwards pass."""
  metrics = metrics or self._metrics

  bprop_variable_filters = self.input_generator.GetBpropVariableFilters()
  # Only compute the mask if the variable filters are not empty.
  if bprop_variable_filters != [''] * len(bprop_variable_filters):
    self._ComputeGradientMask(bprop_variable_filters)

  train_ops = {}  # mapping from op name to op.
  gradient_mask = None
  if self._per_input_gradient_mask:
    # TODO(neerajgaur): Change this to use source_selected from input_batch.
    onehot = self.input_generator.GetInputSourceOneHot()
    gradient_mask = {
        k: tf.tensordot(v, onehot, 1)
        for k, v in self._per_input_gradient_mask.items()
    }
  all_losses = []
  for optimization in self.learners:
    learner_name = optimization.params.name
    (losses, train_ops['train/%s' % learner_name],
     eval_metrics) = optimization.Apply(
         metrics,
         vmap,
         gradient_mask=gradient_mask,
         gradient_adjuster=self.AdjustGradients)
    all_losses.extend(losses)
    if add_summary:
      for key, (value, weight) in eval_metrics.items():
        self.AddEvalMetric(key + '/' + learner_name, value, weight)

  relevant_bn_updates, _ = py_utils.FindRelevantBatchNormUpdates(
      all_losses, tf.get_collection(py_utils.BATCH_NORM_UPDATES))
  train_ops['bn_updates'] = relevant_bn_updates

  var_update_ops = [
      tf.group(*tf.nest.flatten(train_ops), name='var_update_ops')
  ]
  # Post training step update.
  with tf.control_dependencies(var_update_ops):
    post_step_op = self.PostTrainingStepUpdate()

  train_ops = {}
  with tf.control_dependencies([post_step_op]):
    # Get the op to update the weight masks and thresholds.
    mask_update_op = self._GetMaskUpdateOp()
    train_ops['mask_updates'] = mask_update_op
    with tf.control_dependencies([mask_update_op]):
      true_global_step = py_utils.GetOrCreateGlobalStepVar()
      with tf.ops.colocate_with(true_global_step):
        if self.params.defer_global_step_update:
          increment_global_steps = true_global_step
        else:
          increment_global_steps = tf.assign_add(true_global_step, 1)
      if self._global_step_var != true_global_step:
        with tf.ops.colocate_with(self._global_step_var):
          increment_global_steps = tf.group(
              increment_global_steps,
              tf.assign_add(self._global_step_var, 1))
      train_ops['global_step'] = increment_global_steps

      # If we are using Tpu Embeddings, generate the monolithic send
      # gradient op.
      if tf.get_collection(py_utils.TPU_EMBEDDING):
        tpu_embedding = tf.get_collection(py_utils.TPU_EMBEDDING)[0]
        sparse_grads = (
            tpu_embedding_gradient.get_gradients_through_dummy_table_variables(
                tpu_embedding))
        tpu_embedding_send_gradient_op = tpu_embedding.generate_send_gradients_op(
            sparse_grads, py_utils.GetGlobalStep())
        train_ops['tpu_embedding'] = tpu_embedding_send_gradient_op

        tpu_embedding_summary_tensors = tf.get_collection(
            py_utils.TPU_EMBEDDING_SUMMARY_TENSORS)
        if add_summary:
          for name, value, weight in tpu_embedding_summary_tensors:
            self.AddEvalMetric(
                name, value, weight, raise_if_already_added=False)

  for op_name, op in train_ops.items():
    assert op is not None, op_name
  return train_ops
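# A small, self-contained sketch (hypothetical names and values) of the
# per-input gradient-mask construction used in the functions above: each
# variable carries a mask vector over input sources, and tf.tensordot with the
# one-hot "current source" selector reduces it to the scalar mask applied to
# that variable's gradient for this step.
import tensorflow as tf

num_sources = 3
# Per-variable mask over input sources: 1.0 keeps gradients from that source,
# 0.0 freezes the variable for batches drawn from it.
per_input_gradient_mask = {
    'encoder/w': tf.constant([1.0, 1.0, 0.0]),
    'decoder/w': tf.constant([0.0, 1.0, 1.0]),
}
# One-hot selector for the source of the current batch (source index 2 here).
onehot = tf.one_hot(2, num_sources)

gradient_mask = {
    k: tf.tensordot(v, onehot, 1) for k, v in per_input_gradient_mask.items()
}
# gradient_mask['encoder/w'] evaluates to 0.0 and gradient_mask['decoder/w']
# to 1.0, so encoder gradients from this source would be zeroed out.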