def _verify_timestep_counts(self, num_splits):
  """Runs a dummy pipelined CNN and checks grad norm and accumulator steps."""
  micro_batches = 8
  batch = 16
  with self.session(graph=tf.Graph()) as sess:
    tf.set_random_seed(1245)
    input_batch = tf.random_uniform([batch, 8, 8, 1], seed=12345)
    pipeline_net = _BuildDummyPipelineCnn(
        num_splits=num_splits, num_micro_batches=micro_batches)
    endpoints = pipeline_net.FPropDefaultTheta(input_batch)
    aux_logits = None
    if isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints
    else:
      logits = endpoints
    mean_loss = tf.reduce_mean(logits)
    grad_list = tf.gradients(mean_loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grad_list))
    accumulators = pipeline_net.GetAccumulatorValues().Flatten()
    sess.run(tf.global_variables_initializer())
    grad_norm_val, acc_vals = sess.run([grad_norm, accumulators])
    test_utils.CompareToGoldenSingleFloat(self, 0.268087, grad_norm_val)
    # Accumulator values should be equal to number of time steps in pipeline.
    expected_steps = micro_batches if num_splits > 1 else 1
    for acc_val in list(acc_vals):
      self.assertEqual(acc_val, expected_steps)
    if aux_logits is not None:
      aux_out = sess.run(aux_logits)
      self.assertEqual(aux_out.shape, (batch, 8, 8, 1))
def testParamValueSumSquared(self):
  """Smoke test: SumSquared accepts the model's trainable variables."""
  with self.session(use_gpu=False, graph=tf.Graph()):
    mdl = self._testParams().Instantiate()
    mdl.FPropDefaultTheta()
    py_utils.SumSquared(tf.trainable_variables())
def AddNormSummary(name, vs_gs):
  """Returns and creates summary for norms of vs and their gradients gs.

  Args:
    name: A name string for summary.
    vs_gs: A `.NestedMap` or a list of `.NestedMap` of (variable, gradient).

  Returns:
    norm of variables, and norm of gradients.
  """
  # Fix: the docstring previously opened with four quote characters, leaving a
  # stray '"' as the first character of the docstring text.
  flatten = py_utils.NestedMap(child=vs_gs).Flatten()
  v_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
  scalar('var_norm/%s' % name, v_norm)
  g_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
  scalar('grad_norm/%s' % name, g_norm)
  return v_norm, g_norm
def AddNormSummary(name, vs_gs):
  """Returns and creates summary for norms of vs and their gradients gs.

  Args:
    name: A name string for summary.
    vs_gs: A `.NestedMap` or a list of `.NestedMap` of (variable, gradient).

  Returns:
    norm of variables, and norm of gradients.
  """
  # Fix: the docstring previously opened with four quote characters, leaving a
  # stray '"' as the first character of the docstring text.
  flatten = py_utils.Flatten(vs_gs)
  v_norm = tf.sqrt(py_utils.SumSquared([v for (v, _) in flatten]))
  g_norm = tf.sqrt(py_utils.SumSquared([g for (_, g) in flatten]))
  # Eager mode requires the v2 summary writer; graph mode uses the v1 scalar.
  if py_utils.IsEagerMode():
    scalar_v2(f'var_norm/{name}', v_norm)
    scalar_v2(f'grad_norm/{name}', g_norm)
  else:
    scalar(f'var_norm/{name}', v_norm)
    scalar(f'grad_norm/{name}', g_norm)
  return v_norm, g_norm
def ApplyGradients(self, task_call_scope, feature_to_gradient_dict):
  """Apply tpu embedding gradient updates.

  Args:
    task_call_scope: The current task call scope name.
    feature_to_gradient_dict: A `py_utils.NestedMap` of: tpu embedding feature
      name -> gradient tensor for the embedding feature.

  Returns:
    The gradient update op and a dict of eval metrics.

  Raises:
    ValueError: if gradients have been applied before for the current task.
  """
  # TODO(laigd): we need a way to tell which task needs backprop, and whether
  # send gradient ops are created for that task.
  if task_call_scope in self._send_gradient_op_by_task:
    raise ValueError(
        f'Send gradient op for task {task_call_scope} already exist.')

  # Scale every embedding gradient by the scheduled multiplier before sending.
  multiplier = self._gradient_multiplier_schedule.Value()
  scaled_grads = feature_to_gradient_dict.Transform(lambda g: g * multiplier)

  send_op = self._tpu_embedding.generate_send_gradients_op(
      scaled_grads, step=py_utils.GetGlobalStep())
  self._send_gradient_op_by_task[task_call_scope] = send_op

  act_values = self.GetActivations(task_call_scope).values()
  metrics = {
      'tpu_embedding_activation_norm':
          (tf.sqrt(py_utils.SumSquared(act_values)), tf.constant(1.0)),
      'tpu_embedding_grad_norm':
          (tf.sqrt(py_utils.SumSquared(scaled_grads.Flatten())),
           tf.constant(1.0)),
      'tpu_embedding_gradient_multiplier': (multiplier, tf.constant(1.0)),
  }
  return send_op, metrics
def _verify_timestep_counts(self,
                            num_splits,
                            auto_partition=False,
                            micro_batch_size=None):
  """Runs a dummy pipelined CNN and checks grad norm and accumulator steps."""
  micro_batches = 8
  batch = 16
  with self.session(graph=tf.Graph()) as sess:
    tf.random.set_seed(1245)
    input_batch = tf.random.uniform([batch, 8, 8, 1], seed=12345)
    if auto_partition:
      # Build 16 trivial layers and let _Partition split them across cells.
      cells = [
          _SimpyLayer.Params().Set(name='layer_{}'.format(i))
          for i in range(16)
      ]
      pipeline_net = PipeliningLayer.Params().Set(
          name='pipeline',
          num_micro_batches=micro_batches,
          cell_tpl=_Partition(cells, num_splits,
                              tshape.Shape([batch, 8, 8, 1]))).Instantiate()
    else:
      pipeline_net = _BuildDummyPipelineCnn(
          num_splits=num_splits,
          micro_batch_size=micro_batch_size,
          num_micro_batches=micro_batches)
    endpoints = pipeline_net.FPropDefaultTheta(input_batch)
    aux_logits = None
    if isinstance(endpoints, (list, tuple)):
      logits, aux_logits = endpoints
    else:
      logits = endpoints
    mean_loss = tf.reduce_mean(logits)
    grad_list = tf.gradients(mean_loss, tf.trainable_variables())
    grad_norm = tf.sqrt(py_utils.SumSquared(grad_list))
    accumulators = pipeline_net.GetAccumulatorValues().Flatten()
    sess.run(tf.global_variables_initializer())
    grad_norm_val, acc_vals = sess.run([grad_norm, accumulators])
    test_utils.CompareToGoldenSingleFloat(self, 0.268087, grad_norm_val)
    # Accumulator values should be equal to number of time steps in pipeline.
    expected_steps = micro_batches if num_splits > 1 else 1
    for acc_val in list(acc_vals):
      self.assertEqual(acc_val, expected_steps)
    if aux_logits is not None:
      aux_out = sess.run(aux_logits)
      self.assertEqual(aux_out.shape, (batch, 8, 8, 1))
def ScaleGradients(self, var_grads, gradient_adjuster=None):
  """Scales gradients according to training params.

  Args:
    var_grads: a `.NestedMap` whose values are (var, grad) pairs.
    gradient_adjuster: if not None, a function that mutates a given var_grads.

  Returns:
    A `.NestedMap` containing:

    - has_nan_or_inf: a scalar of 0 or 1, indicating whether there is any NaN
      or Inf in input gradients.
    - final_var_grads: a `.NestedMap` whose values are (var, grad) pairs, where
      gradients have already been scaled.
    - grad_scale: the gradient scale. 0 if gradient updates should be skipped
      for the step. (Optional, only returned in case global norm clipping is
      used.)
  """
  p = self.params

  # Computes gradients' norm and adds their summaries. Note that all_grad_norm
  # may be nan, which may cause grad_scale to be nan.
  for name, vg in var_grads.FlattenItems():
    summary_utils.AddNormSummary(name + '/' + p.name,
                                 py_utils.NestedMap(s=vg))
  all_grad_norm = tf.sqrt(
      py_utils.SumSquared(
          [g for (_, g) in py_utils.NestedMap(child=var_grads).Flatten()]))
  all_var_norm = tf.sqrt(
      py_utils.SumSquared(
          [v for (v, _) in py_utils.NestedMap(child=var_grads).Flatten()]))
  grad_norm_is_nan_or_inf = tf.logical_or(
      tf.is_nan(all_grad_norm), tf.is_inf(all_grad_norm))

  # Optional gradient adjustment. Note that this happens after computing
  # all_grad_norm.
  if gradient_adjuster is not None:
    tf.logging.info('gradient_adjuster=%s', gradient_adjuster)
    var_grads = gradient_adjuster(var_grads)

  # Handles NaN/Inf gradients.
  has_nan_or_inf = py_utils.HasNanOrInfGradient(var_grads)
  # Grad norm can still be inf even if none of the individual grad is inf.
  has_nan_or_inf = tf.logical_or(has_nan_or_inf, grad_norm_is_nan_or_inf)

  return_values = py_utils.NestedMap()
  if p.clip_gradient_single_norm_to_value:
    # Per-tensor norm clipping path: each gradient is clipped independently,
    # so no global grad_scale is computed or returned.
    # Currently using both types of clipping simultaneously is unsupported.
    if p.clip_gradient_norm_to_value:
      raise ValueError('Cannot use clip_gradient_single_norm_to_value=%f and '
                       'clip_gradient_norm_to_value=%f.' %
                       (p.clip_gradient_single_norm_to_value,
                        p.clip_gradient_norm_to_value))
    final_var_grads = py_utils.ApplyGradNormCliping(
        var_grads, p.clip_gradient_single_norm_to_value)
  else:
    # Global-norm path: a single scale factor is derived from the overall grad
    # norm (0 when NaN/Inf is detected) and applied to every gradient.
    grad_scale = self._GetGlobalGradScale(all_grad_norm, has_nan_or_inf)
    self._AddEvalMetric('grad_norm/all', all_grad_norm, tf.constant(1.0))
    self._AddEvalMetric('var_norm/all', all_var_norm, tf.constant(1.0))
    self._AddEvalMetric('grad_scale_all', grad_scale, tf.constant(1.0))
    final_var_grads = py_utils.ApplyGradMultiplier(var_grads, grad_scale)
    return_values.grad_scale = grad_scale

  return_values.has_nan_or_inf = has_nan_or_inf
  return_values.final_var_grads = final_var_grads
  return return_values