def testNoPS(self):
    """Checks placement when the ps job shares the worker's job name.

    Both `worker` and `ps` are named '/job:trainer' with one replica each, so
    every variable created under the cluster placer, as well as the op that
    sums them, is expected on '/job:trainer' task 0, CPU 0.
    """
    params = cluster_factory.Cluster.Params()
    params.worker.name = '/job:trainer'
    params.worker.replicas = 1
    params.ps.name = '/job:trainer'
    params.ps.replicas = 1
    test_cluster = cluster_factory.Cluster(params)

    graph = tf.Graph()
    with graph.as_default(), tf.device(test_cluster.GetPlacer()):
        variables = [
            tf.get_variable('x%d' % idx, (10, 10, 10)) for idx in range(10)
        ]
        total = tf.add_n(variables)

    # Variables and the sum share the same expected device string.
    expected_device = cluster.MakeDeviceString(
        job_name='/job:trainer', task_id=0, device_name='CPU', device_id=0)
    for var in variables:
        self.assertEqual(var.device, expected_device)
    self.assertEqual(total.device, expected_device)
def testPSWithGPUs(self):
    """Checks variable placement across ps replicas that each own two GPUs.

    With 4 ps replicas and 2 GPUs per replica, the i-th variable is expected
    on ps task `(i // 2) % 4`, GPU `i % 2`. The op summing all variables is
    expected on the trainer's CPU 0.
    """
    p = cluster_factory.Cluster.Params()
    p.worker.name = '/job:trainer'
    p.worker.replicas = 1
    p.ps.name = '/job:ps'
    p.ps.replicas = 4
    p.ps.gpus_per_replica = 2
    c = cluster_factory.Cluster(p)

    g = tf.Graph()
    vs = []
    with g.as_default():
        with tf.device(c.GetPlacer()):
            for i in range(10):
                vs.append(tf.get_variable('x%d' % i, (10, 10, 10)))
            sum_all = tf.add_n(vs)

    for i, v in enumerate(vs):
        # Floor division keeps the task id an integer. Under Python 3 true
        # division, `i / 2` would produce fractional task ids such as 0.5.
        self.assertEqual(
            v.device,
            cluster.MakeDeviceString(
                job_name='/job:ps',
                task_id=(i // 2) % 4,
                device_name='GPU',
                device_id=i % 2))

    self.assertEqual(
        sum_all.device,
        cluster.MakeDeviceString(
            job_name='/job:trainer',
            task_id=0,
            device_name='CPU',
            device_id=0))
def testDefaultParamsWithDynamicShape(self):
    """Checks default placement for variables whose initializer shape is dynamic.

    Each variable is initialized from `tf.random.uniform` with a shape fed
    through `placeholder_with_default(..., shape=[None])` and created with
    `validate_shape=False`. With default cluster params, all variables and
    their sum are expected on '/job:localhost' task 0, CPU 0.
    """
    test_cluster = cluster_factory.Cluster(cluster_factory.Cluster.Params())

    graph = tf.Graph()
    variables = []
    with graph.as_default(), tf.device(test_cluster.GetPlacer()):
        for idx in range(10):
            # shape=[None] leaves the length of the shape vector unspecified.
            shape_const = tf.constant([2], dtype=tf.int32)
            dyn_shape = tf.placeholder_with_default(shape_const, shape=[None])
            initial_value = tf.random.uniform(dyn_shape, dtype=tf.float64)
            variables.append(
                tf.get_variable(
                    'x%d_wb/var' % idx,
                    initializer=initial_value,
                    validate_shape=False))
        total = tf.add_n(variables)

    expected_device = cluster.MakeDeviceString(
        job_name='/job:localhost', task_id=0, device_name='CPU', device_id=0)
    for var in variables:
        self.assertEqual(var.device, expected_device)
    self.assertEqual(total.device, expected_device)
def FrontendAndEncoderFProp(self, theta, input_batch_src, initial_state=None):
    """Runs the optional frontend and then the encoder on the source batch.

    Args:
      theta: A NestedMap object containing weights' values of this layer and
        its children layers.
      input_batch_src: An input NestedMap as per `BaseAsrFrontend.FProp`.
      initial_state: None or a NestedMap object containing the initial states.
        It is forwarded to the encoder as `state0` only when truthy.

    Returns:
      A NestedMap as from `AsrEncoder.FProp`. If any auxiliary losses were
      collected while the encoder ran, their sum is attached as `aux_loss`.
    """
    if self.params.frontend:
        with tf.name_scope('frontend'):
            input_batch_src = self.frontend.FProp(theta.frontend,
                                                  input_batch_src)

    with layers_with_attention.AuxLossContext() as aux_loss_ctx:
        # Only pass `state0` when an initial state was actually supplied.
        state_kwargs = {'state0': initial_state} if initial_state else {}
        encoder_outputs = self.encoder.FProp(theta.encoder, input_batch_src,
                                             **state_kwargs)

        # Get aux loss if there is any.
        collected_losses = aux_loss_ctx.aux_losses
        if collected_losses:
            assert isinstance(collected_losses, list)
            assert len(collected_losses) >= 1
            encoder_outputs.aux_loss = tf.add_n(collected_losses)

    return encoder_outputs
def testDefaultParams(self):
    """Checks a default-params cluster: no summaries, everything on local CPU.

    Asserts that `add_summary` is false, and that ten variables created under
    the cluster placer plus the op summing them are all on '/job:localhost'
    task 0, CPU 0.
    """
    test_cluster = cluster_factory.Cluster(cluster_factory.Cluster.Params())
    self.assertFalse(test_cluster.add_summary)

    graph = tf.Graph()
    with graph.as_default(), tf.device(test_cluster.GetPlacer()):
        variables = [
            tf.get_variable('x%d' % idx, (10, 10, 10)) for idx in range(10)
        ]
        total = tf.add_n(variables)

    expected_device = test_cluster._MakeDeviceString(
        job_name='/job:localhost', task_id=0, device_name='CPU', device_id=0)
    for var in variables:
        self.assertEqual(var.device, expected_device)
    self.assertEqual(total.device, expected_device)
def testManyHotLabels(self):
    """Compares MultiLabelContrastiveLoss with averaged one-vs-negatives softmax.

    Every example has its first `num_positive` classes labelled positive. The
    expected loss is the mean, over positives, of the softmax cross entropy of
    that single positive against all negative classes.
    """
    batch_size, num_classes, num_positive = 7, 400, 5

    # To help keep the test simple, we put the positive labels on the
    # first 'num_positive' classes in every example.
    labels = np.zeros((batch_size, num_classes), np.float32)
    labels[:, :num_positive] = 1.0
    # Logits are uniform noise scaled by 10 and shifted by a large constant.
    # NOTE(review): the 1e7 offset presumably exercises numerical stability.
    noise = np.random.uniform(size=labels.shape).astype(np.float32)
    logits = noise * 10 + 1e7

    losses = label_lib.MultiLabelContrastiveLoss(
        tf.convert_to_tensor(labels, dtype=tf.float32),
        tf.convert_to_tensor(logits, dtype=tf.float32))

    # Verify that the multi-label loss is equivalent to the average softmax
    # cross entropy of each positive pair vs. all negative pairs.
    negative_pair_logits = logits[:, num_positive:]
    one_vs_all_labels = np.zeros(
        (batch_size, num_classes - num_positive + 1), np.float32)
    one_vs_all_labels[:, 0] = 1

    def _OneVsAllLoss(positive_index):
        # Column 0 holds the chosen positive; the rest are all the negatives.
        positive_column = logits[:, positive_index:(positive_index + 1)]
        one_vs_all_logits = np.concatenate(
            [positive_column, negative_pair_logits], axis=1)
        return tf.nn.softmax_cross_entropy_with_logits(
            labels=one_vs_all_labels, logits=one_vs_all_logits)

    expected_loss_terms = [_OneVsAllLoss(i) for i in range(num_positive)]
    expected_loss = tf.add_n(expected_loss_terms) / num_positive
    self.assertAllClose(expected_loss, losses)
def Merge(xs):
    """Sums corresponding entries across the sequences in `xs`.

    Args:
      xs: An iterable of equally structured sequences. Entries at the same
        position are grouped together via `zip(*xs)`.

    Returns:
      A tuple with one element per position. A position yields None when the
      entry from the first sequence is None; otherwise it is the `tf.add_n`
      of all entries at that position.
    """
    return tuple(
        None if group[0] is None else tf.add_n(list(group))
        for group in zip(*xs))
def testParallelLayer(self):
    """Compares ParallelLayer output with a numpy sum of its three branches.

    The layer has two FC branches and one sequential branch (FC followed by
    dropout). The reference recomputes each branch as relu(x . w + b) in numpy
    and adds them up.
    """

    def _SumAcrossBranches(xs):
        return tuple([tf.add_n(x) for x in zip(*xs)])

    def _FcParams(name):
        return lingvo_layers.FCLayer.Params().Set(
            name=name, input_dim=32, output_dim=4)

    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(24332)
        dropout_params = lingvo_layers.DropoutLayer.Params().Set(
            name='dropout', keep_prob=0.5)
        seq_params = layers.SequentialLayer.Params().Set(
            name='seq', sub=[_FcParams('baz'), dropout_params])
        p = layers.ParallelLayer.Params().Set(
            name='test',
            merge=_SumAcrossBranches,
            sub=[_FcParams('foo'), _FcParams('bar'), seq_params])
        # NOTE(review): eval mode presumably disables the dropout, since the
        # numpy reference below does not model it.
        p.is_eval = True
        layer = p.Instantiate()
        x = tf.random_normal(shape=[2, 32])
        y = layer.FPropDefaultTheta(x)

    with self.session(graph=graph) as sess:
        sess.run(tf.global_variables_initializer())
        x_val, y_val, w = sess.run([x, y, layer.vars])

        # relu(act \dot w + b) for each branch, in declaration order.
        out = []
        for branch_vars in (w.foo, w.bar, w.seq.baz):
            pre_activation = np.matmul(x_val, branch_vars.w) + branch_vars.b
            out.append(np.maximum(0, pre_activation))
            self.assertEqual(out[-1].shape, (2, 4))

        np_result = out[0]
        for branch_out in out[1:]:
            np_result = np.add(np_result, branch_out)
        self.assertAllClose(np_result, y_val)
def _update_mask(self, weights, threshold):
    """Updates the mask for a given weight tensor.

    The absolute weights are sorted in descending order and the k-th largest
    magnitude is taken as the current threshold, where
    k = round(num_weights * (1 - sparsity)). The returned threshold is the
    weighted sum of that value and the incoming `threshold`, using
    `self._spec.threshold_decay` as the weight of the incoming value.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold.

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step.
      new_mask: A float32 tensor with the same shape as weights, holding 1.0
        where the absolute weight is greater than or equal to the new
        threshold and 0.0 elsewhere.

    Raises:
      ValueError: if sparsity is not defined.
    """
    if self._sparsity is None:
        raise ValueError('Sparsity variable undefined')

    sparsity = self._get_sparsity(weights.op.name)
    with tf.name_scope(weights.op.name + '_pruning_ops'):
        abs_weights = tf.abs(weights)
        k = tf.cast(
            tf.round(
                tf.cast(tf.size(abs_weights), tf.float32) * (1 - sparsity)),
            tf.int32)
        # Keep at least one weight: if the product rounds to 0, `k - 1` would
        # be -1, which is an out-of-bounds index for the gather below.
        k = tf.maximum(k, 1)
        # Sort the entire array (top_k over all elements sorts descending).
        values, _ = tf.nn.top_k(
            tf.reshape(abs_weights, [-1]), k=tf.size(abs_weights))
        # Grab the (k-1) th value
        current_threshold = tf.gather(values, k - 1)
        smoothed_threshold = tf.add_n([
            tf.multiply(current_threshold, 1 - self._spec.threshold_decay),
            tf.multiply(threshold, self._spec.threshold_decay)
        ])
        new_mask = tf.cast(
            tf.greater_equal(abs_weights, smoothed_threshold), tf.float32)
    return smoothed_threshold, new_mask
def testPSRandomSize(self):
    """Checks per-device totals for 200 randomly sized variables on 10 ps jobs.

    Variable sizes are drawn from `np.random.randint` after seeding with 301,
    so the hard-coded expected totals depend on that seed and on the order of
    the draws. The op summing all variables is expected on the trainer CPU.
    """
    params = cluster_factory.Cluster.Params()
    params.worker.name = '/job:trainer'
    params.ps.name = '/job:ps'
    params.ps.replicas = 10
    test_cluster = cluster_factory.Cluster(params)

    graph = tf.Graph()
    variables = []
    np.random.seed(301)
    with graph.as_default(), tf.device(test_cluster.GetPlacer()):
        # Creates 200 variables with different sizes.
        for idx in range(200):
            # Exactly one randint draw per variable; only its bound varies.
            if idx % 13:
                upper_bound = 10000
            elif idx % 7:
                upper_bound = 100
            else:
                upper_bound = 10
            num_elements = np.random.randint(upper_bound)
            # `(num_elements)` is a plain int, not a 1-tuple.
            variables.append(
                tf.get_variable('x%d' % idx, shape=(num_elements)))
        total = tf.add_n([tf.reduce_sum(var) for var in variables])

    # Computes the total size of variables placed on each device.
    size_by_device = {}  # device name -> size
    for var in variables:
        var_size = tf.TensorShape(var.op.get_attr('shape')).num_elements()
        size_by_device[var.device] = (
            size_by_device.get(var.device, 0) + var_size)

    expected_sizes = [
        91701, 91361, 90346, 88738, 87240, 89265, 91944, 92472, 88051, 95053
    ]
    for device, allocated in zip(sorted(size_by_device), expected_sizes):
        self.assertEqual(size_by_device[device], allocated)

    self.assertEqual(
        total.device,
        cluster.MakeDeviceString(
            job_name='/job:trainer',
            replica_id=0,
            task_id=0,
            device_name='CPU',
            device_id=0))
def GradSum(v, *gs):
    """Sums the non-None gradients in `gs`.

    Args:
      v: The value the gradients belong to; only used in the log message.
      *gs: Gradients, any of which may be None.

    Returns:
      None if every gradient is None (or none were given); otherwise the
      `tf.add_n` of the gradients that are not None.
    """
    tf.logging.info('GradSum: %s: %s', v, gs)
    present = [g for g in gs if g is not None]
    if not present:
        return None
    return tf.add_n(present)
def _Gradient(inputs, _, original_grad):
    """Computes the GradDrop-transformed gradient.

    Reads `self` and `p` from the enclosing scope. For every (loss, leak_ratio)
    pair in `self._losses` the gradient w.r.t. `self._output_tensor` is taken;
    losses that produce no gradient are skipped with a warning. A random sign
    is then chosen per element and, for each loss, gradient entries that
    disagree with that sign are scaled down to the loss's leak ratio.

    Args:
      inputs: Forward inputs; each per-loss gradient is multiplied by them (or
        by their sign when `p.use_input_sign_only` is set) before the sign
        statistics are computed.
      _: Unused.
      original_grad: The incoming gradient. It supplies the output dtype and,
        when `p.keep_gradnorm_constant` is set, the norm to rescale to.

    Returns:
      The sum of the masked per-loss gradients cast to `original_grad.dtype`,
      optionally rescaled to the norm of `original_grad`.

    Raises:
      ValueError: if no loss yields a gradient.
    """
    # Compute the gradients for each loss w.r.t. the inputs.
    # TODO(jngiam): Look into whether TF dedups this computation.
    # Leak ratios are collected together with the gradients so the two lists
    # stay aligned when a loss is skipped for having no gradient.
    per_loss_grads = []
    leak_ratios = []
    for loss, leak_ratio in self._losses:
        per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
        if per_loss_grad is None:
            tf.logging.warning(
                'Loss %s did not result in a gradient during '
                'GradDrop computation.', loss)
            continue
        per_loss_grads.append(per_loss_grad)
        leak_ratios.append(leak_ratio)
    if not per_loss_grads:
        raise ValueError('No valid gradients for GradDrop.')

    # Multiply the gradients with the inputs.
    if p.use_input_sign_only:
        # 1.0 is added wherever |inputs| <= epsilon before taking the absolute
        # value, so the division below does not see a near-zero denominator.
        input_abs = tf.abs(
            tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
        grads = [grad * ((inputs) / (input_abs)) for grad in per_loss_grads]
    else:
        grads = [grad * inputs for grad in per_loss_grads]

    # Sum gradient over batch, assuming that batch is always on dim 0.
    if p.marginalize_batch_dim:
        grads = [tf.reduce_sum(grad, axis=0, keepdims=True) for grad in grads]

    # First discretize all gradients into their sign values.
    grad_sign_positive = [tf.cast(grad > 0.0, tf.float32) for grad in grads]
    grad_sign_negative = [tf.cast(grad < 0.0, tf.float32) for grad in grads]

    # Calculate the probability of positive gradients based on equation (1)
    # in the GradDrop paper.
    grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
    prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))

    # Implementation of different scales for the keep function. Larger
    # scales result in steeper keep functions.
    prob_pos *= p.keep_prob_function_scale
    if p.keep_prob_function == 'sigmoid':
        # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
        # allows the function scale in sigmoid to be compatible with the
        # function scale in the linear case.
        prob_pos = tf.sigmoid(4.0 * prob_pos)
    elif p.keep_prob_function == 'linear':
        prob_pos += 0.5

    # The main, default mode of GradDrop. Only gradients of one sign are kept,
    # and which sign is calculated via equation (1) of the main paper.
    # After this line prob_pos is +0.5 where the positive sign was drawn and
    # -0.5 where the negative sign was drawn.
    prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                       tf.float32) - 0.5
    grad_masks = [
        (gsp - gsn) * prob_pos >= 0
        for (gsn, gsp) in zip(grad_sign_negative, grad_sign_positive)
    ]

    # This diag value gives us the percentage of grads which are kept.
    gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
    diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
    summary_utils.scalar('average_grad_mask', diag)

    # Kept entries pass through unchanged; dropped entries are scaled by the
    # leak ratio of their loss.
    transformed_per_loss_grads = [
        grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
        for (leak, grad,
             grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)
    ]
    transformed_grad = tf.cast(
        tf.add_n(transformed_per_loss_grads), original_grad.dtype)

    if not p.keep_gradnorm_constant:
        return transformed_grad

    transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
    original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
    return transformed_grad * original_grad_norm / (
        transformed_grad_norm + p.epsilon)
def ComputeLoss(self, theta, predictions, input_batch):
    """Computes loss and other metrics for the given predictions.

    Args:
      theta: A `.NestedMap` object containing variable values of this task.
      predictions: The output of `ComputePredictions`.
      input_batch: A `.NestedMap` object containing input tensors to this
        tower.

    Returns:
      A tuple (metrics, per_example_tensors), where
      - `metrics` is a dict of str keys to (metric, weight) values
      - `per_example_tensors` is a dict of str keys to tensors describing each
        training example, where the first dimension of each tensor is the
        batch index. This implementation always returns it empty.
    """
    p = self.params

    # During TPU training, collect the encodings and ids from all TPUs so the
    # loss can be computed over all query-result pairs in the global batch.
    # To avoid duplicating work, each TPU operates on a non-overlapping
    # slice of these pairs. Specifically, each TPU uses queries drawn from its
    # local batch and results from the global batch.

    # Encodings of the local and global examples, keyed by modality.
    flattened = {}
    for modality in predictions:
        flattened[modality] = tf.reshape(predictions[modality].encodings,
                                         [-1, p.joint_embedding_dim])
    local_flat_encodings = py_utils.NestedMap(flattened)
    global_flat_encodings = tpu_utils.ConcatenateAcrossReplicas(
        local_flat_encodings)

    loss_terms = []
    metrics = {}
    for direction, loss_weight in p.loss_weights.items():
        query_modality, result_modality = direction
        if not loss_weight:
            logging.info('Skipping %s retrieval', direction)
            continue

        labeler_inputs = label_lib.ExamplePairs.BetweenLocalAndGlobalBatches(
            input_batch,
            query_modality=query_modality,
            result_modality=result_modality)
        labels = p.label_fn(labeler_inputs)

        # [num_queries, num_results]
        flat_similarities = self.score_function(
            local_flat_encodings[query_modality],
            global_flat_encodings[result_modality])
        flat_labels = tf.reshape(labels, flat_similarities.shape)

        # [num_queries]
        per_query_losses = label_lib.MultiLabelContrastiveLoss(
            labels=flat_labels, logits=flat_similarities)

        mean_per_query_loss = tf.reduce_mean(per_query_losses)
        loss_terms.append(loss_weight * mean_per_query_loss)
        metric_name = 'loss_{}_to_{}'.format(query_modality, result_modality)
        metrics[metric_name] = (mean_per_query_loss, 1)

    regularization_losses = utils.CollectRegularizationLosses(self)
    if p.regularization_loss_weight and regularization_losses:
        tf.logging.info('Adding TF1 regularization loss: %s',
                        regularization_losses)
        total_reg_loss = tf.reduce_sum(regularization_losses)
        loss_terms.append(p.regularization_loss_weight * total_reg_loss)
        metrics['loss_regularization'] = (total_reg_loss, 1)

    metrics['loss'] = (tf.add_n(loss_terms), 1)
    return metrics, {}
def FPropTower(self, theta, input_batch):
    """Runs the language model on one tower's batch and gathers its metrics.

    Args:
      theta: Variable values for this task; `theta.lm` is passed to the LM.
      input_batch: Batch with `ids`, `labels`, `paddings`, `weights`,
        `segment_ids`, `segment_pos`, `word_count` and `num_sentences`.

    Returns:
      A tuple (metrics, per_example_tensors). `metrics` maps metric names to
      (value, weight) pairs; `per_example_tensors` is always an empty dict.
    """
    with layers_with_attention.AuxLossContext() as aux_loss_ctx:
        assert aux_loss_ctx is not None
        p = self.params
        fprop_dtype = py_utils.FPropDtype(p)
        tf.logging.info('input_batch=%r', input_batch)

        ids = input_batch.ids
        labels_ids = input_batch.labels
        paddings = tf.cast(input_batch.paddings, fprop_dtype)
        weights = tf.cast(input_batch.weights, fprop_dtype)
        tf.logging.info('inputs={}'.format(
            (ids, paddings, labels_ids, weights)))

        batch_size = tf.shape(ids)[0]
        state0 = self.lm.zero_state(theta.lm, batch_size)
        labels = py_utils.NestedMap(
            class_ids=labels_ids, class_weights=weights)
        xent_output, _ = self.lm.FProp(
            theta.lm,
            ids,
            paddings,
            state0,
            labels,
            segment_ids=input_batch.segment_ids,
            segment_pos=input_batch.segment_pos)

        # +input_batch.num_sentences to account for the end of sequence symbol.
        words_plus_eos = input_batch.word_count + tf.cast(
            input_batch.num_sentences, dtype=tf.int32)
        num_words = tf.cast(tf.reduce_sum(words_plus_eos), fprop_dtype)

        predicted_labels = tf.cast(xent_output.per_example_argmax,
                                   labels_ids.dtype)
        num_sentences = tf.reduce_sum(input_batch.num_sentences)
        num_preds = tf.cast(xent_output.total_weight, fprop_dtype)

        # Weighted fraction of positions where the argmax matches the label.
        is_correct = tf.cast(
            tf.equal(labels_ids, predicted_labels), fprop_dtype)
        mean_acc = tf.reduce_sum(is_correct * weights) / tf.math.maximum(
            num_preds, 1)
        avg_xent = xent_output.avg_xent

        aux_loss_tensors = aux_loss_ctx.aux_losses
        if aux_loss_tensors:
            assert isinstance(aux_loss_tensors, list)
            assert len(aux_loss_tensors) >= 1
            # scalar
            assert p.aux_loss_weight > 0
            aux_loss = p.aux_loss_weight * tf.add_n(aux_loss_tensors)
        else:
            # scalar
            aux_loss = tf.zeros_like(avg_xent)

        loss = avg_xent + aux_loss
        metrics = {
            'loss': (loss, num_preds),
            'avg_xent': (avg_xent, num_preds),
            'aux_loss': (aux_loss, num_preds),
            'fraction_of_correct_next_step_preds': (mean_acc, num_preds),
            'log_pplx': (xent_output.avg_xent, num_preds),
            'log_pplx_per_word': (xent_output.total_xent / num_words,
                                  num_words),
            'num_predictions': (num_preds, 1),
            'num_words': (num_words, 1),
            'num_sentences': (num_sentences, 1),
        }
        return metrics, {}