def DistanceBetweenCentroidsAndBBoxesFastAndFurious(centroids, bboxes, masks):
  """Returns a per-entry distance between centroids and bboxes.

  The distance/loss loosely follows the 'Fast and Furious' paper by Luo et
  al., CVPR'18. It is only one of several possible ways to measure the
  distance; others may be added later.

  Args:
    centroids: [..., 4]. x/y/w/h for bboxes.
    bboxes: [..., 4]. ymin/xmin/ymax/xmax for bboxes.
    masks: [...]. masks[i] == 1 means the i-th entry (centroids[i] and
      bboxes[i]) should be considered in the distance/loss calculation.

  Returns:
    A [...] tensor. The i-th value is the distance measure of centroids[i] and
    bboxes[i].
  """
  ctr_x, ctr_y, ctr_w, ctr_h = tf.unstack(centroids, axis=-1, num=4)
  # The "gt_" prefix marks ground-truth values derived from `bboxes`.
  gt_x, gt_y, gt_w, gt_h = tf.unstack(BBoxesToXYWH(bboxes), axis=-1, num=4)

  def _ClampPositive(t):
    # Floors `t` at 1e-8 so it is safe as a divisor and as a log argument.
    return tf.maximum(tf.constant(1e-8, t.dtype), t)

  gt_w_safe = _ClampPositive(gt_w)
  gt_h_safe = _ClampPositive(gt_h)

  # Every term below is zero wherever masks[i] is 0.
  offset_x = py_utils.CheckNumerics(masks * (ctr_x - gt_x) / gt_w_safe)
  offset_y = py_utils.CheckNumerics(masks * (ctr_y - gt_y) / gt_h_safe)
  log_scale_w = py_utils.CheckNumerics(
      masks * tf.log(_ClampPositive(ctr_w) / gt_w_safe))
  log_scale_h = py_utils.CheckNumerics(
      masks * tf.log(_ClampPositive(ctr_h) / gt_h_safe))

  position_term = _SmoothL1Norm(offset_x) + _SmoothL1Norm(offset_y)
  return (position_term + _SmoothL1Norm(log_scale_w) +
          _SmoothL1Norm(log_scale_h))
def _testHelper(self, base_frnn_p, frnn_p, packed_input=False):
  """Asserts that two layers agree on outputs, final state and gradients.

  Both params are instantiated, run forward on the same inputs and initial
  state, and differentiated through the same scalar loss. The activations,
  the final `m`/`c` state and every per-variable gradient must be close.

  Args:
    base_frnn_p: Params of the layer that produces the expected values.
    frnn_p: Params of the layer whose values are checked.
    packed_input: Passed through to self._GetTestInputs.
  """
  inputs, padding, m0, c0, segment_id = self._GetTestInputs(packed_input)
  reference = base_frnn_p.Instantiate()
  candidate = frnn_p.Instantiate()

  def _BuildFetches(layer, initial_state):
    # Forward pass, then gradients of a scalar loss w.r.t. the layer's vars.
    act, state = layer.FPropDefaultTheta(
        inputs, padding, state0=initial_state, segment_id=segment_id)
    energy = (
        tf.reduce_sum(tf.math.square(act)) +
        tf.reduce_sum(state.m * state.c * state.c))
    loss = -tf.log(tf.sigmoid(energy))
    return [act, state, tf.gradients(loss, layer.vars.Flatten())]

  with self.session() as sess:
    tf.global_variables_initializer().run()

    state0 = py_utils.NestedMap(m=m0, c=c0)
    expected_act, expected_state, expected_grads = sess.run(
        _BuildFetches(reference, state0))
    actual_act, actual_state, actual_grads = sess.run(
        _BuildFetches(candidate, state0))

    for template, value in (
        ('expected_act:{}', expected_act),
        ('actual_act:{}', actual_act),
        ('expected_state:{}', expected_state),
        ('actual_state:{}', actual_state),
        ('expected_grads:{}', expected_grads),
        ('actual_grads:{}', actual_grads),
    ):
      tf.logging.info(template.format(value))

    self.assertAllClose(expected_act, actual_act)
    self.assertAllClose(expected_state.m, actual_state.m)
    self.assertAllClose(expected_state.c, actual_state.c)
    # Label each gradient comparison with its variable name.
    named_vars = candidate.vars.FlattenItems()
    for (vname, _), want, got in zip(named_vars, expected_grads,
                                     actual_grads):
      self.assertAllClose(want, got, msg=vname)
def testScaleGradientsCheckNumerics(self):
  """ScaleGradients when enable_check_numerics=True."""
  FLAGS.enable_check_numerics = True
  params = self.TestParams()
  params.input = base_input_generator.BaseSequenceInputGenerator.Params()
  task = params.Instantiate()
  scalar_zero = py_utils.WeightParams(
      shape=[], init=py_utils.WeightInit.Constant(0))
  task.CreateVariable('a', scalar_zero)
  var_a = task.theta.a
  # 0 * log(0) is 0 * -inf, which yields a NaN gradient.
  nan_grad = 0. * tf.log(0.)
  var_grads = py_utils.NestedMap(a=(var_a, nan_grad))
  scaled = task.learners[0].ScaleGradients(var_grads)

  with self.session():
    tf.global_variables_initializer().run()
    # All checks sit inside the context manager: the block is left as soon as
    # one of the evals raises the expected error.
    with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
                                 'is not finite'):
      self.assertTrue(scaled.has_nan_or_inf.eval())
      self.assertEqual(0., scaled.grad_scale.eval())
      # The final gradient must be finite.
      final_grad = scaled.final_var_grads.a[1]
      self.assertFalse(tf.is_nan(final_grad).eval())
      self.assertTrue(tf.is_finite(final_grad).eval())
def AddAttentionSummaryBatchMajor(attention_tensors,
                                  src_paddings,
                                  tgt_paddings,
                                  transcripts=None,
                                  max_outputs=3):
  """Adds an image summary showing the attention probability matrix and state.

  As opposed to AddAttentionSummary() takes all tensors with batch dimension in
  axis 0.

  Besides the plots, one scalar summary per attention tensor reports the mean
  of the entropy terms -p * log(p), normalized by log(source length).

  Args:
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for
      the source sequence.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for
      the target sequence.
    transcripts: Optional, transcripts shaped [batch, source_len] for the
      source sequence. Only attached to the first subplot.
    max_outputs: Integer maximum number of elements of the batch to plot.
  """
  name = attention_tensors[0].name + '/Attention'
  if not _ShouldAddSummary():
    return
  with plot.MatplotlibFigureSummary(name, max_outputs=max_outputs) as fig:
    src_lens = SequenceLength(src_paddings)
    tgt_lens = SequenceLength(tgt_paddings)

    # The normalizer depends only on the source lengths, so build it once
    # instead of once per attention tensor.
    max_entropy = tf.log(tf.cast(src_lens, tf.float32))
    # log(1) == 0: a length-1 source would turn the ratio below into 0/0 and
    # make the whole averaged summary NaN. Its entropy is 0 anyway, so any
    # non-zero divisor gives the right answer; use 1.
    max_entropy = tf.where(
        tf.equal(max_entropy, 0.0), tf.ones_like(max_entropy), max_entropy)
    # [batch] -> [batch, 1, 1] so it broadcasts against the attention tensor.
    max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)

    for n, atten in enumerate(attention_tensors):
      # Diagnostic metric that decreases as attention picks up.
      atten_normalized_entropy = -atten * tf.log(atten + 1e-10) / max_entropy
      scalar('Attention/average_normalized_entropy/%d' % n,
             tf.reduce_mean(atten_normalized_entropy))
      args = [atten, src_lens, tgt_lens]
      if transcripts is not None and n == 0:
        args.append(transcripts)
      fig.AddSubplot(
          args,
          TrimPaddingAndPlotAttention,
          title=atten.name,
          xlabel='Input',
          ylabel='Output')
def ApplyBias():
  """Biases log_probs towards the labels and updates `consistent`.

  Reads step_ids, num_hyps_per_beam, labels, weights, time_step, states, p and
  bs_results from the enclosing scope.

  Returns:
    A (log_probs, consistent) tuple: the log of the interpolation between the
    predicted probabilities and the label probabilities, and the updated
    consistency flags.
  """

  def _TileOverHyps(per_source):
    # [src_batch] -> [1, src_batch] -> [num_hyps_per_beam, src_batch].
    tiled = tf.tile(tf.reshape(per_source, [1, -1]), [num_hyps_per_beam, 1])
    # Flatten to [tgt_batch], where tgt_batch = num_hyps_per_beam*src_batch.
    return tf.reshape(tiled, [tf.shape(step_ids)[0]])

  # A hyp stays consistent if step_ids == labels from the previous step.
  # TODO(navari): Consider updating consistent only if weights > 0. Then
  # re-evaluate the need for bias_only_if_consistent=True.
  # The label gathered for step 0 is not meaningful; the step-0 check below
  # overrides the comparison in that case.
  prev_step = tf.maximum(time_step - 1, 0)
  prev_label = _TileOverHyps(tf.gather(labels, prev_step, axis=1))
  at_first_step = tf.equal(time_step, 0)
  matches_prev = tf.equal(prev_label, tf.squeeze(step_ids, 1))
  consistent = tf.logical_and(states.consistent,
                              tf.logical_or(at_first_step, matches_prev))

  # Label and weight slices for the current time_step.
  cur_label = _TileOverHyps(tf.gather(labels, time_step, axis=1))
  bias_weight = _TileOverHyps(tf.gather(weights, time_step, axis=1))
  if p.bias_only_if_consistent:
    bias_weight = bias_weight * tf.cast(consistent, p.dtype)

  # Turn the dense labels into (almost) one-hot probabilities. A tiny
  # uncertainty mass avoids exact 0 probs, which may cause issues with log.
  vocab_size = tf.shape(bs_results.log_probs)[1]
  uncertainty = tf.constant(1e-10, p.dtype)
  label_probs = tf.one_hot(
      cur_label,
      vocab_size,
      on_value=1 - uncertainty,
      off_value=uncertainty / tf.cast(vocab_size - 1, p.dtype),
      dtype=p.dtype)  # [tgt_batch, vocab_size]
  pred_probs = tf.exp(bs_results.log_probs)

  # Interpolate predicted probs and label probs; the weight must be in [0, 1].
  bias_weight = tf.expand_dims(bias_weight, 1)
  range_checks = [
      py_utils.assert_less_equal(bias_weight, 1.),
      py_utils.assert_greater_equal(bias_weight, 0.),
  ]
  mixed_probs = py_utils.with_dependencies(
      range_checks,
      (1.0 - bias_weight) * pred_probs + bias_weight * label_probs)
  return tf.log(mixed_probs), consistent
def testScaleGradientsInf(self):
  """ScaleGradients with an infinite gradient and check_numerics disabled."""
  FLAGS.enable_check_numerics = False
  params = self.TestParams()
  params.input = base_input_generator.BaseSequenceInputGenerator.Params()
  task = params.Instantiate()
  scalar_zero = py_utils.WeightParams(
      shape=[], init=py_utils.WeightInit.Constant(0))
  task.CreateVariable('a', scalar_zero)
  var_a = task.theta.a
  # log(0) is -inf, i.e. an infinite gradient.
  inf_grad = tf.log(0.)
  var_grads = py_utils.NestedMap(a=py_utils.VarGrad(var_a, inf_grad))
  scaled = task.learners[0].ScaleGradients(var_grads)

  with self.session():
    tf.global_variables_initializer().run()
    self.assertEqual(0., scaled.grad_scale.eval())
    # The final gradient must be finite.
    final_grad = scaled.final_var_grads.a[1]
    self.assertFalse(tf.is_nan(final_grad).eval())
    self.assertTrue(tf.is_finite(final_grad).eval())
def _FPropChunk(self, theta, pcm_audio_chunk, pcm_audio_paddings):
  """Computes normalized log-mel features for one chunk of audio.

  The chunk is framed, optionally pre-emphasized, optionally perturbed with
  Gaussian noise, windowed, converted with self._MelSpectrogram, floored at
  1.0 before the log, and finally normalized with p.per_bin_mean and
  p.per_bin_stddev.

  Args:
    theta: Unused by this method.
    pcm_audio_chunk: Audio samples; cast to tf.float32 before framing.
    pcm_audio_paddings: Paddings for the audio, converted with
      self._GetMelPadding.

  Returns:
    A (mel_spectrogram_norm, mel_paddings) tuple.
  """
  p = self.params
  pcm_audio_chunk = tf.cast(pcm_audio_chunk, tf.float32)
  # shape: [batch, time, _frame_size]
  framed_signal = tf.signal.frame(pcm_audio_chunk, self._frame_size,
                                  self._frame_step, p.pad_end)

  # Pre-emphasis.
  if p.preemph != 1.0:
    preemphasized = self._ApplyPreemphasis(framed_signal)
  else:
    # Drop the last sample of every frame so the frame width becomes
    # _frame_size - 1, the size the window below is built for. Slicing with a
    # bare [:-1] would cut the leading (batch) axis instead and silently
    # discard the last example.
    # NOTE(review): presumably _ApplyPreemphasis also yields frames of width
    # _frame_size - 1 -- confirm against its definition.
    preemphasized = framed_signal[..., :-1]

  # Noise.
  if p.noise_scale > 0.0:
    noise_signal = tf.random_normal(
        tf.shape(preemphasized),
        stddev=p.noise_scale,
        mean=0.0,
        seed=p.random_seed)
  else:
    noise_signal = 0.0

  # Apply window fn.
  windowed_signal = preemphasized + noise_signal
  if self._window_fn is not None:
    window = self._window_fn(self._frame_size - 1, framed_signal.dtype)
    windowed_signal *= window

  mel_spectrogram = self._MelSpectrogram(windowed_signal)

  # Floor the spectrogram so the log output is never negative.
  output_floor = 1.0
  mel_spectrogram_log = tf.log(
      tf.maximum(float(output_floor), mel_spectrogram))

  # Mean and stddev.
  mel_spectrogram_norm = (
      (mel_spectrogram_log - tf.convert_to_tensor(p.per_bin_mean)) /
      tf.convert_to_tensor(p.per_bin_stddev))
  return mel_spectrogram_norm, self._GetMelPadding(pcm_audio_paddings)
def LocalizationResiduals(self, anchor_bboxes, assigned_gt_bboxes):
  """Returns the regression targets of every anchor w.r.t. its assigned bbox.

  With ``anchor_bbox = (x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a)``,
  ``assigned_gt_bbox = (x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt)`` and
  ``diagonal_xy = sqrt(dx_a^2 + dy_a^2)``, the residuals are::

    x_residual = (x_gt - x_a) / (diagonal_xy)
    y_residual = (y_gt - y_a) / (diagonal_xy)
    z_residual = (z_gt - z_a) / (dz_a)

    dx_residual = log(dx_gt / dx_a)
    dy_residual = log(dy_gt / dy_a)
    dz_residual = log(dz_gt / dz_a)

    phi_residual = phi_gt - phi_a

  Normalizing the x and y residuals by the diagonal was first proposed by
  [1]. Intuitively, objects can usually move freely in the x-y plane,
  including diagonally, whereas motion along the z-axis (up and down) can be
  considered orthogonal to x-y.

  For phi_residual, one way to frame the loss is
  SmoothL1(sine(phi_residual - phi_predicted)). Wrapping the phi residual
  with sine was proposed by [2]: bboxes at phi and phi + pi are the same
  bbox, fully overlapping in 3D space, and differ only in direction. With a
  symmetric loss like SmoothL1 the sine makes this residual invariant to
  direction.

  In ResidualsToBBoxes, we ensure that the phi predicted is between [0, pi).

  The Huber (SmoothL1) loss can then be applied to the delta between these
  target residuals and the model predicted residuals.

  [1] VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection
      https://arxiv.org/abs/1711.06396

  [2] SECOND: Sparsely Embedded Convolutional Detection
      https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf

  Args:
    anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
      phi), corresponding to each anchor bbox parameters.
    assigned_gt_bboxes: tf.float32 of the same shape as anchor_bboxes
      containing the corresponding assigned ground-truth bboxes.

  Returns:
    A tf.float32 tensor of the same shape as anchor_bboxes with target
    residuals for every corresponding bbox.
  """
  bboxes_shape = py_utils.GetShape(anchor_bboxes)
  last_dim_is_7 = py_utils.assert_equal(bboxes_shape[-1], 7)
  anchor_bboxes = py_utils.with_dependencies([last_dim_is_7], anchor_bboxes)
  assigned_gt_bboxes = py_utils.HasShape(assigned_gt_bboxes, bboxes_shape)

  x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(
      anchor_bboxes, num=7, axis=-1)
  x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt = tf.unstack(
      assigned_gt_bboxes, num=7, axis=-1)

  diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

  # Anchor dimensions are usually hard-coded params given to the input
  # generator and should never be 0; CheckNumerics enforces that here.
  def _ScaledOffset(gt_center, anchor_center, scale):
    return py_utils.CheckNumerics((gt_center - anchor_center) / scale)

  def _LogRatio(gt_size, anchor_size):
    return py_utils.CheckNumerics(tf.log(gt_size / anchor_size))

  residuals = [
      _ScaledOffset(x_gt, x_a, diagonal_xy),
      _ScaledOffset(y_gt, y_a, diagonal_xy),
      _ScaledOffset(z_gt, z_a, dz_a),
      _LogRatio(dx_gt, dx_a),
      _LogRatio(dy_gt, dy_a),
      _LogRatio(dz_gt, dz_a),
      phi_gt - phi_a,
  ]
  return tf.stack(residuals, axis=-1)
def AddAttentionSummaryBatchMajor(attention_tensors,
                                  src_paddings,
                                  tgt_paddings,
                                  transcripts=None,
                                  max_outputs=3):
  """Adds an image summary showing the attention probability matrix and state.

  As opposed to AddAttentionSummary() takes all tensors with batch dimension in
  axis 0.

  Besides the plots, one scalar summary per attention tensor reports the mean
  of the entropy terms -p * log(p), normalized by log(source length).

  Args:
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for
      the source sequence. Or a list of tensors of the same length as
      attention_tensors with a separate paddings for each entry in
      attention_tensors.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for
      the target sequence. Or a list of tensors of the same length as
      attention_tensors with a separate paddings for each entry in
      attention_tensors.
    transcripts: Optional, transcripts shaped [batch, source_len] for the
      source sequence. Only attached to the first subplot.
    max_outputs: Integer maximum number of elements of the batch to plot.

  Raises:
    ValueError: If a paddings list has a length other than 1 or
      len(attention_tensors).
  """

  def VerifyLen(paddings):
    length = len(paddings) if isinstance(paddings, list) else 1
    if length not in (1, len(attention_tensors)):
      raise ValueError('Bad length of paddings list {}'.format(length))

  VerifyLen(src_paddings)
  VerifyLen(tgt_paddings)

  name = attention_tensors[0].name + '/Attention'
  if not _ShouldAddSummary():
    return

  def ToLengths(paddings):
    paddings = paddings if isinstance(paddings, list) else [paddings]
    return [SequenceLength(p) for p in paddings]

  def Get(lengths, i):
    # A single entry is shared by every attention tensor.
    return lengths[0 if len(lengths) == 1 else i]

  src_lens = ToLengths(src_paddings)
  tgt_lens = ToLengths(tgt_paddings)

  with plot.MatplotlibFigureSummary(
      name, max_outputs=max_outputs, gridspec_kwargs={'hspace': 0.3}) as fig:
    for n, atten in enumerate(attention_tensors):
      # Diagnostic metric that decreases as attention picks up.
      max_entropy = tf.log(tf.cast(Get(src_lens, n), tf.float32))
      # log(1) == 0: a length-1 source would turn the ratio below into 0/0 and
      # make the whole averaged summary NaN. Its entropy is 0 anyway, so any
      # non-zero divisor gives the right answer; use 1.
      max_entropy = tf.where(
          tf.equal(max_entropy, 0.0), tf.ones_like(max_entropy), max_entropy)
      # [batch] -> [batch, 1, 1] so it broadcasts against the attention
      # tensor.
      max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)
      atten_normalized_entropy = -atten * tf.log(atten + 1e-10) / max_entropy
      scalar('Attention/average_normalized_entropy/%d' % n,
             tf.reduce_mean(atten_normalized_entropy))
      args = [atten, Get(src_lens, n), Get(tgt_lens, n)]
      if transcripts is not None and n == 0:
        args.append(transcripts)
      fig.AddSubplot(
          args,
          TrimPaddingAndPlotAttention,
          title=atten.name,
          xlabel='Input',
          ylabel='Output')
def PreBeamSearchStepCallback(theta, encoder_outputs, step_ids, states,
                              num_hyps_per_beam, *args, **kwargs):
  """Wrapper for adding bias to _PreBeamSearchStepCallback.

  Biases results.log_probs towards provided encoder_outputs.targets. `self`
  is taken from the enclosing scope.

  Args:
    theta: a NestedMap of parameters.
    encoder_outputs: a NestedMap computed by encoder.
    step_ids: A tensor of shape [tgt_batch, 1].
    states: A `.NestedMap` of tensors representing states that the clients
      would like to keep track of for each of the active hyps.
    num_hyps_per_beam: Beam size.
    *args: additional arguments to _PreBeamSearchStepCallback.
    **kwargs: additional arguments to _PreBeamSearchStepCallback.

  Returns:
    A tuple (results, out_states).
    results: A `.NestedMap` of beam search results.
      atten_probs:
        The updated attention probs, of shape [tgt_batch, src_len].
      log_probs:
        Log prob for each of the tokens in the target vocab. This is of
        shape [tgt_batch, vocab_size].
    out_states: a `.NestedMap` The updated states. The states relevant here
      are:
      time_step: A scalar indicating current step of decoder. Must be
        provided and maintained by subclass.
      consistent: A boolean vector of shape [tgt_batch, ] which tracks
        whether each hypothesis has exactly matched
        encoder_outputs.targets so far.
  """
  p = self.params
  time_step = states.time_step
  bs_results, out_states = self._PreBeamSearchStepCallback(
      theta, encoder_outputs, step_ids, states, num_hyps_per_beam, *args,
      **kwargs)
  targets = encoder_outputs.targets
  labels = targets.labels
  weights = targets.weights

  def _ExpandToHyps(per_source):
    # [src_batch] -> [1, src_batch] -> [num_hyps_per_beam, src_batch].
    tiled = tf.tile(tf.reshape(per_source, [1, -1]), [num_hyps_per_beam, 1])
    # Flatten to [tgt_batch], where tgt_batch = num_hyps_per_beam*src_batch.
    return tf.reshape(tiled, [tf.shape(step_ids)[0]])

  # A hyp stays consistent if step_ids == labels from the previous step.
  # TODO(navari): Consider updating consistent only if weights > 0. Then
  # re-evaluate the need for bias_only_if_consistent=True.
  # Note that the previous label is incorrect for step 0, but the step-0 check
  # below overrides the comparison in that case.
  prev_step = tf.maximum(time_step - 1, 0)
  prev_label = _ExpandToHyps(tf.gather(labels, prev_step, axis=1))
  at_first_step = tf.equal(time_step, 0)
  agrees_with_prev = tf.equal(prev_label, tf.squeeze(step_ids, 1))
  out_states.consistent = tf.logical_and(
      states.consistent, tf.logical_or(at_first_step, agrees_with_prev))

  # Label and weight slices corresponding to the current time_step.
  cur_label = _ExpandToHyps(tf.gather(labels, time_step, axis=1))
  bias_weight = _ExpandToHyps(tf.gather(weights, time_step, axis=1))
  if p.bias_only_if_consistent:
    bias_weight = bias_weight * tf.cast(out_states.consistent, p.dtype)

  # Convert the dense labels to (almost) one-hot label probs. The small
  # uncertainty avoids 0 probs which may cause issues with log.
  vocab_size = tf.shape(bs_results.log_probs)[1]
  uncertainty = tf.constant(1e-10, p.dtype)
  label_probs = tf.one_hot(
      cur_label,
      vocab_size,
      on_value=1 - uncertainty,
      off_value=uncertainty / tf.cast(vocab_size - 1, p.dtype),
      dtype=p.dtype)  # [tgt_batch, vocab_size]
  pred_probs = tf.exp(bs_results.log_probs)

  # Interpolate predicted probs and label probs; the weight must be in [0, 1].
  bias_weight = tf.expand_dims(bias_weight, 1)
  weight_in_range = [
      py_utils.assert_less_equal(bias_weight, 1.),
      py_utils.assert_greater_equal(bias_weight, 0.),
  ]
  blended = py_utils.with_dependencies(
      weight_in_range,
      (1.0 - bias_weight) * pred_probs + bias_weight * label_probs)
  bs_results.log_probs = tf.log(blended)
  return bs_results, out_states