Example #1
def DistanceBetweenCentroidsAndBBoxesFastAndFurious(centroids, bboxes, masks):
    """Computes the distance between centroids and bboxes.

  The distance/loss loosely follows the 'Fast and Furious' paper by Luo et
  al., CVPR'18. This is just one way of calculating the distance; other
  formulations may be developed later.

  Args:
    centroids: [..., 4]. x/y/w/h for bboxes.
    bboxes: [..., 4]. ymin/xmin/ymax/xmax for bboxes.
    masks: [...]. masks[i] == 1 means i-th entry (centroids[i] and bboxes[i])
      should be considered in the distance/loss calculation.

  Returns:
    A [...] tensor. i-th value is the distance measure of centroids[i] and
    bboxes[i].
  """
    x, y, w, h = tf.unstack(centroids, axis=-1, num=4)
    # "gt" suffix means 'ground truth'.
    x_gt, y_gt, w_gt, h_gt = tf.unstack(BBoxesToXYWH(bboxes), axis=-1, num=4)

    def Pos(x):
        return tf.maximum(tf.constant(1e-8, x.dtype), x)

    # The following terms are zeros when masks[i] is 0.
    l_x = py_utils.CheckNumerics(masks * (x - x_gt) / Pos(w_gt))
    l_y = py_utils.CheckNumerics(masks * (y - y_gt) / Pos(h_gt))
    s_w = py_utils.CheckNumerics(masks * tf.log(Pos(w) / Pos(w_gt)))
    s_h = py_utils.CheckNumerics(masks * tf.log(Pos(h) / Pos(h_gt)))
    return (_SmoothL1Norm(l_x) + _SmoothL1Norm(l_y) + _SmoothL1Norm(s_w) +
            _SmoothL1Norm(s_h))
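
For readers without the surrounding codebase: `BBoxesToXYWH`, `_SmoothL1Norm`, and `py_utils.CheckNumerics` are repo helpers. Below is a minimal, self-contained sketch of the same distance using stock TensorFlow ops, assuming the usual Huber definition of smooth L1 (the repo's `_SmoothL1Norm` may differ in detail):

import tensorflow as tf


def SmoothL1(x):
    # One common smooth-L1 (Huber) form: 0.5 * x^2 for |x| < 1, else |x| - 0.5.
    abs_x = tf.abs(x)
    return tf.where(abs_x < 1.0, 0.5 * tf.square(x), abs_x - 0.5)


def CentroidBBoxDistance(centroids, bboxes, masks, eps=1e-8):
    x, y, w, h = tf.unstack(centroids, axis=-1, num=4)
    ymin, xmin, ymax, xmax = tf.unstack(bboxes, axis=-1, num=4)
    # Equivalent of BBoxesToXYWH: center coordinates plus width/height.
    x_gt, y_gt = (xmin + xmax) / 2., (ymin + ymax) / 2.
    w_gt, h_gt = xmax - xmin, ymax - ymin
    l_x = masks * (x - x_gt) / tf.maximum(eps, w_gt)
    l_y = masks * (y - y_gt) / tf.maximum(eps, h_gt)
    s_w = masks * tf.math.log(tf.maximum(eps, w) / tf.maximum(eps, w_gt))
    s_h = masks * tf.math.log(tf.maximum(eps, h) / tf.maximum(eps, h_gt))
    return SmoothL1(l_x) + SmoothL1(l_y) + SmoothL1(s_w) + SmoothL1(s_h)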
Example #2
    def _testHelper(self, base_frnn_p, frnn_p, packed_input=False):
        inputs, padding, m0, c0, segment_id = self._GetTestInputs(packed_input)
        base_frnn = base_frnn_p.Instantiate()
        frnn = frnn_p.Instantiate()

        with self.session() as sess:
            tf.global_variables_initializer().run()

            state0 = py_utils.NestedMap(m=m0, c=c0)
            act, state = base_frnn.FPropDefaultTheta(inputs,
                                                     padding,
                                                     state0=state0,
                                                     segment_id=segment_id)
            # Compute grads
            loss = -tf.log(
                tf.sigmoid((tf.reduce_sum(tf.math.square(act)) +
                            tf.reduce_sum(state.m * state.c * state.c))))
            grads = tf.gradients(loss, base_frnn.vars.Flatten())

            expected_act, expected_state, expected_grads = sess.run(
                [act, state, grads])

            act, state = frnn.FPropDefaultTheta(inputs,
                                                padding,
                                                state0=state0,
                                                segment_id=segment_id)
            # Compute grads
            loss = -tf.log(
                tf.sigmoid((tf.reduce_sum(tf.math.square(act)) +
                            tf.reduce_sum(state.m * state.c * state.c))))
            grads = tf.gradients(loss, frnn.vars.Flatten())

            actual_act, actual_state, actual_grads = sess.run(
                [act, state, grads])

        tf.logging.info('expected_act:{}'.format(expected_act))
        tf.logging.info('actual_act:{}'.format(actual_act))

        tf.logging.info('expected_state:{}'.format(expected_state))
        tf.logging.info('actual_state:{}'.format(actual_state))

        tf.logging.info('expected_grads:{}'.format(expected_grads))
        tf.logging.info('actual_grads:{}'.format(actual_grads))

        self.assertAllClose(expected_act, actual_act)
        self.assertAllClose(expected_state.m, actual_state.m)
        self.assertAllClose(expected_state.c, actual_state.c)
        for (vname, _), expected, actual in zip(frnn.vars.FlattenItems(),
                                                expected_grads, actual_grads):
            self.assertAllClose(expected, actual, msg=vname)
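
A side note on the loss used above: -log(sigmoid(z)) is mathematically equal to softplus(-z), and the softplus form is the numerically stable way to evaluate it. A quick sketch:

import tensorflow as tf

z = tf.constant([-200.0, 0.0, 200.0])
naive = -tf.math.log(tf.sigmoid(z))  # inf at z = -200: sigmoid underflows to 0.
stable = tf.nn.softplus(-z)          # [200., 0.693..., 0.]; no overflow.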
Example #3
    def testScaleGradientsCheckNumerics(self):
        """ScaleGradients when enable_check_numerics=True."""
        FLAGS.enable_check_numerics = True
        p = self.TestParams()
        p.input = base_input_generator.BaseSequenceInputGenerator.Params()
        task = p.Instantiate()
        task.CreateVariable(
            'a',
            py_utils.WeightParams(shape=[],
                                  init=py_utils.WeightInit.Constant(0)))
        var_a = task.theta.a
        # Make a NaN gradient.
        var_grads = py_utils.NestedMap(a=(var_a, 0. * tf.log(0.)))
        scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

        with self.session():
            tf.global_variables_initializer().run()
            # With enable_check_numerics=True, evaluating the NaN gradient
            # raises via the CheckNumerics ops inserted into the graph.
            with self.assertRaisesRegexp(tf.errors.InvalidArgumentError,
                                         'is not finite'):
                self.assertTrue(scaled_grads_map.has_nan_or_inf.eval())
                self.assertEqual(0., scaled_grads_map.grad_scale.eval())
                # The final gradient must be finite.
                self.assertFalse(
                    tf.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
                self.assertTrue(
                    tf.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
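
The test manufactures its NaN with `0. * tf.log(0.)`: under IEEE-754 semantics log(0) is -inf, and 0 * -inf is NaN. The same thing in plain numpy:

import numpy as np

with np.errstate(divide='ignore', invalid='ignore'):
    print(np.log(0.0))        # -inf
    print(0.0 * np.log(0.0))  # nan, the value CheckNumerics trips on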
Example #4
def AddAttentionSummaryBatchMajor(attention_tensors,
                                  src_paddings,
                                  tgt_paddings,
                                  transcripts=None,
                                  max_outputs=3):
    """Adds an image summary showing the attention probability matrix and state.

  Unlike AddAttentionSummary(), this function takes all tensors with the batch
  dimension in axis 0.

  Args:
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for the
      source sequence.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for the
      target sequence.
    transcripts: Optional, transcripts shaped [batch, source_len] for the source
      sequence.
    max_outputs: Integer maximum number of elements of the batch to plot.
  """
    name = attention_tensors[0].name + '/Attention'
    if not _ShouldAddSummary():
        return
    with plot.MatplotlibFigureSummary(name, max_outputs=max_outputs) as fig:
        src_lens = SequenceLength(src_paddings)
        tgt_lens = SequenceLength(tgt_paddings)
        for n, atten in enumerate(attention_tensors):
            # Diagnostic metric that decreases as attention picks up.
            max_entropy = tf.log(tf.cast(src_lens, tf.float32))
            max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)
            atten_normalized_entropy = -atten * tf.log(atten +
                                                       1e-10) / max_entropy
            scalar('Attention/average_normalized_entropy/%d' % n,
                   tf.reduce_mean(atten_normalized_entropy))
            args = [atten, src_lens, tgt_lens]
            if transcripts is not None and n == 0:
                args.append(transcripts)
            fig.AddSubplot(args,
                           TrimPaddingAndPlotAttention,
                           title=atten.name,
                           xlabel='Input',
                           ylabel='Output')
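
The entropy normalization above divides by log(src_len), the entropy of a uniform distribution over the source, so the metric is near 1 when attention is uniform and falls toward 0 as attention sharpens. A numpy sketch of the per-row version of this metric (the summary above averages the elementwise terms instead; names here are illustrative):

import numpy as np


def NormalizedAttentionEntropy(atten, src_len):
    # Row entropy divided by log(src_len), the maximum possible entropy.
    entropy = -np.sum(atten * np.log(atten + 1e-10), axis=-1)
    return entropy / np.log(src_len)


uniform = np.full((1, 4), 0.25)
peaked = np.array([[0.97, 0.01, 0.01, 0.01]])
print(NormalizedAttentionEntropy(uniform, 4))  # ~1.0
print(NormalizedAttentionEntropy(peaked, 4))   # ~0.12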
Example #5
            def ApplyBias():
                """Bias and update log_probs and consistent."""
                def TileForBeamAndFlatten(tensor):
                    tensor = tf.reshape(tensor, [1, -1])  # [1, src_batch]
                    # [num_hyps_per_beam, src_batch]
                    tensor = tf.tile(tensor, [num_hyps_per_beam, 1])
                    # tgt_batch = num_hyps_per_beam * src_batch
                    tgt_batch = tf.shape(step_ids)[0]
                    return tf.reshape(tensor, [tgt_batch])

                # Consistent if step_ids == labels from the previous step.
                # TODO(navari): Consider updating consistent only if weights > 0. Then
                # re-evaluate the need for bias_only_if_consistent=True.
                # Note that prev_label is incorrect for step 0 but is overridden later.
                prev_label = TileForBeamAndFlatten(
                    tf.gather(labels, tf.maximum(time_step - 1, 0), axis=1))
                is_step0 = tf.equal(time_step, 0)
                local_consistence = tf.logical_or(
                    is_step0, tf.equal(prev_label, tf.squeeze(step_ids, 1)))
                consistent = tf.logical_and(states.consistent,
                                            local_consistence)

                # Get the label and weight slices for the current time_step.
                label = TileForBeamAndFlatten(
                    tf.gather(labels, time_step, axis=1))
                weight = TileForBeamAndFlatten(
                    tf.gather(weights, time_step, axis=1))
                if p.bias_only_if_consistent:
                    weight = weight * tf.cast(consistent, p.dtype)

                # Convert the dense label to smoothed one-hot label probs.
                vocab_size = tf.shape(bs_results.log_probs)[1]
                # Avoid zero probs, which may cause issues with the log below.
                uncertainty = tf.constant(1e-10, p.dtype)
                label_probs = tf.one_hot(
                    label,
                    vocab_size,
                    on_value=1 - uncertainty,
                    off_value=uncertainty / tf.cast(vocab_size - 1, p.dtype),
                    dtype=p.dtype)  # [tgt_batch, vocab_size]
                pred_probs = tf.exp(bs_results.log_probs)

                # interpolate predicted probs and label probs
                weight = tf.expand_dims(weight, 1)
                probs = py_utils.with_dependencies([
                    py_utils.assert_less_equal(weight, 1.),
                    py_utils.assert_greater_equal(weight, 0.)
                ], (1.0 - weight) * pred_probs + weight * label_probs)
                return tf.log(probs), consistent
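
The one-hot smoothing and interpolation at the end are easy to check in isolation. A minimal sketch with illustrative values: each row of the result remains a valid distribution, so taking the log afterwards is safe.

import tensorflow as tf

vocab_size = 5
uncertainty = 1e-10
label = tf.constant([2])
# Smoothed one-hot: the label gets 1 - eps; the rest share eps equally.
label_probs = tf.one_hot(
    label, vocab_size,
    on_value=1.0 - uncertainty,
    off_value=uncertainty / (vocab_size - 1))
pred_probs = tf.fill([1, vocab_size], 1.0 / vocab_size)
weight = tf.constant([[0.5]])
# Convex combination: rows still sum to 1 for any weight in [0, 1].
probs = (1.0 - weight) * pred_probs + weight * label_probs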
Example #6
  def testScaleGradientsInf(self):
    FLAGS.enable_check_numerics = False
    p = self.TestParams()
    p.input = base_input_generator.BaseSequenceInputGenerator.Params()
    task = p.Instantiate()
    task.CreateVariable(
        'a',
        py_utils.WeightParams(shape=[], init=py_utils.WeightInit.Constant(0)))
    var_a = task.theta.a
    # Infinite gradient.
    var_grads = py_utils.NestedMap(a=py_utils.VarGrad(var_a, tf.log(0.)))
    scaled_grads_map = task.learners[0].ScaleGradients(var_grads)

    with self.session():
      tf.global_variables_initializer().run()
      self.assertEqual(0., scaled_grads_map.grad_scale.eval())
      # The final gradient must be finite.
      self.assertFalse(tf.is_nan(scaled_grads_map.final_var_grads.a[1]).eval())
      self.assertTrue(
          tf.is_finite(scaled_grads_map.final_var_grads.a[1]).eval())
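
One way to realize the behavior this test asserts; this is a sketch only, not necessarily the library's exact implementation. When any gradient is non-finite, the scale drops to 0 and the offending gradients are replaced rather than multiplied, since inf * 0 would itself produce NaN:

import numpy as np


def ScaleGradients(grads):
    all_finite = all(np.all(np.isfinite(g)) for g in grads)
    grad_scale = 1.0 if all_finite else 0.0
    # Replace rather than multiply: inf * 0.0 is NaN under IEEE-754.
    final = grads if all_finite else [np.zeros_like(g) for g in grads]
    return grad_scale, final


scale, final = ScaleGradients([np.array(np.inf)])
assert scale == 0.0 and np.isfinite(final[0]).all()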
Example #7
  def _FPropChunk(self, theta, pcm_audio_chunk, pcm_audio_paddings):
    p = self.params
    pcm_audio_chunk = tf.cast(pcm_audio_chunk, tf.float32)
    # shape: [batch, time, _frame_size]
    framed_signal = tf.signal.frame(pcm_audio_chunk, self._frame_size,
                                    self._frame_step, p.pad_end)

    # Pre-emphasis.
    if p.preemph != 1.0:
      preemphasized = self._ApplyPreemphasis(framed_signal)
    else:
      # Drop the last sample per frame to match the pre-emphasis output length.
      preemphasized = framed_signal[:, :, :-1]

    # Noise.
    if p.noise_scale > 0.0:
      noise_signal = tf.random_normal(
          tf.shape(preemphasized),
          stddev=p.noise_scale,
          mean=0.0,
          seed=p.random_seed)
    else:
      noise_signal = 0.0

    # Apply window fn.
    windowed_signal = preemphasized + noise_signal
    if self._window_fn is not None:
      window = self._window_fn(self._frame_size - 1, framed_signal.dtype)
      windowed_signal *= window

    mel_spectrogram = self._MelSpectrogram(windowed_signal)

    output_floor = 1.0
    mel_spectrogram_log = tf.log(
        tf.maximum(float(output_floor), mel_spectrogram))

    # Mean and stddev.
    mel_spectrogram_norm = (
        (mel_spectrogram_log - tf.convert_to_tensor(p.per_bin_mean)) /
        tf.convert_to_tensor(p.per_bin_stddev))
    return mel_spectrogram_norm, self._GetMelPadding(pcm_audio_paddings)
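
For comparison, here is a self-contained log-mel frontend built from stock tf.signal ops. Parameter values are illustrative, and the class above uses its own framing, pre-emphasis, and windowing helpers rather than tf.signal.stft:

import tensorflow as tf


def LogMel(pcm, sample_rate=16000, frame_length=400, frame_step=160,
           fft_length=512, num_mel_bins=80, output_floor=1.0):
    # pcm: [batch, samples] float32 waveform.
    stft = tf.signal.stft(pcm, frame_length, frame_step, fft_length)
    power = tf.square(tf.abs(stft))  # [batch, frames, fft_length // 2 + 1]
    mel_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, fft_length // 2 + 1, sample_rate)
    mel = tf.tensordot(power, mel_matrix, 1)  # [batch, frames, num_mel_bins]
    # Floor before the log, mirroring tf.maximum(output_floor, ...) above.
    return tf.math.log(tf.maximum(output_floor, mel))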
Example #8
    def LocalizationResiduals(self, anchor_bboxes, assigned_gt_bboxes):
        """Computes the anchor residuals for every bbox.

    For a given bbox, compute residuals in the following way:

      Let ``anchor_bbox = (x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a)``
      and ``assigned_gt_bbox = (x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt)``

      Define ``diagonal_xy = sqrt(dx_a^2 + dy_a^2)``

      Then the corresponding residuals are given by::

        x_residual = (x_gt - x_a) / (diagonal_xy)
        y_residual = (y_gt - y_a) / (diagonal_xy)
        z_residual = (z_gt - z_a) / (dz_a)

        dx_residual = log(dx_gt / dx_a)
        dy_residual = log(dy_gt / dy_a)
        dz_residual = log(dz_gt / dz_a)

        phi_residual = phi_gt - phi_a

      The normalization for x and y residuals by the diagonal was first
      proposed by [1]. Intuitively, this reflects that objects can usually
      move freely in the x-y plane, including diagonally. On the other hand,
      moving in the z-axis (up and down) can be considered orthogonal to x-y.

      For phi_residual, one way to frame the loss is with
      SmoothL1(sine(phi_residual - phi_predicted)).
      The use of sine to wrap the phi residual was proposed by [2]. This
      stems from the observation that bboxes at phi and phi + pi are the same
      bbox, fully overlapping in 3D space, except that the direction is
      different. Note that the use of sine makes this residual invariant to
      direction when a symmetric loss like SmoothL1 is used. In
      ResidualsToBBoxes, we ensure that the phi predicted is between [0, pi).

    The Huber (SmoothL1) loss can then be applied to the delta between these
    target residuals and the model predicted residuals.

    [1] VoxelNet: End-to-End Learning for Point Cloud Based 3D Object Detection
        https://arxiv.org/abs/1711.06396

    [2] SECOND: Sparsely Embedded Convolutional Detection
        https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf

    Args:
      anchor_bboxes: tf.float32. where [..., :7] contains (x, y, z, dx, dy, dz,
        phi), corresponding to each anchor bbox parameters.
      assigned_gt_bboxes: tf.float32 of the same shape as anchor_bboxes
        containing the corresponding assigned ground-truth bboxes.

    Returns:
      A tf.float32 tensor of the same shape as anchor_bboxes with target
      residuals for every corresponding bbox.
    """
        anchor_bboxes_shape = py_utils.GetShape(anchor_bboxes)
        anchor_bboxes = py_utils.with_dependencies(
            [py_utils.assert_equal(anchor_bboxes_shape[-1], 7)], anchor_bboxes)
        assigned_gt_bboxes = py_utils.HasShape(assigned_gt_bboxes,
                                               anchor_bboxes_shape)

        x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = tf.unstack(anchor_bboxes,
                                                            num=7,
                                                            axis=-1)
        x_gt, y_gt, z_gt, dx_gt, dy_gt, dz_gt, phi_gt = tf.unstack(
            assigned_gt_bboxes, num=7, axis=-1)

        diagonal_xy = tf.sqrt(tf.square(dx_a) + tf.square(dy_a))

        # The anchor dimensions are usually hard-coded params given to the input
        # generator and should not be 0. We use CheckNumerics to ensure that this
        # is the case.
        x_residual = py_utils.CheckNumerics((x_gt - x_a) / diagonal_xy)
        y_residual = py_utils.CheckNumerics((y_gt - y_a) / diagonal_xy)
        z_residual = py_utils.CheckNumerics((z_gt - z_a) / dz_a)

        dx_residual = py_utils.CheckNumerics(tf.log(dx_gt / dx_a))
        dy_residual = py_utils.CheckNumerics(tf.log(dy_gt / dy_a))
        dz_residual = py_utils.CheckNumerics(tf.log(dz_gt / dz_a))

        phi_residual = phi_gt - phi_a

        return tf.stack([
            x_residual,
            y_residual,
            z_residual,
            dx_residual,
            dy_residual,
            dz_residual,
            phi_residual,
        ], axis=-1)  # pyformat: disable
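
A numpy sketch checking that the residual encoding round-trips. The decode step mirrors what a ResidualsToBBoxes-style inverse would apply; the helper names here are illustrative:

import numpy as np


def EncodeResiduals(anchor, gt):
    x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = anchor
    x_g, y_g, z_g, dx_g, dy_g, dz_g, phi_g = gt
    diag = np.sqrt(dx_a**2 + dy_a**2)
    return np.array([
        (x_g - x_a) / diag, (y_g - y_a) / diag, (z_g - z_a) / dz_a,
        np.log(dx_g / dx_a), np.log(dy_g / dy_a), np.log(dz_g / dz_a),
        phi_g - phi_a])


def DecodeResiduals(anchor, res):
    # Inverse of the transform above.
    x_a, y_a, z_a, dx_a, dy_a, dz_a, phi_a = anchor
    diag = np.sqrt(dx_a**2 + dy_a**2)
    return np.array([
        x_a + res[0] * diag, y_a + res[1] * diag, z_a + res[2] * dz_a,
        dx_a * np.exp(res[3]), dy_a * np.exp(res[4]), dz_a * np.exp(res[5]),
        phi_a + res[6]])


anchor = np.array([0., 0., -1., 1.6, 3.9, 1.5, 0.])
gt = np.array([0.5, -0.2, -0.8, 1.7, 4.2, 1.6, 0.1])
assert np.allclose(DecodeResiduals(anchor, EncodeResiduals(anchor, gt)), gt)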
Example #9
def AddAttentionSummaryBatchMajor(attention_tensors,
                                  src_paddings,
                                  tgt_paddings,
                                  transcripts=None,
                                  max_outputs=3):
    """Adds an image summary showing the attention probability matrix and state.

  Unlike AddAttentionSummary(), this function takes all tensors with the batch
  dimension in axis 0.

  Args:
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for the
      source sequence. Or a list of tensors of the same length as
      attention_tensors with a separate paddings for each entry in
      attention_tensors.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for the
      target sequence. Or a list of tensors of the same length as
      attention_tensors with a separate paddings for each entry in
      attention_tensors.
    transcripts: Optional, transcripts shaped [batch, source_len] for the source
      sequence.
    max_outputs: Integer maximum number of elements of the batch to plot.
  """
    def VerifyLen(paddings):
        length = len(paddings) if isinstance(paddings, list) else 1
        if length != 1 and length != len(attention_tensors):
            raise ValueError('Bad length of paddings list {}'.format(length))

    VerifyLen(src_paddings)
    VerifyLen(tgt_paddings)

    name = attention_tensors[0].name + '/Attention'
    if not _ShouldAddSummary():
        return

    def ToLengths(paddings):
        paddings = paddings if isinstance(paddings, list) else [paddings]
        return [SequenceLength(p) for p in paddings]

    def Get(lengths, i):
        return lengths[0 if len(lengths) == 1 else i]

    src_lens = ToLengths(src_paddings)
    tgt_lens = ToLengths(tgt_paddings)

    with plot.MatplotlibFigureSummary(name,
                                      max_outputs=max_outputs,
                                      gridspec_kwargs={'hspace': 0.3}) as fig:
        for n, atten in enumerate(attention_tensors):
            # Diagnostic metric that decreases as attention picks up.
            max_entropy = tf.log(tf.cast(Get(src_lens, n), tf.float32))
            max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)
            atten_normalized_entropy = -atten * tf.log(atten +
                                                       1e-10) / max_entropy
            scalar('Attention/average_normalized_entropy/%d' % n,
                   tf.reduce_mean(atten_normalized_entropy))
            args = [atten, Get(src_lens, n), Get(tgt_lens, n)]
            if transcripts is not None and n == 0:
                args.append(transcripts)
            fig.AddSubplot(args,
                           TrimPaddingAndPlotAttention,
                           title=atten.name,
                           xlabel='Input',
                           ylabel='Output')
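
The only new logic relative to the earlier version is the "single tensor or one-per-attention-tensor list" convention for the paddings. The broadcast rule reduces to this pure-Python pattern:

def ToList(x):
    return x if isinstance(x, list) else [x]


def Get(items, i):
    # A length-1 list is shared across all indices.
    return items[0 if len(items) == 1 else i]


shared = ToList('pad')             # ['pad']
per_tensor = ToList(['p0', 'p1'])  # ['p0', 'p1']
assert Get(shared, 1) == 'pad' and Get(per_tensor, 1) == 'p1'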
Example #10
        def PreBeamSearchStepCallback(theta, encoder_outputs, step_ids, states,
                                      num_hyps_per_beam, *args, **kwargs):
            """Wrapper for adding bias to _PreBeamSearchStateCallback.

      Biases results.log_probs towards provided encoder_outputs.targets.

      Args:
        theta: a NestedMap of parameters.
        encoder_outputs: a NestedMap computed by encoder.
        step_ids: A tensor of shape [tgt_batch, 1].
        states: A `.NestedMap` of tensors representing states that the clients
          would like to keep track of for each of the active hyps.
        num_hyps_per_beam: Beam size.
        *args: additional arguments to _PreBeamSearchStepCallback.
        **kwargs: additional arguments to _PreBeamSearchStepCallback.

      Returns:
        A tuple (results, out_states).
        results: A `.NestedMap` of beam search results.
          atten_probs: The updated attention probs, of shape
            [tgt_batch, src_len].
          log_probs: Log prob for each of the tokens in the target vocab, of
            shape [tgt_batch, vocab_size].
        out_states: A `.NestedMap` of updated states. The states relevant here
          are:
          time_step: A scalar indicating the current decoder step. Must be
            provided and maintained by the subclass.
          consistent: A boolean vector of shape [tgt_batch] which tracks
            whether each hypothesis has exactly matched encoder_outputs.targets
            so far.
      """
            p = self.params
            time_step = states.time_step
            bs_results, out_states = self._PreBeamSearchStepCallback(
                theta, encoder_outputs, step_ids, states, num_hyps_per_beam,
                *args, **kwargs)

            labels = encoder_outputs.targets.labels
            weights = encoder_outputs.targets.weights

            def TileForBeamAndFlatten(tensor):
                tensor = tf.reshape(tensor, [1, -1])  # [1, src_batch]
                # [num_hyps_per_beam, src_batch]
                tensor = tf.tile(tensor, [num_hyps_per_beam, 1])
                # tgt_batch = num_hyps_per_beam * src_batch
                tgt_batch = tf.shape(step_ids)[0]
                return tf.reshape(tensor, [tgt_batch])

            # Consistent if step_ids == labels from the previous step.
            # TODO(navari): Consider updating consistent only if weights > 0. Then
            # re-evaluate the need for bias_only_if_consistent=True.
            # Note that prev_label is incorrect for step 0 but is overridden later.
            prev_label = TileForBeamAndFlatten(
                tf.gather(labels, tf.maximum(time_step - 1, 0), axis=1))
            is_step0 = tf.equal(time_step, 0)
            local_consistence = tf.logical_or(
                is_step0, tf.equal(prev_label, tf.squeeze(step_ids, 1)))
            out_states.consistent = tf.logical_and(states.consistent,
                                                   local_consistence)

            # Get the label and weight slices for the current time_step.
            label = TileForBeamAndFlatten(tf.gather(labels, time_step, axis=1))
            weight = TileForBeamAndFlatten(
                tf.gather(weights, time_step, axis=1))
            if p.bias_only_if_consistent:
                weight = weight * tf.cast(out_states.consistent, p.dtype)

            # Convert the dense label to smoothed one-hot label probs.
            vocab_size = tf.shape(bs_results.log_probs)[1]
            # Avoid zero probs, which may cause issues with the log below.
            uncertainty = tf.constant(1e-10, p.dtype)
            label_probs = tf.one_hot(label,
                                     vocab_size,
                                     on_value=1 - uncertainty,
                                     off_value=uncertainty /
                                     tf.cast(vocab_size - 1, p.dtype),
                                     dtype=p.dtype)  # [tgt_batch, vocab_size]
            pred_probs = tf.exp(bs_results.log_probs)

            # interpolate predicted probs and label probs
            weight = tf.expand_dims(weight, 1)
            probs = py_utils.with_dependencies([
                py_utils.assert_less_equal(weight, 1.),
                py_utils.assert_greater_equal(weight, 0.)
            ], (1.0 - weight) * pred_probs + weight * label_probs)

            bs_results.log_probs = tf.log(probs)

            return bs_results, out_states
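
TileForBeamAndFlatten produces the hyp-major layout beam search expects: the whole source batch repeats once per hypothesis, so the copies of a given batch element are strided rather than adjacent. In numpy:

import numpy as np

num_hyps_per_beam, src_batch = 3, 2
t = np.array([10, 20])  # [src_batch]
tiled = np.tile(t.reshape(1, -1), (num_hyps_per_beam, 1))
flat = tiled.reshape(num_hyps_per_beam * src_batch)
print(flat)  # [10 20 10 20 10 20]: batch element 0 sits at indices 0, 2, 4.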