Example #1
  def _GetWarpMatrix(self,
    """Returns warp matrices starting from random positions.

    In this function when max_warp_frames != None:
      1) Sample random warp displacements from the interval
         [-max_warp_frames, max_warp_frames) to yield shift tensor
         with shape (batch_size,).
      2) Truncate lengths to a maximum magnitude of (choose_range * max_ratio),
         so that each shift is fully contained within the
         corresponding sequence.
      3) Random sample origin points of shape (batch_size, multiplicity)
         with in [shift, choose_range - shift).
      4) Return a batch of 1-D linear maps that fix the boundary points and
         shift the origin point by the shift.

    When max_warp_frames == None:
      1) Sample random warp displacements with magnitudes less than
         (choose_range * max_ratio) to yield shift tensor with
         shape (batch_size,).
      2) Proceed through steps 3), 4).

      batch_size: Batch size. Integer number.
      choose_range: Range within which the warp reference points must lie.
        Tensor of shape (batch_size,).
      matrix_size: Dimension of vector space warp matrix is applied to. Integer
      global_seed: an integer seed tensor for stateless random ops.
      max_warp_frames: Upper-bound on the warp distance. Integer or None.
      dtype: Data type.
      max_ratio: Maximum ratio between the shift distance and choose_range.
        Float number.

      warp_matrix: An array of fixed size warp matrices with shape
      (batch_size, matrix_size, matrix_size).
    p = self.params
    # Non-empty random seed values are only used for testing or when using
    # stateless random ops. seed_3, seed_4, and seed_5 are set separately to
    # avoid correlation of warp magnitude and origin position.
    if p.use_input_dependent_random_seed:
      seed_3 = global_seed + 3
      seed_4 = global_seed + 4
      seed_5 = global_seed + 5
    elif p.random_seed:
      seed_3 = p.random_seed - 1
      seed_4 = p.random_seed - 1
      seed_5 = 2 * p.random_seed + 1
      seed_3 = p.random_seed
      seed_4 = p.random_seed
      seed_5 = p.random_seed

    choose_range_dtype = tf.cast(choose_range, dtype=dtype)
    length_upper_bound = tf.cast(max_ratio * choose_range_dtype, dtype=tf.int32)
    # Set shift length.

    random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)

    if max_warp_frames and max_warp_frames > 0:
      shift = random_uniform(
          minval=-1 * max_warp_frames,
          maxval=max_warp_frames + 1,
      random_ratio = random_uniform(
      shift = tf.cast(random_ratio * tf.cast(length_upper_bound, dtype=dtype),
    # Make sure the sampled length was smaller than max_ratio * length_bound.
    # Note that sampling in this way is biased.
    # (Shorter sequences may be over-masked.)
    final_shift = tf.maximum(-length_upper_bound,
                             tf.minimum(shift, length_upper_bound))
    # Choose origin anchor point.
    mid_range = tf.cast(choose_range, dtype=tf.int32)
    mid_range = tf.maximum(choose_range - 2, 0)
    random_origin = random_uniform(shape=(batch_size,), maxval=1.0, seed=seed_5)
    origin_with_in_valid_range = random_origin * tf.cast(mid_range, dtype=dtype)
    origin = tf.cast(origin_with_in_valid_range, tf.int32) + 1
    # Set destination point of the origin anchor point under the warp map.
    destination = origin + final_shift
    # Cast origin and destination.
    origin = tf.cast(origin, dtype=dtype)
    destination = tf.cast(destination, dtype=dtype)

    return self._ConstructWarpMatrix(
Example #2
    def _XYZFromRangeImage(self,
        """Extract the cartesian coordinates from the range image.

       lidar_image: [H, W, C] range image Tensor.
       lidar_image_mask: [H, W] boolean indicating which 2d coordinates in the
         lidar image are present.
       extrinsics: [4, 4] float matrix representing transformation matrix to
         world coordinates.
       inclinations: [V] beam inclinations vector.
       pixel_pose: [64, 2650, 4, 4] tensor representing per pixel pose of GBR.
       frame_pose: [4, 4] matrix representing vehicle to world transformation.

      [H, W, 3] range image cartesian coordinates.
        height, width, channels = py_utils.GetShape(lidar_image, 3)

        conversion_dtype = tf.float32
        lidar_image = tf.cast(lidar_image, conversion_dtype)
        extrinsics = tf.cast(extrinsics, conversion_dtype)
        inclinations = tf.cast(inclinations, conversion_dtype)
        inclinations = tf.reverse(inclinations, axis=[-1])

        az_correction = py_utils.HasShape(
            tf.atan2(extrinsics[1, 0], extrinsics[0, 0]), [])
        ratios = (tf.cast(tf.range(width, 0, -1), dtype=conversion_dtype) -
                  .5) / tf.cast(width, conversion_dtype)
        ratios = py_utils.HasShape(ratios, [width])

        azimuth = (ratios * 2. - 1.) * np.pi - az_correction[..., tf.newaxis]
        azimuth = py_utils.HasShape(azimuth, [width])

        lidar_image_mask = lidar_image_mask[..., tf.newaxis]
        lidar_image_mask = tf.tile(lidar_image_mask, [1, 1, channels])
        lidar_image = tf.where(lidar_image_mask, lidar_image,
        lidar_image_range = lidar_image[..., 0]

        azimuth = py_utils.HasShape(azimuth[tf.newaxis, ...], [1, width])
        inclinations = py_utils.HasShape(inclinations[..., tf.newaxis],
                                         [height, 1])

        cos_azimuth = tf.cos(azimuth)
        sin_azimuth = tf.sin(azimuth)
        cos_incl = tf.cos(inclinations)
        sin_incl = tf.sin(inclinations)

        x = cos_azimuth * cos_incl * lidar_image_range
        y = sin_azimuth * cos_incl * lidar_image_range
        z = sin_incl * lidar_image_range

        lidar_image_points = tf.stack([x, y, z], -1)
        lidar_image_points = py_utils.HasShape(lidar_image_points,
                                               [height, width, 3])
        rotation = extrinsics[0:3, 0:3]
        translation = extrinsics[0:3, 3][tf.newaxis, ...]

        # Transform the image points in cartesian coordinates to
        # the world coordinate system using the extrinsics matrix.
        # We first flatten the points, apply rotation, then
        # reshape to restore the original input and then apply
        # translation.
        lidar_image_points = tf.matmul(tf.reshape(lidar_image_points, [-1, 3]),
        lidar_image_points = tf.reshape(lidar_image_points, [height, width, 3])
        lidar_image_points += translation

        lidar_image_points = py_utils.HasShape(lidar_image_points,
                                               [height, width, 3])
        # GBR uses per pixel pose.
        if pixel_pose is not None:
            pixel_pose_rotation = pixel_pose[..., 0:3, 0:3]
            pixel_pose_translation = pixel_pose[..., 0:3, 3]
            lidar_image_points = tf.einsum(
                'hwij,hwj->hwi', pixel_pose_rotation,
                lidar_image_points) + pixel_pose_translation
            if frame_pose is None:
                raise ValueError(
                    'frame_pose must be set when pixel_pose is set.')
            # To vehicle frame corresponding to the given frame_pose
            # [4, 4]
            world_to_vehicle = tf.linalg.inv(frame_pose)
            world_to_vehicle_rotation = world_to_vehicle[0:3, 0:3]
            world_to_vehicle_translation = world_to_vehicle[0:3, 3]
            # [H, W, 3]
            lidar_image_points = tf.einsum(
                'ij,hwj->hwi', world_to_vehicle_rotation, lidar_image_points
            ) + world_to_vehicle_translation[tf.newaxis, tf.newaxis, :]

        return lidar_image_points
Example #3
def _SingleClassDecodeWithNMS(predicted_bboxes,
    """Perform NMS on predicted bounding boxes / associated logits.

    predicted_bboxes: [batch_size, num_boxes, 7] float Tensor containing
      predicted bounding box coordinates.
    classification_scores: [batch_size, num_boxes, num_classes] float Tensor
      containing predicted classification scores for each box.
    nms_iou_threshold: IoU threshold to use when determining whether two boxes
      overlap for purposes of suppression.
    score_threshold: The score threshold passed to NMS that allows NMS to
      quickly ignore irrelevant boxes.
    max_boxes_per_class: The maximum number of boxes per example to emit. If
      None, this value is set to num_boxes from the shape of predicted_bboxes.

    nms_indices: Indices of the boxes selected after NMS. Tensor of shape
      [batch_size, num_classes, max_boxes_per_class].
    predicted_bboxes: Filtered bboxes after NMS of shape
      [batch_size, num_classes, max_boxes_per_class, 7].
    bbox_scores: A float32 Tensor with the score for each box of shape
      [batch_size, num_classes, max_boxes_per_class].
    valid_mask: A float32 Tensor with 1/0 values indicating the validity of
      each box. 1 indicates valid, and 0 invalid. Tensor of shape
      [batch_size, num_classes, max_boxes_per_class].
    utils_3d = detection_3d_lib.Utils3D()
    predicted_bboxes = py_utils.HasShape(predicted_bboxes, [-1, -1, 7])
    batch_size, num_predicted_boxes, _ = py_utils.GetShape(predicted_bboxes)
    classification_scores = py_utils.HasShape(
        classification_scores, [batch_size, num_predicted_boxes, -1])
    _, _, num_classes = py_utils.GetShape(classification_scores)

    if not isinstance(nms_iou_threshold, float):
        raise ValueError('Single class NMS only supports a scalar '
    if not isinstance(score_threshold, float):
        raise ValueError('Single class NMS only supports a scalar '

    if max_boxes_per_class is None:
        max_boxes_per_class = num_predicted_boxes

    # TODO(jngiam): Change to be per-class bboxes, and hence, per-class NMS, and
    # per-class thresholding.
    # [batch, num_predicted_boxes]
    nms_scores = tf.reduce_max(classification_scores, axis=-1)

    # Compute the most likely label by computing the highest class score from
    # the output of the sigmoid.
    likely_labels = tf.argmax(classification_scores, axis=-1)

    # When background is the most likely class for the box, mask out the scores
    # of that box from NMS scoring so the background boxes don't dominate the
    # NMS.
    nms_scores *= tf.cast(likely_labels > 0, tf.float32)

    # Compute NMS for every sample in the batch.
    nms_indices, valid_mask = utils_3d.BatchedNMSIndices(

    # Reorder the box data and logits according to NMS scoring.
    predicted_bboxes = tf.array_ops.batch_gather(predicted_bboxes, nms_indices)
    classification_scores = tf.array_ops.batch_gather(classification_scores,

    # Now reformat the output of NMS to match the format of the
    # MultiClassOrientedDecodeWithNMS, which outputs a per class NMS result.
    # This takes the leading shape of
    # [batch_size, num_classes, max_boxes_per_class] for all outputs, which
    # means since this NMS is not class specific we need to tile the outputs
    # num_classes times or reorder the data such that it is [batch, num_classes].
    predicted_bboxes = tf.tile(predicted_bboxes[:, tf.newaxis, :, :],
                               [1, num_classes, 1, 1])
    classification_scores = tf.transpose(classification_scores, (0, 2, 1))
    classification_scores = py_utils.HasShape(
        classification_scores, [batch_size, num_classes, max_boxes_per_class])
    valid_mask = tf.tile(valid_mask[:, tf.newaxis, :], [1, num_classes, 1])
    return nms_indices, predicted_bboxes, classification_scores, valid_mask
Example #4
    def FProp(self, theta, input_batch):
        """Embeds source ids and transforms with TransformerStack.

      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      input_batch: A `.NestedMap` object containing: ids - The inputs tensor of
        shape [batch, time]. paddings - The ids' paddings of shape [batch,

      A '.NestedMap' object containing:
        encoded - The encoded features of shape [time, batch, dim] or [batch,
          time, dim], depending p.output_data_format.
        padding - The encoded features' padding of shape [time, batch] or
          [batch, time].
        segment_id - The segmentation of packed inputs of shape [time, batch] or
          [batch, time] if it is supported by the model, or None otherwise.
        embedded_inputs - The embedded inputs tokens without positional
          encodings of shape [time, batch, dim] or [batch, time, dim].

        p = self.params
        with tf.name_scope(p.name):
            # [batch, time]
            input_ids = input_batch.ids
            # [batch, time]
            paddings = input_batch.paddings

            # [batch, time]
            segment_ids = input_batch.segment_ids if p.packed_input else None

            batch = py_utils.GetShape(input_ids)[0]
            time = py_utils.GetShape(input_ids)[1]

            # Embedding layer.
            # [batch, time, dim]
            if not p.shared_emb:
                input_embs = self.token_emb.EmbLookup(theta.token_emb,
                input_embs = self.softmax.EmbLookup(theta.softmax, input_ids)
            orig_input_embs = input_embs

            # [1, time, dim]
            if p.packed_input:
                positions = input_batch.segment_pos
                position_embs = tf.expand_dims(
                        theta.position_emb, positions), 0)
                position_embs = tf.expand_dims(
                    self.position_emb.FProp(theta.position_emb, time), 0)

            # [batch, time, dim]
            input_embs += position_embs

            if p.input_dropout_tpl.fprop_dtype:
                input_embs = tf.cast(input_embs,
                paddings = tf.cast(paddings, p.input_dropout_tpl.fprop_dtype)

            input_embs = self.input_dropout.FProp(theta.input_dropout,
            # [batch, time, dim]
            transformer_input = input_embs
            # Explicitly set the input shape of Transformer layers, to avoid
            # unknown shape error occurred to tf.einsum on nonTPU devices.
            transformer_input = tf.reshape(transformer_input,
                                           [batch, time, p.model_dim])

            # Compute self-attention segment mask once.
            if p.packed_input:
                segment_mask = batch_major_attention.SegmentMask(
                    segment_ids, segment_ids, dtype=transformer_input.dtype)
                segment_mask = tf.zeros([batch, 1, time, time])

            encoded, padding = self.transformer_stack.FProp(
                theta.transformer_stack, transformer_input, paddings,

            if p.final_layer_norm:
                encoded = self.final_ln.FProp(theta.final_ln, encoded)

            seq_lengths = tf.cast(tf.reduce_sum(1. - padding, axis=1),

            if p.output_data_format == 'TBC':
                encoded = tf.transpose(encoded,
                                       [1, 0, 2])  # [time, batch, dim]
                padding = tf.transpose(padding)  # [time, batch]
                segment_ids = tf.transpose(
                    segment_ids) if p.packed_input else None
                orig_input_embs = tf.transpose(orig_input_embs, [1, 0, 2])

            return py_utils.NestedMap(
                seq_lengths=seq_lengths,  # used by beam_search_helper.
Example #5
    def BeamSearchDecode(self,
        """Performs beam-search based decoding.

      theta: A NestedMap object containing weights' values of the decoder layer
        and its children layers.
      encoder_outputs: A NestedMap containing encoder outputs to be passed to
        the callbacks. Mostly opaque to BeamSearchHelper, except that it should
        contain either a 'seq_lengths' field of shape [source_batch_size] or
        a 'paddings' field of shape [source_max_lengths, source_batch_size].
      num_hyps_per_beam_override: If set to a value <= 0, this parameter is
        ignored. If set to a value > 0, then this value will be used to override
      init_beam_search_state: The `InitBeamSearchState` callback. Please refer
        to the class header comments for more details.
      pre_beam_search_step_callback: The `PreBeamSearchStepCallback` callback.
        Please refer to the class header comments for more details.
      post_beam_search_step_callback: The `PostBeamSearchStepCallback` callback.
        Please refer to the class header comments for more details.
      max_steps: maximum beam search steps. If None, use

      A `BeamSearchDecodeOutput`.
        p = self.params
        num_hyps_per_beam = p.num_hyps_per_beam
        if num_hyps_per_beam_override > 0:
            num_hyps_per_beam = num_hyps_per_beam_override
        if max_steps is None:
            max_steps = p.target_seq_len

        initial_results, other_states = init_beam_search_state(
            theta, encoder_outputs, num_hyps_per_beam)

        num_hyps = tf.shape(initial_results.log_probs)[0]
        num_beams = num_hyps // num_hyps_per_beam

        if 'step_ids' in initial_results:
            # [num_hyps, 1]
            step_ids = tf.ensure_shape(initial_results.step_ids, [None, 1])
            step_ids = tf.fill([num_hyps, 1],
                               tf.constant(p.target_sos_id, dtype=tf.int32))

        min_score = -1e36
        best_scores = (tf.zeros(shape=[num_beams], dtype=p.dtype) + min_score)
        cumulative_scores = tf.zeros(shape=[num_hyps], dtype=p.dtype)
        in_scores = tf.zeros([max_steps, num_hyps], dtype=p.dtype)
        in_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.int32)
        in_prev_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.int32)
        in_done_hyps = tf.zeros([max_steps, num_hyps], dtype=tf.string)
        bs_atten_probs = tf.zeros(
            [max_steps, num_hyps,
        cur_step = tf.constant(0, dtype=tf.int32)
        all_done = tf.constant(False, dtype=tf.bool)
        core_bs_states = (best_scores, cumulative_scores, in_scores, in_hyps,
                          in_prev_hyps, in_done_hyps, bs_atten_probs)

        def LoopContinue(cur_step, all_done, unused_step_ids,
                         unused_core_bs_states, unused_other_states_list):
            return tf.math.logical_and(cur_step < max_steps,

        def LoopBody(cur_step, unused_all_done, step_ids, core_bs_states,
            (cur_step, all_done, new_step_ids, new_bs_states,
             new_other_states) = self._BeamSearchStep(
                 theta, encoder_outputs, cur_step, step_ids, core_bs_states,
                 other_states.Pack(other_states_list), num_hyps_per_beam,
                 pre_beam_search_step_callback, post_beam_search_step_callback)
            return (cur_step, all_done, new_step_ids, new_bs_states,

        flat_other_states = other_states.Flatten()
        _, _, _, final_bs_states, flat_final_other_states = tf.while_loop(
            loop_vars=(cur_step, all_done, step_ids, core_bs_states,
                              _GetShapes(flat_other_states, none_shapes=True)))
        # [target_seq_len, num_beams * num_hyps_per_beam].
        final_done_hyps = final_bs_states[5]
        final_other_states = other_states.Pack(flat_final_other_states)

        # Assume that `paddings` has shape [source_max_lengths, source_batch_size]
        # by default, and compute `encoded_seq_lengths` accordingly. This can be
        # overridden by directly passing `seq_lengths` in the `encoder_outputs`
        # NestedMap.
        encoded_seq_lengths = getattr(encoder_outputs, 'seq_lengths', None)
        if encoded_seq_lengths is None:
            source_paddings = encoder_outputs.padding
            if isinstance(source_paddings, py_utils.NestedMap):
                encoded_seq_lengths = tf.cast(
                            1.0 - tf.transpose(source_paddings.Flatten()[0]),
                            1)), tf.int32)
                encoded_seq_lengths = tf.cast(
                            1.0 -
                            tf.cast(tf.transpose(source_paddings), tf.float32),
                            1)), tf.int32)

        # [num_beams, num_hyps_per_beam].
        topk_hyps = ops.top_k_terminated_hyps(
        # [num_beams * num_hyps_per_beam, ...].
        max_seq_length = 0 if isinstance(max_steps, tf.Tensor) else max_steps
        topk_ids, topk_lens, topk_scores = ops.unpack_hyp(
            tf.reshape(topk_hyps, [-1]), max_seq_length=max_seq_length)
        # [num_beams, num_hyps_per_beam].
        topk_scores = tf.reshape(topk_scores, tf.shape(topk_hyps))

        return BeamSearchDecodeOutput(final_done_hyps, topk_hyps, topk_ids,
                                      topk_lens, topk_scores, None,
Example #6
  def _resource_apply_dense(self, grad, var):
    """Builds the Adafactor update ops for a single dense variable.

    Maintains running averages of the squared gradient (row/column factored
    when `self._factored_dims` returns dimensions for the variable's shape,
    a full-size accumulator otherwise), scales the gradient by their inverse
    square root, optionally clips the update, optionally applies momentum,
    and finally subtracts the result from `var`.

    Args:
      grad: Gradient for `var`, or None. It is cast to `var.dtype`.
      var: The variable to update.

    Returns:
      An empty list when `grad` is None. Otherwise a `tf.group` of every
      slot update and the `assign_sub` on `var`.
    """
    if grad is None:
      tf.logging.warning('Gradient is None for variable %s' % var.name)
      return []

    grad_dtype = var.dtype  # TODO(lepikhin): add to params
    grad = tf.cast(grad, grad_dtype)
    factored_dims = self._factored_dims(var.shape.as_list())
    # Only fetch the slots that the chosen (factored / unfactored) path uses.
    if factored_dims:
      vr = self.get_slot(var, 'vr')
      vc = self.get_slot(var, 'vc')
    else:
      v = self.get_slot(var, 'v')
    if self._beta1:
      m = self.get_slot(var, 'm')

    # Running "all values seen so far are finite" flag; it only changes when
    # self._cond_is_finite is set.
    cond = tf.constant(True)

    def _Upd(c, x):
      """Returns `c` AND-ed with "every element of `x` is finite"."""
      if not self._cond_is_finite:
        return c
      c = tf.math.logical_and(c, tf.reduce_all(tf.math.is_finite(x)))
      c = tf.math.logical_and(
          c, tf.reduce_all(tf.math.logical_not(tf.math.is_inf(x))))
      return c

    def _Wrap(fn, x, y):
      """Applies `fn(x, y)`, gated on the current `cond` when enabled."""
      if not self._cond_is_finite:
        return fn(x, y)
      # `cond` is read when _Wrap is called, so each update is gated on the
      # finiteness checks accumulated up to that point. When the check fails
      # the target `x` is returned untouched.
      return tf.cond(cond, lambda: fn(x, y), lambda: x)

    with tf.variable_scope(var.name[:-2] + '/Adafactor'):
      grad_squared = tf.math.square(grad) + tf.cast(self._epsilon1, grad_dtype)
      cond = _Upd(cond, grad_squared)
      decay_rate = tf.cast(self._decay_rate, var.dtype)
      old_val = tf.identity(var)  # TODO(lepikhin): introduce gradient dtype
      if self._multiply_by_parameter_scale:
        update_scale = self._parameter_scale(old_val) * tf.cast(
            self._learning_rate, grad_dtype)
      else:
        update_scale = self._learning_rate
      mixing_rate = tf.cast(1.0 - decay_rate, grad_dtype)
      update_scale = tf.cast(update_scale, grad_dtype)
      updates = []
      if factored_dims:
        vr_axis, vc_axis = factored_dims
        grad_squared_row_mean = tf.reduce_mean(grad_squared, axis=vr_axis)
        grad_squared_col_mean = tf.reduce_mean(grad_squared, axis=vc_axis)
        # new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
        new_vr = vr * decay_rate + grad_squared_row_mean * mixing_rate
        # new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
        new_vc = vc * decay_rate + grad_squared_col_mean * mixing_rate
        cond = _Upd(cond, new_vr)
        cond = _Upd(cond, new_vc)
        vr_update = _Wrap(tf.assign, vr, new_vr)
        vc_update = _Wrap(tf.assign, vc, new_vc)
        updates.extend([vr_update, vc_update])
        long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
        r_factor = tf.math.rsqrt(new_vr / long_term_mean)
        c_factor = tf.math.rsqrt(new_vc)
        x = grad * tf.expand_dims(r_factor, vr_axis) * tf.expand_dims(
            c_factor, vc_axis)
      else:
        new_v = v * decay_rate + grad_squared * mixing_rate
        cond = _Upd(cond, new_v)
        v_update = _Wrap(tf.assign, v, new_v)
        updates.append(v_update)
        x = grad * tf.math.rsqrt(new_v)
      if self._clipping_threshold is not None:
        # Shrink the update only when ReduceRms(x) exceeds the threshold
        # (presumably the root-mean-square of x -- confirm in py_utils).
        clipping_denom = tf.maximum(
            tf.constant(1.0, grad_dtype),
            py_utils.ReduceRms(x) /
            tf.constant(self._clipping_threshold, grad_dtype))
        x /= clipping_denom
      subtrahend = x * update_scale
      if self._beta1:
        new_m = (
            m * tf.constant(self._beta1, dtype=grad_dtype) +
            subtrahend * tf.constant(1.0 - self._beta1, dtype=grad_dtype))
        subtrahend = new_m
        cond = _Upd(cond, new_m)
        updates.append(_Wrap(tf.assign, m, new_m))
      # It is critical to use assign_sub instead of tf.assign(var - subtrahend)
      #  for the case of bfloat16 activations, so as to avoid repeatedly
      #  rounding the slice value, which results in poor quality.
      cond = _Upd(cond, subtrahend)
      var_update = _Wrap(tf.assign_sub, var, subtrahend)
      # The variable update must be part of the returned group, otherwise
      # running the returned op never changes `var`.
      updates.append(var_update)
      return tf.group(*updates)
Example #7
 def ApplyClipping(self, theta, x):
     """Clips `x` to [-cap, cap] when a clipping-cap schedule is configured.

     Args:
       theta: Object whose `cc_schedule` attribute is handed to
         `self.cc_schedule.GetState` to obtain the cap.
       x: Value to clip; the cap is cast to `x.dtype`.

     Returns:
       `x` itself when `self.params.cc_schedule` is falsy, otherwise
       `tf.clip_by_value(x, -cap, cap)`.
     """
     if self.params.cc_schedule:
         limit = tf.cast(
             self.cc_schedule.GetState(theta.cc_schedule), x.dtype)
         return tf.clip_by_value(x, -limit, limit)
     # No schedule configured: clipping is a no-op.
     return x
Example #8
 def FProp(self, theta, current_step):
     p = self.params
     num_decays = tf.floor(
         tf.div(tf.cast(current_step, tf.float32),
     return tf.pow(p.decay, num_decays)
Example #9
 def FProp(self, theta, current_step):
     """Returns `multiplier * rsqrt(max(current_step, warmup_steps))`.

     Args:
       theta: Not read by this method.
       current_step: Step counter; cast to float32 before use.

     Returns:
       The value of the schedule at `current_step`.
     """
     params = self.params
     step = tf.cast(current_step, tf.float32)
     # Clamping from below keeps the value constant until the step count
     # reaches warmup_steps.
     clamped_step = tf.maximum(step, params.warmup_steps)
     return tf.math.rsqrt(clamped_step) * params.multiplier
 def _CastFloats(v):
     """Casts floating-point `v` to `py_utils.FPropDtype(p)`.

     `None` and values whose dtype is not floating are returned unchanged.
     `p` is taken from the enclosing scope.
     """
     if v is None:
         return None
     if not v.dtype.is_floating:
         return v
     return tf.cast(v, py_utils.FPropDtype(p))
Example #11
 def FProp(self, theta, current_step):
     """Evaluates `self._exp` at `current_step` cast to `self.params.dtype`.

     Args:
       theta: Not read by this method.
       current_step: Step counter to evaluate the schedule at.

     Returns:
       Whatever `self._exp` returns for the cast step.
     """
     step = tf.cast(current_step, dtype=self.params.dtype)
     return self._exp(step)
Example #12
    def _Gradient(inputs, _, original_grad):

      # Compute the gradients for each loss w.r.t. the inputs.
      # TODO(jngiam): Look into whether TF dedups this computation.
      per_loss_grads = []
      for loss, _ in self._losses:
        per_loss_grad = tf.gradients(loss, self._output_tensor)[0]
        if per_loss_grad is None:
              'Loss %s did not result in a gradient during '
              'GradDrop computation.', loss)

      if not per_loss_grads:
        raise ValueError('No valid gradients for GradDrop.')

      # Multiply the gradients with the inputs.
      grads = per_loss_grads
      if p.use_input_sign_only:
        input_abs = tf.abs(
            tf.cast(tf.abs(inputs) <= p.epsilon, tf.float32) + inputs)
        grads = [grad * ((inputs) / (input_abs)) for grad in grads]
        grads = [grad * inputs for grad in grads]

      # Sum gradient over batch, assuming that batch is always on dim 0.
      if p.marginalize_batch_dim:
        grads = [tf.reduce_sum(grad, axis=0, keepdims=True) for grad in grads]

      # First discretize all gradients into their sign values.
      grad_sign_positive = [tf.cast(grad > 0.0, tf.float32) for grad in grads]
      grad_sign_negative = [tf.cast(grad < 0.0, tf.float32) for grad in grads]

      # Calculate the probability of positive gradients based on equation (1)
      # in the GradDrop paper.
      grad_abs_sum = tf.add_n([tf.abs(grad) for grad in grads])
      prob_pos = (tf.add_n(grads) / (2. * grad_abs_sum + p.epsilon))
      # Implementation of different scales for the keep function. Larger
      # scales result in steeper keep functions.
      prob_pos *= p.keep_prob_function_scale

      if p.keep_prob_function == 'sigmoid':
        # Standard sigmoid has derivative of 0.25 at 0 so the factor of 4.0
        # allows the function scale in sigmoid to be compatible with the
        # function scale in the linear case.
        prob_pos = tf.sigmoid(4.0 * prob_pos)
      elif p.keep_prob_function == 'linear':
        prob_pos += 0.5

      # The main, default mode of GradDrop. Only gradients of one sign are kept,
      # and which sign is calculated via equation (1) of the main paper.
      prob_pos = tf.cast(prob_pos >= tf.random.uniform(prob_pos.shape),
                         tf.float32) - 0.5
      grad_masks = [(gsp - gsn) * prob_pos >= 0
                    for (gsn,
                         gsp) in zip(grad_sign_negative, grad_sign_positive)]

      # This diag value gives us the percentage of grads which are kept.
      gradmask_diag = [tf.cast(gm, tf.float32) for gm in grad_masks]
      diag = tf.reduce_mean(tf.add_n(gradmask_diag) / len(grad_masks))
      summary_utils.scalar('average_grad_mask', diag)
      leak_ratios = [leak_ratio for _, leak_ratio in self._losses]
      transformed_per_loss_grads = [
          grad * (leak + (1.0 - leak) * tf.cast(grad_mask, tf.float32))
          for (leak, grad,
               grad_mask) in zip(leak_ratios, per_loss_grads, grad_masks)

      transformed_grad = tf.cast(
          tf.add_n(transformed_per_loss_grads), original_grad.dtype)

      if not p.keep_gradnorm_constant:
        return transformed_grad

      transformed_grad_norm = tf.sqrt(tf.reduce_sum(transformed_grad**2))
      original_grad_norm = tf.sqrt(tf.reduce_sum(original_grad**2))
      return transformed_grad * transformed_grad_norm / (
          original_grad_norm + p.epsilon)
Example #13
  def _TimeMask(self,
    """Applies time masking with given degree to inputs.

      inputs: Batch of input features of shape (batch_size, time_length,
        num_freq, channels).
      seq_lengths: The actual sequence lengths which mask been sampled of shape
      global_seed: an integer seed tensor for stateless random ops.
      noisify: Whether to noisify the masked out regions.
      gaussian_noise: Whether to use gaussian noise when noisifying.
      dtype: Data type.
      domain_id_index: domain id index.

      Inputs with random time masking applied.
    p = self.params

    # Get time masking parameters.
    time_mask_max_frames = p.time_mask_max_frames[domain_id_index]
    time_masks_per_frame = p.time_masks_per_frame[domain_id_index]
    use_dynamic_time_mask_max_frames = \
    multiplicity = p.time_mask_count[domain_id_index]
    max_ratio = p.time_mask_max_ratio[domain_id_index]

    # If maximum mask length is zero, do nothing.
    if ((time_mask_max_frames == 0 and not use_dynamic_time_mask_max_frames) or
        max_ratio <= 0.0):
      return inputs
    if multiplicity == 0:
      return inputs
    seq_lengths = tf.cast(seq_lengths, tf.int32)
    batch_size, time_length, _, _ = py_utils.GetShape(inputs)

    # When using dynamic time mask size, discard upper-bound on
    # maximum allowed frames for time mask.
    if use_dynamic_time_mask_max_frames:
      time_mask_max_frames = None
    # Create masks in time direction and apply.
    block_arrays = self._GetMask(

    # Non-empty random seed values are only used for testing or when using
    # stateless random ops. seed_6 and seed_7 are set separately to avoid
    # correlation of warp magnitude and origin position.
    if p.use_input_dependent_random_seed:
      seed_6 = global_seed + 6
      seed_7 = global_seed + 7
      seed_6 = p.random_seed
      seed_7 = p.random_seed

    outputs = self.EinsumBxycBxBxyc(
        inputs, block_arrays, name='einsum_formasking')
    if noisify:
      # Sample noise with standard deviation with factor * 0.1 + 0.0001
      # TODO(ngyuzh): Make sure this won't affect EOS.
      if gaussian_noise:
        stddev = 1.0
        random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)
        factor = random_uniform(
            shape=(), minval=1.0, maxval=2.0, dtype=dtype, seed=seed_6)
        stddev = factor * 0.1 + 0.0001
      random_normal = _random_normal_op(p.use_input_dependent_random_seed)
      noise = random_normal(
      if p.fprop_dtype is not None and p.fprop_dtype != p.dtype:
        noise = tf.cast(noise, p.fprop_dtype)
      outputs_mask = self.EinsumBxyBxBxy(
          noise, 1.0 - block_arrays, name='einsum_fornoisymasking')
      outputs = outputs + tf.expand_dims(outputs_mask, -1)

    return outputs
Example #14
  def _ConstructWarpMatrix(self, batch_size, matrix_size, origin, destination,
                           choose_range, dtype):
    """Returns warp matrices according to origin, destination and choose_range.

    This function constructs a batch of warp matrices which maps the batch
    of origin points to the batch of destination points with fixed boundary
    coordinates at 0 and choose_range.

    The warping function, defined by the origin anchor point `origin`,
    the destination of the origin anchor point `destination` and the
    length of the domain in the warping axis `choose_range` is a piecewise
    linear map that fixes the points 0 and `choose_range` and maps
    `origin` to `destination`.

    For the warping matrix to be non-singular, destination must lie in the
    range 1<= destination <= choose_range - 1, so a destination
    out of this range is adjusted to be in this range before the warping
    matrix is constructed.

    The warping map can be explicitly written by first defining the slopes:
      1) slope_0 = origin / destination.
      2) slope_1 = (choose_range - origin) / (choose_range - destination).
      3) slope_2 = 1.0.

    Then the origin point orig_i of the mapped coordinate i is given by:
      1) i < destination: orig_i = slope_0 * i.
      2) destination <= i < choose_range:
         orig_i = slope_1 * i - (slope_1 - slope_0) * destination.
      3) i >= choose_range: orig_i = i.

    Denoting n_i = ceil(orig_i), the warp matrix element warp[i][j] is given by:
      1) j = n_i: 1 - n_i + orig_i.
      2) j = n_i - 1: n_i - orig_i.
      3) Otherwise: 0.

    Applying the warp matrix to an array of pixels, i.e.,
    warped_pixel[i] = sum_j warp[i][j] * pixel[j], one would get
    warped_pixel[i] = (n_i-orig_i) pixel[n_i-1] + (1-n_i+orig_i) pixel[n_i].

    Args:
      batch_size: Batch size. Integer number.
      matrix_size: Dimension of the vector space the warp matrix is applied to.
        Integer number.
      origin: Origin anchor point for warping. Tensor of shape (batch_size,) and
        data type dtype.
      destination: Destination of the origin anchor point upon warping. Tensor
        of shape (batch_size,) and data type dtype.
      choose_range: Range within which the warp reference points must lie.
        Tensor of shape (batch_size,) data type dtype.
      dtype: Data type of origin, destination, choose_range and the output warp
        matrix. The result is cast to p.fprop_dtype when that is set and
        differs from dtype.

    Returns:
      warp_matrix: An array of fixed size warp matrices with shape
      (batch_size, matrix_size, matrix_size).
    """
    p = self.params

    # Entries of destination must be in the range
    # 1 <= destination <= choose_range - 1
    # for warp matrix to have non-singular values.
    destination = tf.minimum(tf.maximum(destination, 1.0), choose_range - 1.0)

    # Construct piece-wise linear function fixing boundary points
    # specified by zero, choose_range and matrix size and maps
    # the origin anchor point to the destination.
    # Broadcast to (matrix_size, batch_size) and transpose so that each row
    # holds one batch element's value repeated matrix_size times.
    destination_bc = tf.broadcast_to(destination, (matrix_size, batch_size))
    destination_bc = tf.transpose(destination_bc)
    choose_range_bc = tf.broadcast_to(choose_range, (matrix_size, batch_size))
    choose_range_bc = tf.transpose(choose_range_bc)

    # Slopes of piece-wise linear function.
    slope_0 = origin / destination
    slope_1 = (choose_range - origin) / (choose_range - destination)
    slope_2 = 1.0

    # x is a batch of origin matrices.
    # The origin matrix is the matrix such that
    # origin[i][j] = Origin coordinate of coordinate i for the warp map.
    # Denoting the destination of the origin anchor point in the
    # warp map as "dest," the origin coordinate of point i is given by:
    # 1) i < dest: slope_0 * i.
    # 2) dest <= i < choose_range: slope_1 * i - (slope_1 - slope_0) * dest.
    # 3) i >= choose_range: i.
    # The relu terms switch on the slope change at each breakpoint, which
    # yields the three pieces above in a single expression.
    x = tf.broadcast_to(
        tf.cast(tf.range(matrix_size), dtype=dtype), (batch_size, matrix_size))
    x = (
        self.EinsumBBmBm(slope_0, x) +
        self.EinsumBBmBm(slope_1 - slope_0, tf.nn.relu(x - destination_bc)) +
        self.EinsumBBmBm(slope_2 - slope_1, tf.nn.relu(x - choose_range_bc)))
    x = tf.broadcast_to(x, (matrix_size, batch_size, matrix_size))
    x = tf.transpose(x, perm=[1, 2, 0])

    # y is a batch of coordinate matrices.
    # A coordinate matrix is a matrix such that
    # coordinate[i][j] = j.
    y = tf.broadcast_to(
        tf.cast(tf.range(matrix_size), dtype=dtype),
        (batch_size, matrix_size, matrix_size))
    # Warp matrix is obtained by applying hat function element-wise to (x-y).
    # Denoting the origin point of i under the warp map as orig_i,
    # and n_i = ceil(orig_i), the warp matrix element warp[i][j] is given by:
    # 1) j = n_i: 1 - n_i + orig_i.
    # 2) j = n_i - 1: n_i - orig_i.
    # 3) Otherwise: 0.
    # Applying the warp matrix to pixels, i.e.,
    # warped_pixel[i] = sum_j warp[i][j] * original_pixel[j], one would get
    # warped_pixel[i] = (n_i - orig_i) * original_pixel[n_i-1]
    #                   + (1 - n_i + orig_i) * original_pixel[n_i].
    warp_matrix = x - y
    warp_matrix = _hat(warp_matrix)
    if p.fprop_dtype is not None and p.fprop_dtype != dtype:
      warp_matrix = tf.cast(warp_matrix, p.fprop_dtype)

    return warp_matrix
Example #15
def _StepNum():
  """Returns the global step cast to tf.float32.

  The step tensor is obtained from `tf.train.get_or_create_global_step()`.
  """
  global_step = tf.train.get_or_create_global_step()
  step_as_float = tf.cast(global_step, tf.float32)
  return step_as_float
Example #16
def AddAttentionSummaryBatchMajor(name,
    # NOTE(review): the rest of the parameter list is missing from this view.
    # The docstring and body refer to attention_tensors, src_paddings,
    # tgt_paddings, transcripts and max_outputs -- confirm names, order and
    # defaults against the upstream source.
    """Adds an image summary showing the attention probability matrix and state.

  As opposed to AddAttentionSummary() takes all tensors with batch dimension in
  axis 0.

  Args:
    name: Summary name.
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for the
      source sequence. Or a list of tensors of the same length as
      attention_tensors with a separate paddings for each entry in
      attention_tensors.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for the
      target sequence. Or a list of tensors of the same length as
      attention_tensors with a separate paddings for each entry in
      attention_tensors.
    transcripts: Optional, transcripts shaped [batch, source_len] for the source
      sequence.
    max_outputs: Integer maximum number of elements of the batch to plot.
  """
    # Rejects a paddings list whose length is neither 1 (shared by every
    # attention tensor) nor len(attention_tensors). A non-list counts as 1.
    # NOTE(review): no call to VerifyLen is visible in this view.
    def VerifyLen(paddings):
        length = len(paddings) if isinstance(paddings, list) else 1
        if length != 1 and length != len(attention_tensors):
            raise ValueError('Bad length of paddings list {}'.format(length))


    # Verify shapes.
    for i, attention_tensor in enumerate(attention_tensors):
        src, tgt = src_paddings, tgt_paddings
        # Pick the paddings for this tensor: a one-element list is shared by
        # all tensors, a longer list is indexed per tensor.
        src = src[0 if len(src) == 1 else i] if isinstance(src, list) else src
        tgt = tgt[0 if len(tgt) == 1 else i] if isinstance(tgt, list) else tgt
        tgt_shape = py_utils.GetShape(tgt)
        # NOTE(review): the statement below is incomplete in this view (the
        # wrapper around the shape comparison is missing). What remains
        # compares the attention shape with tgt_shape[:2] + [source_len] +
        # tgt_shape[2:] and renames the tensor to its name with the ':...'
        # suffix stripped.
        attention_tensors[i] = tf.identity(
                    py_utils.GetShape(attention_tensor), tgt_shape[:2] +
                    [py_utils.GetShape(src)[1]] + tgt_shape[2:])
            ], attention_tensor), re.sub(':.*$', '', attention_tensor.name))

    # NOTE(review): the body of this `if` is missing from this view
    # (presumably an early return -- confirm).
    if not _ShouldAddSummary():

    # Normalizes paddings to a list and converts each entry with SequenceLength.
    def ToLengths(paddings):
        paddings = paddings if isinstance(paddings, list) else [paddings]
        return [SequenceLength(p) for p in paddings]

    # Returns the shared entry of a one-element list, otherwise the i-th entry.
    def Get(lengths, i):
        return lengths[0 if len(lengths) == 1 else i]

    src_lens = ToLengths(src_paddings)
    tgt_lens = ToLengths(tgt_paddings)

    with plot.MatplotlibFigureSummary(name + '/Attention',
                                      gridspec_kwargs={'hspace': 0.3}) as fig:
        for n, atten in enumerate(attention_tensors):
            # Diagnostic metric that decreases as attention picks up.
            # max_entropy is log(source length); two trailing axes are added so
            # that it broadcasts against the 3D attention tensor.
            max_entropy = tf.math.log(tf.cast(Get(src_lens, n), tf.float32))
            max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)
            # The 1e-10 term keeps log() finite where the probability is 0.
            atten_normalized_entropy = -atten * tf.math.log(
                atten + 1e-10) / max_entropy
            # NOTE(review): the scalar(...) call is cut off after its first
            # argument, and the block ends at the `if` below with no body;
            # the remainder of the function is not visible here.
            scalar(name + '/Attention/average_normalized_entropy/%d' % n,
            args = [atten, Get(src_lens, n), Get(tgt_lens, n)]
            if transcripts is not None and n == 0:
Example #17
  def try_apply_dense(self, grad, var):
    """Computes the tensors of one Adafactor step for `var` without applying it.

    No variable or slot is assigned here: the assign ops are kept only as
    commented-out references, and every intermediate tensor is recorded in
    the returned `stats` dict instead.

    Args:
      grad: Gradient for `var`; must not be None. It is cast to `var.dtype`.
      var: The variable the update is computed for. Its static shape decides,
        via `self._factored_dims`, whether the factored second-moment
        estimate (slots 'vr'/'vc') or the full one (slot 'v') is used.

    Returns:
      A tuple `(is_finite_checks, stats)`. `is_finite_checks` is a list that
      this method never appends to. `stats` maps names to tensors:
      'grad_squared', 'parameter_scale', then either 'new_vr', 'new_vc' and
      'mult' (factored) or 'new_v' (unfactored), then 'x', 'new_m' (only when
      `self._beta1` is set) and 'subtrahend'.
    """
    assert grad is not None

    cond = tf.constant(True)
    is_finite_checks = []
    stats = {}

    grad_dtype = var.dtype  # TODO(lepikhin): add to params
    grad = tf.cast(grad, grad_dtype)
    factored_dims = self._factored_dims(var.shape.as_list())
    # Only fetch the slots that the chosen second-moment estimate needs.
    if factored_dims:
      vr = self.get_slot(var, 'vr')
      vc = self.get_slot(var, 'vc')
    else:
      v = self.get_slot(var, 'v')
    if self._beta1:
      m = self.get_slot(var, 'm')

    def _Upd(c, k, x):
      # Records tensor `x` under key `k`; `c` is passed through unchanged.
      stats[k] = x
      return c

    # var.name[:-2] drops the last two characters of the variable name
    # (presumably the ':0' output suffix).
    with tf.variable_scope(var.name[:-2] + '/Adafactor'):
      grad_squared = tf.math.square(grad) + tf.cast(self._epsilon1, grad_dtype)
      cond = _Upd(cond, 'grad_squared', grad_squared)  # 0 (factored)
      decay_rate = tf.cast(self._decay_rate, var.dtype)
      old_val = tf.identity(var)  # TODO(lepikhin): introduce gradient dtype
      assert self._multiply_by_parameter_scale
      if self._multiply_by_parameter_scale:
        parameter_scale = self._parameter_scale(old_val)
        cond = _Upd(cond, 'parameter_scale', parameter_scale)  # 1 (factored)
        update_scale = self._parameter_scale(old_val) * tf.cast(
            self._learning_rate, grad_dtype)
      else:
        # Without parameter scaling the step size is the raw learning rate.
        update_scale = self._learning_rate
      mixing_rate = tf.cast(1.0 - decay_rate, grad_dtype)
      update_scale = tf.cast(update_scale, grad_dtype)
      if factored_dims:
        d0, d1 = factored_dims
        vr_axis, vc_axis = d0, d1
        grad_squared_row_mean = tf.reduce_mean(grad_squared, axis=vr_axis)
        grad_squared_col_mean = tf.reduce_mean(grad_squared, axis=vc_axis)
        # new_vr = (decay_rate * vr + mixing_rate * grad_squared_row_mean)
        new_vr = vr * decay_rate + grad_squared_row_mean * mixing_rate
        # new_vc = (decay_rate * vc + mixing_rate * grad_squared_col_mean)
        new_vc = vc * decay_rate + grad_squared_col_mean * mixing_rate
        cond = _Upd(cond, 'new_vr', new_vr)  # 2 (factored)
        cond = _Upd(cond, 'new_vc', new_vc)  # 3 (factored)
        # vr_update = _Wrap(tf.assign, vr, new_vr)
        # vc_update = _Wrap(tf.assign, vc, new_vc)
        # updates.extend([vr_update, vc_update])
        long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
        r_factor = tf.math.rsqrt(new_vr / long_term_mean)
        c_factor = tf.math.rsqrt(new_vc)
        # Re-insert the reduced axes so the two factors broadcast against grad.
        mult = tf.expand_dims(r_factor, vr_axis) * tf.expand_dims(
            c_factor, vc_axis)
        cond = _Upd(cond, 'mult', mult)  # 4 (factored)
        x = grad * mult
      else:
        # Unfactored path: keep a full-size second-moment estimate.
        new_v = v * decay_rate + grad_squared * mixing_rate
        cond = _Upd(cond, 'new_v', new_v)
        # v_update = _Wrap(tf.assign, v, new_v)
        # updates.append(v_update)
        x = grad * tf.math.rsqrt(new_v)

      assert self._clipping_threshold is not None

      if self._clipping_threshold is not None:
        # Scale x down only when its RMS exceeds the clipping threshold.
        clipping_denom = tf.maximum(
            tf.constant(1.0, grad_dtype),
            py_utils.ReduceRms(x) /
            tf.constant(self._clipping_threshold, grad_dtype))
        x /= clipping_denom
      cond = _Upd(cond, 'x', x)
      subtrahend = x * update_scale
      if self._beta1:
        new_m = (
            m * tf.constant(self._beta1, dtype=grad_dtype) +
            subtrahend * tf.constant(1.0 - self._beta1, dtype=grad_dtype))
        subtrahend = new_m
        cond = _Upd(cond, 'new_m', new_m)
        # updates.append(_Wrap(tf.assign, m, new_m))

      # It is critical to use assign_sub instead of tf.assign(var - subtrahend)
      #  for the case of bfloat16 activations, so as to avoid repeatedly
      #  rounding the slice value, which results in poor quality.
      cond = _Upd(cond, 'subtrahend', subtrahend)  # 5 (factored)

      # var_update = _Wrap(tf.assign_sub, var, subtrahend)
      # updates.append(var_update)

      return is_finite_checks, stats
Example #18
    def _BuildMetric(self, feed_data, classid):
        """Construct tensors and the feed_dict for Waymo metric op.

    Args:
      feed_data: a NestedMap returned by _GetData().
      classid: integer.

    Returns:
      A tuple of 3 dicts:

      - scalar_metrics: a dict mapping all the metric names to fetch tensors.
      - curves: a dict mapping all the curve names to fetch tensors.
      - feed_dict: a dict mapping the tensors in feed_tensors to feed values.
        """
        # NOTE(review): the call below is cut off in this view; its argument
        # (presumably the metric config) is missing -- confirm upstream.
        breakdown_names = config_util.get_breakdown_names_from_config(
        if feed_data is None:
            # No data: return NaN scalars and all-zero curves under the same
            # keys as the real path, together with an empty feed_dict.
            dummy_scalar = tf.constant(np.nan)
            dummy_curve = tf.zeros(
                [self.metadata.NumberOfPrecisionRecallPoints(), 2], tf.float32)
            # NOTE(review): the closing brace of this dict literal is missing
            # from this view.
            scalar_metrics = {
                'ap': dummy_scalar,
                'ap_ha_weighted': dummy_scalar
            curve_metrics = {'pr': dummy_curve, 'pr_ha_weighted': dummy_curve}

            for i, metric in enumerate(breakdown_names):
                scalar_metrics['ap_%s' % metric] = dummy_scalar
                scalar_metrics['ap_ha_weighted_%s' % metric] = dummy_scalar
                curve_metrics['pr_%s' % metric] = dummy_curve
                curve_metrics['pr_ha_weighted_%s' % metric] = dummy_curve
            return scalar_metrics, curve_metrics, {}

        # One placeholder per input array; feed_dict maps each placeholder to
        # the matching field of feed_data.
        feed_dict = {}

        f_gt_bbox = tf.placeholder(tf.float32)
        feed_dict[f_gt_bbox] = feed_data.gt.bbox

        f_gt_imgid = tf.placeholder(tf.int32)
        feed_dict[f_gt_imgid] = feed_data.gt.imgid

        f_pd_bbox = tf.placeholder(tf.float32)
        feed_dict[f_pd_bbox] = feed_data.pd.bbox

        f_pd_imgid = tf.placeholder(tf.int32)
        feed_dict[f_pd_imgid] = feed_data.pd.imgid

        f_pd_score = tf.placeholder(tf.float32)
        feed_dict[f_pd_score] = feed_data.pd.score

        num_gt_bboxes = feed_data.gt.imgid.shape[0]
        num_pd_bboxes = feed_data.pd.imgid.shape[0]
        # NOTE(review): the two tf.constant(...) calls and the
        # detection_metrics(...) call below are truncated in this view; only
        # some of their arguments are visible. num_gt_bboxes / num_pd_bboxes
        # are presumably used in the missing arguments -- confirm.
        gt_class_ids = tf.constant(classid,
        pd_class_ids = tf.constant(classid,
        ap, ap_ha, pr, pr_ha, _ = py_metrics_ops.detection_metrics(
            prediction_frame_id=tf.cast(f_pd_imgid, tf.int64),
            prediction_overlap_nlz=tf.zeros_like(f_pd_imgid, dtype=tf.bool),
            ground_truth_frame_id=tf.cast(f_gt_imgid, tf.int64),
            ground_truth_difficulty=tf.zeros_like(f_gt_imgid, dtype=tf.uint8),

        # All tensors returned by Waymo's metric op have a leading dimension
        # B=number of breakdowns. At this moment we always use B=1 to make
        # it compatible to the python code.
        scalar_metrics = {'ap': ap[0], 'ap_ha_weighted': ap_ha[0]}
        curve_metrics = {'pr': pr[0], 'pr_ha_weighted': pr_ha[0]}

        for i, metric in enumerate(breakdown_names):
            # There is a scalar / curve for every breakdown.
            scalar_metrics['ap_%s' % metric] = ap[i]
            scalar_metrics['ap_ha_weighted_%s' % metric] = ap_ha[i]
            curve_metrics['pr_%s' % metric] = pr[i]
            curve_metrics['pr_ha_weighted_%s' % metric] = pr_ha[i]
        return scalar_metrics, curve_metrics, feed_dict
Example #19
    def AssignAnchors(self,
        # NOTE(review): the rest of the parameter list is missing from this
        # view. The docstring lists anchor_bboxes, gt_bboxes,
        # gt_bboxes_labels, gt_bboxes_mask, foreground_assignment_threshold,
        # background_assignment_threshold, background_class_id, force_match
        # and similarity_fn -- confirm order and defaults upstream.
        """Assigns anchors to bboxes using a similarity function (SSD-based).

    Each anchor box is assigned to the top matching ground truth box.
    Ground truth boxes can be assigned to multiple anchor boxes.

    Assignments can result in 3 outcomes:

      - Positive assignment (if score >= foreground_assignment_threshold):
        assigned_gt_labels will reflect the assigned box label and
        assigned_cls_mask will be set to 1.0
      - Background assignment (if score <= background_assignment_threshold):
        assigned_gt_labels will be background_class_id and assigned_cls_mask
        will be set to 1.0
      - Ignore assignment (otherwise):
        assigned_gt_labels will be background_class_id and assigned_cls_mask
        will be set to 0.0

    The detection loss function would usually:

      - Use assigned_cls_mask for weighting the classification loss. The mask
        is set such that the loss applies to foreground and background
        assignments only - ignored anchors will be set to 0.
      - Use assigned_reg_mask for weighting the regression loss. The mask is set
        such that the loss applies to foreground assignments only.

    The thresholds (foreground_assignment_threshold and
    background_assignment_threshold) should be tuned per dataset.

    TODO(jngiam): Consider having a separate threshold for regression boxes; a
    separate threshold is used in PointRCNN.

    Args:
      anchor_bboxes: tf.float32. [A, 7], where [..., :] corresponds to box
        parameters (x, y, z, dx, dy, dz, r).
      gt_bboxes: tf.float32. [G, 7], where [..., :] corresponds to ground truth
        box parameters (x, y, z, dx, dy, dz, r).
      gt_bboxes_labels: tensor with shape [G]. Ground truth labels for each
        bounding box.
      gt_bboxes_mask: tensor with shape [G]. Mask for ground truth boxes, 1 iff
        the gt_bbox is a real bbox.
      foreground_assignment_threshold: Similarity score threshold for assigning
        foreground bounding boxes; scores need to be >=
        foreground_assignment_threshold to be assigned to foreground.
      background_assignment_threshold: Similarity score threshold for assigning
        background bounding boxes; scores need to be <=
        background_assignment_threshold to be assigned to background.
      background_class_id: class id to be assigned to anchors_gt_class if no
        anchor boxes match.
      force_match: Boolean specifying if force matching is enabled. If
        force matching is enabled, then matched anchors which are also the
        highest scoring with a ground-truth box are considered foreground
        matches as long as their similarity score > 0.
      similarity_fn: Function that computes the a similarity score (e.g., IOU)
        between pairs of bounding boxes. This function should take in two
        tensors corresponding to anchor and ground-truth bboxes, and return a
        matrix [A, G] with the similarity score between each pair of bboxes. The
        score must be non-negative, with greater scores representing more
        similar. The fore/background_assignment_thresholds will be applied to
        this score to determine if the an anchor is foreground, background or
        ignored. If set to None, the function will default to IOU2DRotatedBoxes.

    Returns:
      NestedMap with the following keys

      - assigned_gt_bbox: shape [A, 7] bbox parameters assigned to each anchor.
      - assigned_gt_similarity_score: shape [A] (iou) score between the anchor
        and the gt bbox.
      - assigned_gt_labels: shape [A] label assigned to bbox.
      - assigned_cls_mask: shape [A] mask for classification loss per anchor.
        This should be 1.0 if the anchor has a foreground or background
        assignment; otherwise, it will be assigned to 0.0.
      - assigned_reg_mask: shape [A] mask for regression loss per anchor.
        This should be 1.0 if the anchor has a foreground assignment;
        otherwise, it will be assigned to 0.0.
        Note: background anchors do not have regression targets.
        """
        # NOTE(review): many statements below end on an open parenthesis in
        # this view (their trailing arguments are missing), and the final
        # return is cut off. The comments describe only what is visible.
        if similarity_fn is None:
            similarity_fn = self.IOU2DRotatedBoxes

        # Shape validation.
        anchor_bboxes = py_utils.HasShape(anchor_bboxes, [-1, 7])
        num_anchor_bboxes, _ = py_utils.GetShape(anchor_bboxes, 2)
        gt_bboxes = py_utils.HasShape(gt_bboxes, [-1, 7])
        num_gt_bboxes, _ = py_utils.GetShape(gt_bboxes, 2)

        # Compute similarity score and reduce max by anchors and by ground-truth.
        similarity_score = similarity_fn(anchor_bboxes, gt_bboxes)
        similarity_score = py_utils.HasShape(
            similarity_score, [num_anchor_bboxes, num_gt_bboxes])

        # Reduce over ground-truth boxes, so we have the max score per anchor.
        anchor_max_score = tf.reduce_max(similarity_score, axis=1)
        anchor_max_idx = tf.argmax(similarity_score, axis=1)

        if force_match:
            # Reduce over anchors, so we have the max score per ground truth box.
            # NOTE(review): truncated call -- the reduction axis (and any
            # keepdims argument) is not visible here.
            gt_max_score = tf.reduce_max(similarity_score,

            # Force matches occur when the top matching gt bbox for an anchor is the
            # top matching anchor for the gt bbox. When force matching, we match
            # these boxes as long as their similarity score exceeds 0.
            force_matches = (
                tf.equal(similarity_score, gt_max_score)
                & tf.equal(similarity_score, anchor_max_score[..., tf.newaxis])
                & tf.greater(similarity_score, 0.)
                & tf.cast(gt_bboxes_mask[tf.newaxis, ...], tf.bool))
            force_match_indicator = tf.reduce_any(force_matches, axis=1)
            force_match_idx = tf.argmax(tf.cast(force_matches, tf.int32),

            # In assigning foreground/background anchors later, force_match_indicator
            # is used to determine which anchors are force foreground, and the index
            # assigned will be taken from anchor_max_idx.

            # Force matchers must also be the max scoring gt bbox per anchor.
            # We overwrite anchor_max_idx to ensure that the right match is done.
            anchor_max_idx = tf.where(force_match_indicator, force_match_idx,

        # Ensure that max score boxes are not padded boxes by setting score to 0
        # for boxes that are padded.
        gathered_mask = tf.batch_gather(gt_bboxes_mask, anchor_max_idx)
        anchor_max_score = tf.where(tf.equal(gathered_mask, 1),

        # Boolean tensors corresponding to whether an anchor is background or
        # foreground based on thresholding.
        background_anchors = tf.less_equal(anchor_max_score,
        foreground_anchors = tf.greater_equal(anchor_max_score,
        if force_match:
            # Background anchors are below threshold and not force matches.
            background_anchors &= ~force_match_indicator
            # Foreground anchors are above thresholds or force matches.
            foreground_anchors |= force_match_indicator

        # Add dummy background bbox to gt_boxes to facilitate batch gather.
        dummy_bbox = tf.constant([[0, 0, 0, 1, 1, 1, 0]], dtype=tf.float32)

        # Since we are concatenating the dummy bbox, the index corresponds to the
        # number of boxes.
        dummy_bbox_idx = py_utils.GetShape(gt_bboxes, 1)[0]

        gt_bboxes = tf.concat([gt_bboxes, dummy_bbox], axis=0)
        gt_bboxes_labels = tf.concat([gt_bboxes_labels, [background_class_id]],

        # Gather indices so that all foreground boxes are gathered from gt_bboxes,
        # while all background and ignore boxes gather the dummy_bbox.
        anchor_gather_idx = tf.where(
            foreground_anchors, anchor_max_idx,

        # Gather the bboxes and weights.
        assigned_gt_bbox = tf.batch_gather(gt_bboxes, anchor_gather_idx)
        assigned_gt_labels = tf.batch_gather(gt_bboxes_labels,

        # Set masks for classification and regression losses.
        assigned_cls_mask = tf.cast(background_anchors | foreground_anchors,
        assigned_reg_mask = tf.cast(foreground_anchors, tf.float32)

        # NOTE(review): the NestedMap fields are missing from this view; see
        # the Returns section for the keys the docstring promises.
        return py_utils.NestedMap(
Example #20
    def ComputeAndUpdateMoments(self, theta, inputs, paddings=None, **kwargs):
        """Computes moments and updates state.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      inputs: The inputs tensor.  Shaped [..., dim].
      paddings: The paddings tensor.  Shaped [..., 1], with the same rank as the
        input tensor.
      **kwargs: Additional inputs.

    Returns:
      Tuple of (mean, variance, beta, gamma).
        """
        p = self.params
        if paddings is None:
            # Fall back to the layer's default paddings for these inputs.
            paddings = self._GetDefaultPaddings(inputs)
        # Attach a check that the last dimension of paddings is 1 to inputs.
        inputs = py_utils.with_dependencies([
            py_utils.assert_shape_match([tf.shape(paddings)[-1]], [1]),
        ], inputs)
        with tf.name_scope(p.name):
            if self.do_eval or p.freeze_bn_stats:
                # The mean and variance used for normalization.
                # NOTE(review): the tuple below is cut off in this view, and
                # the lines that follow look like a separate (training)
                # branch whose `else:` is missing -- confirm upstream.
                norm_mean, norm_variance = (self.vars.moving_mean,
                # Reduce over every dimension except the last one.
                rank = tf.rank(paddings)
                reduce_over_dims = tf.range(0, rank - 1)
                # NOTE(review): trailing arguments of ComputeMoments are
                # missing from this view.
                mean, variance = ComputeMoments(
                    inputs, paddings, reduce_over_dims, None,

                # NOTE(review): this call appears to be two calls fused by
                # dropped lines (one for moving_mean, one for
                # moving_variance) -- confirm upstream.
                py_utils.UpdateBatchNormVars(self.vars.moving_mean, mean,
                                             variance, self._decay)
                # Add some summaries for visualization.
                summary_utils.histogram('%s_mean' % p.name,
                                        tf.cast(mean, tf.float32))
                summary_utils.histogram('%s_variance' % p.name,
                                        tf.cast(variance, tf.float32))
                # NOTE(review): the call openers for the argument lists below
                # are missing from this view; only the summary names and
                # values remain.
                    '%s_moving_mean' % p.name,
                    tf.cast(self.vars.moving_mean, tf.float32))
                    '%s_moving_variance' % p.name,
                    tf.cast(self.vars.moving_variance, tf.float32))
                    '%s_mean_diff' % p.name,
                        tf.cast(mean, self.vars.moving_mean.dtype.base_dtype) -
                        self.vars.moving_mean, tf.float32))
                    '%s_variance_diff' % p.name,
                                self.vars.moving_variance.dtype.base_dtype) -
                        self.vars.moving_variance, tf.float32))
                if p.use_moving_avg_in_training:
                    # Use the global statistics for normalization.
                    # Control dependencies on mean and variance make sure
                    # moving_mean and variance will be updated for every training step.
                    norm_mean = py_utils.with_dependencies(
                        [mean], self.vars.moving_mean)
                    norm_variance = py_utils.with_dependencies(
                        [variance], self.vars.moving_variance)
                    # NOTE(review): an `else:` appears to be missing here; as
                    # written the two assignments below always override the
                    # moving-average values set just above.
                    # Use the batch statistics for normalization.
                    norm_mean = mean
                    norm_variance = variance

            norm_mean = py_utils.CheckNumerics(
                norm_mean, 'mean of %s failed numeric check' % p.name)
            norm_variance = py_utils.CheckNumerics(
                norm_variance, 'variance of %s failed numeric check' % p.name)

            beta, gamma = self._GetBetaGamma(theta, inputs, **kwargs)
            return norm_mean, norm_variance, beta, gamma
Example #21
    def FProp(self, theta, input_batch):
        """Embeds source ids and transforms with TransformerStack.

    Args:
      theta: A `.NestedMap` object containing weights' values of this
        layer and its children layers.
      input_batch: A `.NestedMap` with fields:

        - ids: The inputs tensor. It is expected to be of shape [batch, time].
        - paddings: The paddings tensor. Expected shape [batch, time].
        - task_ids: If p.task_emb is provided, must contain per-token task
            ids of shape [batch, time].

    Returns:
      A NestedMap containing

      - encoded: The encoded features, either a tensor of shape
        [time, batch, depth], or a list of tensors if is_transparent is set.
      - padding: of shape [time, batch]
      - segment_id: [time, batch] if packed inputs are supported by the model
        (and all layers), or None otherwise.
      - embedded_inputs: [time, batch, depth] embedded inputs tokens without
        positional encodings.
        """

        p = self.params
        with tf.name_scope(p.name):
            src_segment_id = None
            src_segment_pos = None
            # Attach a rank-2 check on the ids to the tensor used below.
            input_ids = py_utils.with_dependencies([
                py_utils.assert_equal(tf.rank(input_batch.ids), 2)
            ], input_batch.ids)

            if (not py_utils.use_tpu()
                    and tf.flags.FLAGS.transformer_encoder_truncates_inputs):
                # Longest unpadded length in the batch: per-row sum of
                # (1 - padding), then the max over rows.
                max_seq_length = tf.cast(
                    tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings,
                                                1)), tf.int32)
                # NOTE(review): the check inside with_dependencies is cut off
                # in this view; what remains compares something with True over
                # the paddings past max_seq_length.
                paddings = py_utils.with_dependencies([
                        tf.constant(True, tf.bool),
                            input_batch.paddings[:, max_seq_length:] > 0.5))
                ], input_batch.paddings)
                # Truncate ids and paddings to the longest real sequence.
                input_ids = input_ids[:, :max_seq_length]
                paddings = paddings[:, :max_seq_length]
                if p.packed_input:
                    # NOTE(review): both slices below are truncated in this view.
                    src_segment_id = input_batch.segment_ids[:, :
                    src_segment_pos = input_batch.segment_pos[:, :
                # NOTE(review): an `else:` appears to be missing here; as
                # written the untruncated paddings and segment tensors below
                # override the truncated ones set above.
                paddings = input_batch.paddings
                if p.packed_input:
                    src_segment_id = input_batch.segment_ids
                    src_segment_pos = input_batch.segment_pos

            max_time = tf.shape(input_ids)[1]

            # Input token embeddings + positional embeddings
            if not p.shared_emb:
                input_embs = self.token_emb.EmbLookup(
                    theta.token_emb, tf.reshape(input_ids, [-1]))
                # NOTE(review): an `else:` appears to be missing here; as
                # written the softmax lookup always replaces the token_emb
                # lookup.
                input_embs = self.softmax.EmbLookup(
                    theta.softmax, tf.reshape(input_ids, [-1]))

            input_embs = tf.reshape(input_embs,
                                    [-1, max_time, p.token_emb.embedding_dim])
            # [time, batch, dim]
            orig_input_embs = tf.transpose(input_embs, [1, 0, 2])

            if p.packed_input:
                position_embs = self.position_emb.FPropWithPosition(
                    theta.position_emb, src_segment_pos)
                # NOTE(review): an `else:` appears to be missing here as well;
                # the FProp(max_time) path below looks like the non-packed
                # branch.
                position_embs = self.position_emb.FProp(
                    theta.position_emb, max_time)
                position_embs = tf.reshape(
                    position_embs, [1, max_time, p.token_emb.embedding_dim])
            input_embs += position_embs
            if p.task_emb:
                # NOTE(review): truncated call -- the ids argument is missing.
                input_embs += self.task_emb.EmbLookup(theta.task_emb,

            if p.model_dim != p.token_emb.embedding_dim:
                # Project embeddings to the model dimension when they differ.
                input_embs = self.emb_proj.FProp(theta.emb_proj, input_embs)

            paddings = tf.cast(tf.transpose(paddings), py_utils.FPropDtype(p))
            if p.packed_input:
                src_segment_id = tf.transpose(src_segment_id)
            # NOTE(review): truncated call -- the input argument is missing.
            input_embs = self.input_dropout.FProp(theta.input_dropout,

            # [time, batch, dim]
            transformer_input = tf.transpose(input_embs, [1, 0, 2])

        if not self.do_eval and p.apply_source_mask:
            # Augment padding for masked source word positions.
            dtype = paddings.dtype
            source_mask = tf.where(tf.equal(input_ids, p.source_mask_id),
                                   tf.ones_like(input_ids, dtype=dtype),
                                   tf.zeros_like(input_ids, dtype=dtype))
            # Make sure padding is between 0 and 1.
            paddings = tf.clip_by_value(paddings + tf.transpose(source_mask),
                                        0.0, 1.0)

        # NOTE(review): the transformer_stack call and the returned NestedMap
        # are both cut off in this view; orig_input_embs is presumably
        # returned as embedded_inputs -- confirm upstream.
        encoded, padding, segment_id = self.transformer_stack.FProp(
            theta.transformer_stack, transformer_input, paddings,
        return py_utils.NestedMap(encoded=encoded,
Example #22
 def Update(self, value):
     """Adds `value` to the accumulator.

     `value` is cast to `self.dtype`, added to the result of `GetValue()`,
     and the sum is stored through `SetValue()`.
     """
     current = self.GetValue()
     increment = tf.cast(value, self.dtype)
     self.SetValue(current + increment)
Example #23
    # NOTE(review): the parameter list below is cut off in this listing. The body
    # also calls `pre_beam_search_step_callback` and
    # `post_beam_search_step_callback`, which the docstring lists as arguments.
    def _BeamSearchStep(self, theta, encoder_outputs, cur_step, step_ids,
                        core_bs_states, other_states, num_hyps_per_beam,
        """Extend beam search hyps for one step.

        Notation:

          | num_beams = Number of source sequences to be decoded.
          | num_hyps_per_beam = Number of hyps to keep per source sequence.
          | num_hyps = num_beams * num_hyps_per_beam
          | src_seq_len = Number of time steps in the source sequence.
          | src_batch = Number of examples in the source sequence.
          | tgt_seq_len = Maximum allowed time steps in the target sequence.
          | tgt_batch = num_hyps_per_beam * src_batch

        Args:
          theta: A `.NestedMap` object containing weights' values of the
            decoder layer and its children layers.
          encoder_outputs: A `.NestedMap` containing encoder outputs to be
            passed to the callbacks.
          cur_step: A scalar int tensor, the current time step, 0-based.
          step_ids: An int tensor of shape [num_hyps, 1]. The input ids to the
            current search step.
          core_bs_states: A tuple of core beam search states. This list is
            maintained by this helper class.
          other_states: A `.NestedMap` of other beam search states. This
            `.NestedMap` is managed and updated by the client. It is expected
            that each of its member tensors are of rank >= 1. t[i, ...] is the
            state of the i-th hyp at the beginning of this search step.
          num_hyps_per_beam: Num of hyps to keep per beam.
          pre_beam_search_step_callback: The `PreBeamSearchStepCallback`
            callback. See class header comments for more details.
          post_beam_search_step_callback: The `PostBeamSearchStepCallback`
            callback. See class header comments for more details.

        Returns:
          A tuple of following elements for the next beam search step,
          (next step, all_done, step_ids, core_bs_states, other_states)
        """
        p = self.params

        # The client callback yields `bs_results` (its `log_probs` and
        # `atten_probs` are consumed below) and the updated `other_states`.
        bs_results, other_states = pre_beam_search_step_callback(
            theta, encoder_outputs, step_ids, other_states, num_hyps_per_beam)

        # Unpack the core states carried over from the previous step.
        (best_scores, cumulative_scores, in_scores, in_hyps, in_prev_hyps,
         in_done_hyps, in_atten_probs) = core_bs_states

        # NOTE(review): the argument list of `ops.beam_search_step` is cut off
        # in this listing; none of the unpacked core states is passed in the
        # lines shown, and the call is never closed.
        (out_best_scores, out_cumulative_scores, out_scores, out_hyps,
         out_prev_hyps, out_done_hyps, out_atten_probs,
         all_done) = ops.beam_search_step(
             tf.cast(bs_results.log_probs, dtype=p.dtype),
             tf.cast(bs_results.atten_probs, dtype=p.dtype),
             bs_results.is_last_chunk if self._model_uses_eoc_id else [],

        # Row `cur_step` of `out_hyps`, reshaped like `step_ids`, becomes the
        # input ids of the next step.
        new_step_ids = tf.reshape(out_hyps[cur_step, :], tf.shape(step_ids))

        # [num_hyps_per_beam * num_beams].
        old_hyp_ids = tf.reshape(
            tf.slice(out_prev_hyps, begin=[cur_step, 0], size=[1, -1]), [-1])

        if p.batch_major_compute:
            # Transformed the indices into the key/value cache for fast decoding
            # (prefix_states in other_states) due to the num_hyps dimension of
            # cache is computed as num_beams by num_hyps_per_beam, which is different
            # from the old_hyp_ids assumption (num_hyps_per_beam by num_beams).
            # Both transpose and recomputation are required to correct the indices.
            num_beams = tf.shape(best_scores)[0]
            # [num_beams * num_hyps_per_beam].
            # NOTE(review): the outer `tf.reshape` below has lost its target
            # shape and closing parenthesis in this listing.
            old_hyp_ids_in_cache_order = tf.reshape(
                tf.transpose(tf.reshape(old_hyp_ids, [num_hyps_per_beam, -1])),
            old_hyp_ids_in_cache_order = (
                (old_hyp_ids_in_cache_order % num_beams) * num_hyps_per_beam +
                old_hyp_ids_in_cache_order // num_beams)

        # NOTE(review): this tuple is cut off; its closing line is missing.
        new_bs_states = (out_best_scores, out_cumulative_scores, out_scores,
                         out_hyps, out_prev_hyps, out_done_hyps,

        def ReOrderHyps(key, x_in):
            """Reorders x_in based on prev hyp ids."""
            correct_old_hyp_ids = (old_hyp_ids_in_cache_order
                                   if p.batch_major_compute else old_hyp_ids)
            # NOTE(review): `else:` lines appear to be missing below. As shown,
            # the gather on the last axis is immediately overwritten by the
            # gather on axis 0, and `return x_in` can never be reached.
            if (isinstance(x_in, tf.Tensor) and x_in.shape.ndims):
                if x_in.shape.ndims > 2 and not p.batch_major_state:
                    # Use corrected indices only here for batch major compute as key/value
                    # caches are the states being affected.
                    x_out = tf.gather(x_in, correct_old_hyp_ids, axis=1)
                elif key in POSSIBLY_TIME_MAJOR_STATE_KEYS:
                    x_out = tf.gather(x_in, old_hyp_ids, axis=-1)
                    x_out = tf.gather(x_in, correct_old_hyp_ids)
                return x_out
                return x_in

        # Apply the reordering to every entry of the client-managed states.
        new_other_states = other_states.TransformWithKey(ReOrderHyps)

        final_other_states = post_beam_search_step_callback(
            theta, encoder_outputs, new_step_ids, new_other_states)

        # NOTE(review): the returned tuple is cut off; the docstring says it
        # ends with the other states.
        return (cur_step + 1, all_done, new_step_ids, new_bs_states,
Example #24
    def _StreamMoments(self, inputs, paddings, cached_sum, cached_count,
                       cached_var):
        """Computes mean and variance over the valid data points in inputs.

        Sums, counts and squared deviations are accumulated along the time
        axis (axis 1) with a cumulative sum and offset by the cached values
        carried over from earlier calls.

        Args:
          inputs: [B, T, F, N, G] or [B, T, N, G]
          paddings: [B, T, 1, 1, 1] or [B, T, 1, 1]
          cached_sum: [B, 1, 1, N, 1] or [B, 1, N, 1]
          cached_count: same shape as cached_sum.
          cached_var: same shape as cached_sum.

        Returns:
          mean: [B, T, 1, N, 1] or [B, T, N, 1]
          variance: same shape as mean.
          new_cached_sum: same shape as cached_sum.
          new_cached_count: same shape as cached_count.
          new_cached_var: same shape as cached_var.
        """
        tf.logging.vlog(1, 'inputs: %r', inputs)
        tf.logging.vlog(1, 'paddings: %r', paddings)
        tf.logging.vlog(1, 'cached_sum: %r', cached_sum)
        tf.logging.vlog(1, 'cached_count: %r', cached_count)

        # Zero out padded positions so they do not contribute to the sums.
        mask = 1.0 - paddings
        inputs *= tf.cast(mask, inputs.dtype)

        input_rank = py_utils.GetRank(inputs)
        assert input_rank is not None, (f'inputs rank must be staic for '
                                        f'{repr(inputs)}')
        reduce_over_dims = list(range(input_rank))
        # Skip B, T, and N. Reduce {F,G} or just G.
        reduce_over_dims = reduce_over_dims[2:-2] + reduce_over_dims[-1:]
        tf.logging.vlog(1, 'reduce_over_dims: %s', reduce_over_dims)

        # [B, T, 1, N, 1] or [B, T, N, 1]
        sum_v = tf.reduce_sum(inputs, reduce_over_dims, keepdims=True)
        sum_v = tf.math.cumsum(sum_v, axis=1)
        sum_v += cached_sum

        # [B, T, 1, 1, 1] or [B, T, 1, 1]
        count_v = tf.reduce_sum(mask, reduce_over_dims, keepdims=True)
        count_v = tf.math.cumsum(count_v, axis=1)
        # The mask has size 1 on every reduced axis, so each valid frame must
        # be weighted by the number of input elements folded into `sum_v`:
        # F * G for rank-5 inputs, G for rank-4 inputs. Deriving it from
        # `reduce_over_dims` keeps the count consistent with the sum.
        input_shape = py_utils.GetShape(inputs)
        multiplier = 1
        for dim in reduce_over_dims:
            multiplier *= input_shape[dim]
        count_v *= tf.cast(multiplier, count_v.dtype)
        count_v += cached_count
        # Avoid dividing by zero when nothing valid has been seen yet.
        count_v = tf.maximum(count_v, 1.0)

        tf.logging.vlog(1, 'sum_v: %r', sum_v)
        tf.logging.vlog(1, 'count_v: %r', count_v)

        mean = sum_v / count_v
        sum_vv = tf.reduce_sum((inputs - mean)**2 * mask,
                               reduce_over_dims,
                               keepdims=True)
        sum_vv = tf.math.cumsum(sum_vv, axis=1)
        sum_vv += cached_var

        # The last time step carries the running state for the next call.
        cached_sum = sum_v[:, -1:]
        cached_count = count_v[:, -1:]
        cached_var = sum_vv[:, -1:]

        variance = py_utils.with_dependencies([
            py_utils.assert_greater_equal(sum_vv, tf.cast(0, sum_vv.dtype)),
        ], sum_vv / count_v)
        return mean, variance, cached_sum, cached_count, cached_var
Example #25
    def _Extract(self, features):
        """Extracts per-laser range images and embeds xyz onto each pixel.

        Args:
          features: A mapping from feature name to a value accepted by
            `_Dense`. Keys read here are 'pose', '<laser>_<returns>',
            '<laser>_<returns>_shape', '<laser>_extrinsics',
            '<laser>_beam_inclination_min' / '_max' for CBR lasers, and
            '<laser>_beam_inclinations' / '<laser>_pose' for GBR lasers.

        Returns:
          A `.NestedMap` with '<laser>_extrinsics' and
          '<laser>_beam_inclinations' for every laser, '<laser>_pose' for GBR
          lasers, and for every (laser, returns) pair a `.NestedMap` holding
          the xyz, features and mask of that range image.
        """
        p = self.params
        all_lasers = p.cbr_laser_names + p.gbr_laser_names
        ri_outputs = {}
        outputs = {}
        frame_pose = tf.reshape(_Dense(features['pose']), [4, 4])
        for laser in all_lasers:
            # Extract range images.
            for returns in p.returns:
                ri_shape = tf.reshape(
                    _Dense(features['%s_%s_shape' % (laser, returns)]), [-1])
                range_image = tf.reshape(
                    _Dense(features['%s_%s' % (laser, returns)]), ri_shape)

                shape_to_check = (p.cbr_ri_shape if laser in p.cbr_laser_names
                                  else p.gbr_ri_shape)
                range_image = py_utils.HasShape(range_image, shape_to_check)

                ri_outputs['%s_%s' % (laser, returns)] = range_image

            # Extract beam inclinations and extrinsics
            outputs['%s_extrinsics' % laser] = tf.reshape(
                _Dense(features['%s_extrinsics' % laser]), [4, 4])

        # CBRs have uniform inclination
        for laser in p.cbr_laser_names:
            beam_inclination_min = tf.reshape(
                _Dense(features['%s_beam_inclination_min' % laser]), [])
            beam_inclination_max = tf.reshape(
                _Dense(features['%s_beam_inclination_max' % laser]), [])
            outputs['%s_beam_inclinations' % laser] = tf.stack(
                [beam_inclination_min, beam_inclination_max], axis=0)

        # GBRs have non-uniform inclinations defined by 64 floats.
        for laser in p.gbr_laser_names:
            outputs['%s_beam_inclinations' % laser] = tf.reshape(
                _Dense(features['%s_beam_inclinations' % laser]), [64])

        # Embed xyz onto each range image pixel.
        for laser in all_lasers:
            extrinsics = outputs['%s_extrinsics' % laser]
            inclinations = outputs['%s_beam_inclinations' % laser]
            if laser in p.cbr_laser_names:
                ri_shape = p.cbr_ri_shape

                # Convert from 2-tuple range inclination to the full range
                # via linear interpolation.
                # CBR lasers currently are always uniform inclinations specified by a
                # length 2 vector.
                height = ri_shape[0]
                min_inclination = inclinations[0]
                max_inclination = inclinations[1]
                diff = max_inclination - min_inclination
                ratio = (.5 + tf.cast(tf.range(
                    0, height), tf.float32)) / tf.cast(height, tf.float32)
                # interpolate from min to max inclination.
                inclinations = (ratio * diff) + min_inclination
            else:
                # GBR lasers keep their per-beam inclinations as extracted;
                # only the static shape differs.
                ri_shape = p.gbr_ri_shape

            # Only GBR lasers carry a per-pixel pose.
            pixel_pose = None
            if laser in p.gbr_laser_names:
                pixel_pose = tf.reshape(_Dense(features['%s_pose' % laser]),
                                        shape=p.gbr_ri_shape[0:2] + [4, 4])
                outputs['%s_pose' % laser] = pixel_pose

            for returns in p.returns:
                range_image = ri_outputs['%s_%s' % (laser, returns)]
                range_image = tf.reshape(range_image, ri_shape)
                # Pixels whose first channel is negative are treated as
                # invalid.
                range_image_mask = range_image[..., 0] >= 0
                ri_xyz = tf.cast(
                    self._XYZFromRangeImage(range_image, range_image_mask,
                                            extrinsics, inclinations,
                                            pixel_pose, frame_pose),
                    tf.float32)

                # Produce the NestedMap of xyz, features, mask.
                # NOTE(review): key names follow the comment above; confirm
                # against the consumers of this extractor.
                ri_result = py_utils.NestedMap({
                    'xyz': ri_xyz,
                    'features': range_image,
                    'mask': tf.cast(range_image_mask, tf.float32),
                })

                outputs['%s_%s' % (laser, returns)] = ri_result

        return py_utils.NestedMap(outputs)
Example #26
    def _StringsToIdsImpl(self, strs, max_length, append_eos, languages):
        """Takes a tensor of strings and returns id/padding tensors.

        This generates `token_ids`, `target_ids`, and `paddings` in the format
        that is expected for tokenizers. This performs padding to a fixed
        length and appends the end-of-sentence token as appropriate.

        Args:
          strs: a string Tensor.
          max_length: a python integer. The second dimension of the returned
            arrays. All sequences are padded or truncated to that length.
          append_eos: a python bool. See `BaseTokenizer` for explanation.
          languages: A vector of strings with the same length as `strs`.

        Returns:
          A tuple of 3 tensors:

          - token_ids: a tensor of sequences of WPM ids starting with SOS.
            Sequences always end with EOS unless the sequence exceeds the
            maximum length. Always padded with EOS.
          - target_ids: a tensor of sequences of WPM ids not starting with SOS
            but ending with EOS. Always padded with EOS.
          - paddings: a tensor of floats indicating, at each position, whether
            the corresponding position is padded.
        """
        p = self.params
        if append_eos is None:
            append_eos = p.append_eos

        batch_size = py_utils.GetShape(strs)[0]
        token_ids_ta = tf.TensorArray(tf.int32, batch_size)
        target_ids_ta = tf.TensorArray(tf.int32, batch_size)
        paddings_ta = tf.TensorArray(tf.float32, batch_size)

        def _TokenizeOneSentence(i, strs, token_ids_ta, target_ids_ta,
                                 paddings_ta):
            """Tokenizes a single sentence."""
            ids, _ = self._wpm_encoder.Encode(strs[i])

            if append_eos:
                ids = tf.concat([ids, [self.eos_id]], axis=0)

            # This truncates after the eos is added, so some sentences might
            # not have </s> at the end.
            token_ids_ta = token_ids_ta.write(
                i,
                py_utils.PadOrTrimTo(tf.concat([[self.sos_id], ids], axis=0),
                                     [max_length], self.eos_id))
            target_ids_ta = target_ids_ta.write(
                i, py_utils.PadOrTrimTo(ids, [max_length], self.eos_id))
            # Real tokens get padding 0; positions added by padding get 1.
            paddings_ta = paddings_ta.write(
                i,
                py_utils.PadOrTrimTo(tf.zeros_like(ids, dtype=tf.float32),
                                     [max_length], 1.))

            return i + 1, strs, token_ids_ta, target_ids_ta, paddings_ta

        # One loop iteration per sentence in the batch.
        _, _, token_ids_ta, target_ids_ta, paddings_ta = tf.while_loop(
            lambda i, *_: i < batch_size,
            _TokenizeOneSentence,
            loop_vars=(tf.constant(0, tf.int32), strs, token_ids_ta,
                       target_ids_ta, paddings_ta))

        token_ids = token_ids_ta.stack()
        target_ids = target_ids_ta.stack()
        paddings = paddings_ta.stack()

        if not p.pad_to_max_length:
            # Trim every output to the longest unpadded sequence in the batch;
            # the length must be an integer to be usable as a slice bound.
            maxlen = tf.cast(
                tf.round(tf.reduce_max(tf.reduce_sum(1.0 - paddings, axis=1))),
                tf.int32)
            token_ids = token_ids[:, :maxlen]
            target_ids = target_ids[:, :maxlen]
            paddings = paddings[:, :maxlen]

        return token_ids, target_ids, paddings
Example #27
    def _StringsToIdsImpl(self, strs, max_length, append_eos, languages):
        """Tokenizes a batch of strings into id and padding tensors.

        Args:
          strs: a string Tensor; one sentence per element.
          max_length: a python integer. Every sequence is padded or trimmed to
            this length.
          append_eos: whether to append `self.eos_id` to each sentence. When
            None, `self.params.append_eos` is used.
          languages: unused.

        Returns:
          A tuple (token_ids, target_ids, paddings). `token_ids` always starts
          with `self.sos_id`; `target_ids` starts with it only when
          `params.prepend_sos` is set. Both are padded with 0. `paddings` is 0
          at real positions and 1 at padded ones. Unless
          `params.pad_to_max_length` is set, all three are trimmed to the
          longest unpadded sequence in the batch.
        """
        del languages
        p = self.params
        if append_eos is None:
            append_eos = p.append_eos

        batch_size = py_utils.GetShape(strs)[0]
        token_ids_ta = tf.TensorArray(tf.int32, batch_size)
        target_ids_ta = tf.TensorArray(tf.int32, batch_size)
        paddings_ta = tf.TensorArray(tf.float32, batch_size)

        def _TokenizeOneSentence(i, text, token_ids_ta, target_ids_ta,
                                 paddings_ta):
            """Tokenizes a single sentence."""
            # A tensor index needs a gather; a Python index can subscript.
            if tf.is_tensor(i):
                text_i = tf.gather(text, i)
            else:
                text_i = text[i]
            ids = self._tokenizer.tokenize(text_i).merge_dims(0, -1)

            if append_eos:
                ids = tf.concat([ids, [self.eos_id]], axis=0)
            sos_ids = tf.concat([[self.sos_id], ids], axis=0)
            if p.prepend_sos:
                ids = sos_ids

            # This truncates after the EOS is added, so some sentences might
            # not have EOS at the end.
            token_ids_ta = token_ids_ta.write(
                i, py_utils.PadOrTrimTo(sos_ids, [max_length], 0))
            target_ids_ta = target_ids_ta.write(
                i, py_utils.PadOrTrimTo(ids, [max_length], 0))
            paddings_ta = paddings_ta.write(
                i,
                py_utils.PadOrTrimTo(tf.zeros_like(ids, dtype=tf.float32),
                                     [max_length], 1.))

            # Hand back the loop's own `text` variable rather than the
            # closed-over `strs`.
            return i + 1, text, token_ids_ta, target_ids_ta, paddings_ta

        # One loop iteration per sentence in the batch.
        _, _, token_ids_ta, target_ids_ta, paddings_ta = tf.while_loop(
            lambda i, *_: i < batch_size,
            _TokenizeOneSentence,
            loop_vars=(tf.constant(0, tf.int32), strs, token_ids_ta,
                       target_ids_ta, paddings_ta))

        token_ids = token_ids_ta.stack()
        target_ids = target_ids_ta.stack()
        paddings = paddings_ta.stack()

        if not p.pad_to_max_length:
            # The length must be an integer to be usable as a slice bound.
            maxlen = tf.cast(
                tf.round(tf.reduce_max(tf.reduce_sum(1.0 - paddings, axis=1))),
                tf.int32)
            token_ids = token_ids[:, :maxlen]
            target_ids = target_ids[:, :maxlen]
            paddings = paddings[:, :maxlen]

        return token_ids, target_ids, paddings
Example #28
 def MaybeCastToFPropDtype(x):
     """Casts `x` to the fprop dtype when it has the params dtype.

     Args:
       x: A value with a `dtype` attribute, or None.

     Returns:
       `x` cast to `self._params.fprop_dtype` if its dtype equals
       `self._params.dtype`; otherwise `x` unchanged (including None).
     """
     if x is not None and x.dtype == self._params.dtype:
         return tf.cast(x, self._params.fprop_dtype)
     # Anything else is passed through untouched instead of falling off the
     # end of the function and yielding None.
     return x
Example #29
def _ComputePaddings(ids, eos_id):
    """Marks every position that comes after the first `eos_id` as padding.

    Args:
      ids: An integer tensor of token ids; the last axis is the sequence axis.
      eos_id: The end-of-sentence id.

    Returns:
      A tensor shaped and typed like `ids`: 0 up to and including the first
      `eos_id` along the last axis, 1 after it.
    """
    is_eos = tf.cast(tf.equal(ids, eos_id), tf.int32)
    # eos_in_prefix[i, j] = any(ids[i, k] == eos_id for k in range(j))
    eos_in_prefix = tf.cumsum(is_eos, axis=-1, exclusive=True)
    return tf.where(tf.equal(eos_in_prefix, 0), tf.zeros_like(ids),
                    tf.ones_like(ids))
Example #30
  def _GetMask(self,
               batch_size,
               choose_range,
               mask_size,
               global_seed,
               max_length=None,
               masks_per_frame=0.0,
               multiplicity=1,
               dtype=tf.float32,
               max_ratio=1.0):
    """Returns fixed size multi-masks starting from random positions.

    A multi-mask is a mask obtained by applying multiple masks.

    This function when max_length is given:
      1) Sample random mask lengths less than max_length with shape
         (batch_size, multiplicity).
      2) Truncate lengths to a max of (choose_range * max_ratio),
         so that each mask is fully contained within the corresponding sequence.
      3) Random sample start points of shape (batch_size, multiplicity)
         with in (choose_range - lengths).
      4) For each batch, multiple masks (whose number is given by the
         multiplicity) are constructed.
      5) Return a mask of shape (batch_size, mask_size) where masks are
         obtained by composing the masks constructed in step 4).
         If masks_per_frame > 0, the number is given by
         min(masks_per_frame * choose_range, multiplicity).
         If not, all the masks are composed. The masked regions are set to zero.

    This function when max_length is not given:
      1) Sample random mask lengths less than (choose_range * max_ratio)
         with shape (batch_size, multiplicity).
      2) Proceed to steps 3), 4) and 5) of the above.

    Args:
      batch_size: Batch size. Integer number.
      choose_range: Range within which the masked entries must lie. Tensor of
        shape (batch_size,).
      mask_size: Size of the mask. Integer number.
      global_seed: an integer seed tensor for stateless random ops.
      max_length: Maximum number of allowed consecutive masked entries. Integer
        number or None.
      masks_per_frame: Number of masks per frame. Float number. If > 0, the
        multiplicity of the mask is set to be masks_per_frame * choose_range.
      multiplicity: Maximum number of total masks. Integer number.
      dtype: Data type.
      max_ratio: Maximum portion of the entire range allowed to be masked. Float
        number.

    Returns:
      mask: a fixed size multi-mask starting from a random position with shape
      (batch_size, mask_size).
    """
    # NOTE(review): the parameter order follows the Args list; the default
    # values are not visible in the surrounding code and should be confirmed
    # against callers.
    p = self.params
    # Non-empty random seed values are only used for testing or when using
    # stateless random ops. seed_1 and seed_2 are set separately to avoid
    # correlation of mask size and mask position.
    if p.use_input_dependent_random_seed:
      seed_1 = global_seed + 1
      seed_2 = global_seed + 2
    elif p.random_seed:
      seed_1 = p.random_seed + 1
      seed_2 = 2 * p.random_seed
    else:
      seed_1 = p.random_seed
      seed_2 = p.random_seed
    # Sample lengths for multiple masks.
    if max_length and max_length > 0:
      max_length = tf.broadcast_to(tf.cast(max_length, dtype), (batch_size,))
    else:
      max_length = tf.cast(choose_range, dtype=dtype) * max_ratio
    random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)
    masked_portion = random_uniform(
        shape=(batch_size, multiplicity),
        minval=0.0,
        maxval=1.0,
        dtype=dtype,
        seed=seed_1)
    masked_frame_size = self.EinsumBBmBm(max_length, masked_portion)
    masked_frame_size = tf.cast(masked_frame_size, dtype=tf.int32)
    # Make sure the sampled length was smaller than max_ratio * length_bound.
    # Note that sampling in this way was biased
    # (shorter sequence may over-masked.)
    choose_range = tf.expand_dims(choose_range, -1)
    choose_range = tf.tile(choose_range, [1, multiplicity])
    length_bound = tf.cast(choose_range, dtype=dtype)
    length_bound = tf.cast(max_ratio * length_bound, dtype=tf.int32)
    length = tf.minimum(masked_frame_size, tf.maximum(length_bound, 1))

    # Choose starting point.
    random_start = random_uniform(
        shape=(batch_size, multiplicity), maxval=1.0, seed=seed_2)
    start_with_in_valid_range = random_start * tf.cast(
        (choose_range - length + 1), dtype=dtype)
    start = tf.cast(start_with_in_valid_range, tf.int32)
    end = start + length - 1

    # Shift starting and end point by small value.
    delta = tf.constant(0.1)
    start = tf.expand_dims(tf.cast(start, dtype) - delta, -1)
    start = tf.tile(start, [1, 1, mask_size])
    end = tf.expand_dims(tf.cast(end, dtype) + delta, -1)
    end = tf.tile(end, [1, 1, mask_size])

    # Construct pre-mask of shape (batch_size, multiplicity, mask_size).
    diagonal = tf.expand_dims(
        tf.expand_dims(tf.cast(tf.range(mask_size), dtype=dtype), 0), 0)
    diagonal = tf.tile(diagonal, [batch_size, multiplicity, 1])
    pre_mask = tf.cast(
        tf.math.logical_and(diagonal < end, diagonal > start), dtype=dtype)

    # Sum masks with appropriate multiplicity.
    if masks_per_frame > 0:
      # Keep only the first masks_per_frame * choose_range masks per example.
      multiplicity_weights = tf.tile(
          tf.expand_dims(tf.range(multiplicity, dtype=dtype), 0),
          [batch_size, 1])
      multiplicity_tensor = masks_per_frame * tf.cast(choose_range, dtype=dtype)
      multiplicity_weights = tf.cast(
          multiplicity_weights < multiplicity_tensor, dtype=dtype)
      pre_mask = self.EinsumBmtBmBt(pre_mask, multiplicity_weights)
    else:
      # Compose all masks.
      pre_mask = tf.reduce_sum(pre_mask, 1)
    # Masked regions are 0; everything else is 1.
    mask = tf.cast(1.0 - tf.cast(pre_mask > 0, dtype=dtype), dtype=dtype)

    if p.fprop_dtype is not None and p.fprop_dtype != p.dtype:
      mask = tf.cast(mask, p.fprop_dtype)

    return mask