def _Moments(inputs, mask, enable_cross_replica_sum_on_tpu=False):
    """Computes mean and variance over the valid data points in inputs.

    Every dimension except the last one is reduced.

    Args:
      inputs: Tensor of values to compute the statistics over.
      mask: Non-negative weights with the same rank as `inputs` that broadcast
        against it; positions whose weight is 0 do not contribute.
      enable_cross_replica_sum_on_tpu: If True and running on TPU, the partial
        sums and the count are summed across replicas before normalizing.

    Returns:
      A (mean, variance) tuple.
    """
    inputs = py_utils.with_dependencies([
        py_utils.assert_equal(tf.rank(inputs), tf.rank(mask)),
        py_utils.assert_greater_equal(mask, tf.zeros_like(mask)),
    ], inputs)
    rank = tf.rank(mask)
    reduce_over_dims = tf.range(0, rank - 1)
    sum_v = tf.reduce_sum(inputs * tf.cast(mask, inputs.dtype),
                          reduce_over_dims)
    count_v = tf.reduce_sum(mask, reduce_over_dims)
    # Input shape is guaranteed to be a multiple of mask shape because the
    # inputs * mask op above was successfully broadcasted.
    mask_multiplier = tf.shape(inputs)[:-1] // tf.shape(mask)[:-1]
    count_v *= tf.cast(tf.reduce_prod(mask_multiplier), count_v.dtype)
    if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
        sum_v = tf.tpu.cross_replica_sum(sum_v)
        count_v = tf.tpu.cross_replica_sum(count_v)

    # Guard against division by zero when no position is valid.
    count_v = tf.maximum(count_v, 1.0)
    mean = sum_v / count_v
    sum_vv = tf.reduce_sum((inputs - mean) * (inputs - mean) * mask,
                           reduce_over_dims)

    if py_utils.use_tpu() and enable_cross_replica_sum_on_tpu:
        sum_vv = tf.tpu.cross_replica_sum(sum_vv)

    variance = py_utils.with_dependencies([
        py_utils.assert_greater_equal(sum_vv, tf.zeros_like(sum_vv)),
    ], sum_vv / count_v)
    return mean, variance
Example #2
 def FProp(self, theta, current_step):
     """Computes the learning rate decay factor at `current_step`.

     The factor is
       model_dim**-0.5 * min(1, step / warmup_steps)
                       * rsqrt(max(step, warmup_steps)).

     Args:
       theta: Unused by this method.
       current_step: The step to evaluate the schedule at; cast to float32.

     Returns:
       The decay factor as computed above.
     """
     params = self.params
     step = tf.cast(current_step, tf.float32)
     warmup = tf.cast(params.warmup_steps, tf.float32)
     # Ramps linearly from 0 to 1 over the warm-up period, then stays at 1.
     warmup_ramp = tf.minimum(1.0, step / warmup)
     # Constant during warm-up, inverse-square-root decay afterwards.
     decay = tf.math.rsqrt(tf.maximum(step, warmup))
     scale = params.model_dim**-0.5
     return scale * warmup_ramp * decay
Example #3
 def Update(self, new_value):
     # NOTE(review): this snippet is cut off -- the `tf.stack([` call below is
     # never closed and whatever consumes `state1` is not visible here.
     state0 = self.GetValue()
     # Merge the stored state with `new_value` entry by entry: entry 0 is
     # summed, entry 1 keeps the minimum and entry 2 keeps the maximum.
     state1 = tf.stack([
         state0[0] + new_value[0],
         tf.minimum(state0[1], new_value[1]),
         tf.maximum(state0[2], new_value[2]),
Example #4
    def FProp(self, theta, current_step):
        # NOTE(review): the `with tf.name_scope(` statement below is cut off in
        # this snippet; its argument and the closing `):` are not visible.
        p = self.params
        with tf.name_scope(

            steps = self._best_step
            best_step = steps[0]
            last_step = steps[1]

            # Clamp the reference step so it is never smaller than best_step.
            ref_step = tf.maximum(self._ref_step, best_step)
            f = self._cur_factor

            # Decay if no improvement within window. The decayed factor is
            # f * p.decay, floored at p.min_factor.
            new_factor = tf.where(last_step - ref_step < p.window, f,
                                  tf.maximum(p.min_factor, f * p.decay))
            # Update ref_step if we decayed.
            new_step = tf.where(tf.equal(new_factor, f), ref_step, last_step)
            update_step = tf.assign(self._ref_step, new_step)
            # The reference step is assigned before the factor is assigned and
            # returned.
            with tf.control_dependencies([update_step]):
                return tf.assign(self._cur_factor, new_factor)
Example #5
        def _DerivePaddingsAndIds(src_ids, tgt_labels):
            """Derives target ids, paddings, weights and a bucket key.

            `tgt_ids` is `tgt_labels` shifted right by one, with `p.sos_id`
            prepended. Both paddings are all zeros and the target weights are
            all ones, so the bucket key is the larger of the two lengths.

            Returns:
              (src_paddings, tgt_ids, tgt_paddings, tgt_weights, bucket_key).
            """
            shifted_labels = tgt_labels[:-1]
            tgt_ids = tf.concat([[p.sos_id], shifted_labels], axis=0)

            src_shape = tf.shape(src_ids)
            tgt_shape = tf.shape(tgt_ids)
            src_paddings = tf.zeros(src_shape, dtype=tf.float32)
            tgt_paddings = tf.zeros(tgt_shape, dtype=tf.float32)
            tgt_weights = tf.ones(tgt_shape, dtype=tf.float32)

            # Number of non-padded positions on each side.
            src_len = tf.reduce_sum(1.0 - src_paddings)
            tgt_len = tf.reduce_sum(1.0 - tgt_paddings)
            bucket_key = tf.cast(tf.maximum(src_len, tgt_len), tf.int32)

            return src_paddings, tgt_ids, tgt_paddings, tgt_weights, bucket_key
Example #6
 def GetTensorRange(self, t_name, ts):
     """Returns a `(min, max)` pair describing the value range of `ts`.

     In eval mode the pair comes from the stored 'min'/'max' state variables
     of `t_name`. Otherwise it is computed from `ts` and widened so that it
     always contains zero. Gradients are stopped on both values.
     """
     if not self.do_eval:
         # Training: use the batch range, forced to straddle a real zero point.
         lower = tf.minimum(tf.reduce_min(ts), 0.0)
         upper = tf.maximum(tf.reduce_max(ts), 0.0)
         return (tf.stop_gradient(lower), tf.stop_gradient(upper))

     # Eval/inference: use the memorized range. The variables are only looked
     # up in this branch so that training mode does not capture them.
     stored_min = tf.stop_gradient(self._GetQStateVar(t_name, 'min'))
     stored_max = tf.stop_gradient(self._GetQStateVar(t_name, 'max'))
     return (stored_min, stored_max)
            def ApplyBias():
                """Bias and update log_probs and consistent."""
                # NOTE(review): this snippet is damaged -- three calls below
                # (`tf.math.logical_and`, `tf.constant`, `tf.one_hot`) are
                # missing arguments and/or their closing parenthesis.
                def TileForBeamAndFlatten(tensor):
                    tensor = tf.reshape(tensor, [1, -1])  # [1, src_batch]
                    tensor = tf.tile(tensor,
                                     [num_hyps_per_beam, 1
                                      ])  # [num_hyps_per_beam, src_batch]
                    tgt_batch = tf.shape(step_ids)[
                        0]  # num_hyps_per_beam*src_batch
                    return tf.reshape(tensor, [tgt_batch])

                # Consistent if step_ids == labels from previous step
                # TODO(navari): Consider updating consistent only if weights > 0. Then
                # re-evaluate the need for bias_only_if_consistent=True.
                # Note that prev_label is incorrect for step 0 but is overridden later
                prev_label = TileForBeamAndFlatten(
                    tf.gather(labels, tf.maximum(time_step - 1, 0), axis=1))
                is_step0 = tf.equal(time_step, 0)
                local_consistence = tf.math.logical_or(
                    is_step0, tf.equal(prev_label, tf.squeeze(step_ids, 1)))
                # NOTE(review): second operand and closing `)` are cut off here.
                consistent = tf.math.logical_and(states.consistent,

                # get label, weight slices corresponding to current time_step
                label = TileForBeamAndFlatten(
                    tf.gather(labels, time_step, axis=1))
                weight = TileForBeamAndFlatten(
                    tf.gather(weights, time_step, axis=1))
                if p.bias_only_if_consistent:
                    weight = weight * tf.cast(consistent, p.dtype)

                # convert from dense label to sparse label probs
                vocab_size = tf.shape(bs_results.log_probs)[1]
                # NOTE(review): the constant's value and `dtype=` keyword are
                # cut off here.
                uncertainty = tf.constant(
                    p.dtype)  # avoid 0 probs which may cause issues with log
                # NOTE(review): the leading `tf.one_hot` arguments (indices and
                # depth) are cut off here.
                label_probs = tf.one_hot(
                    on_value=1 - uncertainty,
                    off_value=uncertainty / tf.cast(vocab_size - 1, p.dtype),
                    dtype=p.dtype)  # [tgt_batch, vocab_size]
                pred_probs = tf.exp(bs_results.log_probs)

                # interpolate predicted probs and label probs
                weight = tf.expand_dims(weight, 1)
                # The interpolation weight is asserted to lie in [0, 1].
                probs = py_utils.with_dependencies([
                    py_utils.assert_less_equal(weight, 1.),
                    py_utils.assert_greater_equal(weight, 0.)
                ], (1.0 - weight) * pred_probs + weight * label_probs)
                return tf.math.log(probs), consistent
Example #8
    def QuantizeTensors(self, t_name, ts, eval_only=False):
        # NOTE(review): this snippet is damaged. The list comprehension in the
        # eval branch is never closed, the `else:` lines that separate the
        # eval/training and eval_only/quantizing branches are missing, the
        # `_MaybeFakeQuant(` call in the loop is cut off, and the statement
        # that appends to `ts_out` is garbled. Restore from the upstream source
        # before relying on this code.
        p = self.params
        # Always straddle a real zero point.
        if self.do_eval:
            # At eval/inference time, use the memorized range.
            # Important: Don't capture these variables in training mode so as to
            # avoid extra/unnecessary captures.
            min_var = self._GetQStateVar(t_name, 'min')
            max_var = self._GetQStateVar(t_name, 'max')
            return [
                self._MaybeFakeQuant(t, min_var, max_var, num_bits=p.bits)
                for t in ts
            # At training time, use the batch calculated min/max.
            accumulator_name = self._GetAccumulatorNameForTensor(t_name)
            # Calculate min/max for all tensors.
            # Starting both bounds at 0.0 makes the range include zero.
            batch_min = 0.0
            batch_max = 0.0
            for t in ts:
                batch_min = tf.minimum(tf.reduce_min(t), batch_min)
                batch_max = tf.maximum(tf.reduce_max(t), batch_max)

            # New state.
            state1 = tf.stack([1.0, batch_min, batch_max])

            # Results.
            ts_out = []
            for i, t in enumerate(ts):
                if eval_only:
                    # If only quantizing at eval time, still record ranges as above
                    # but don't quantize.
                    quant_t = t
                    # If quantizing during training, skip quantization if it produces
                    # NANs. Sometimes early in the training process, things are unstable
                    # and ranges can produce numerical instability that makes it
                    # impossible to perform a fake_quant.
                    quant_t = self._MaybeFakeQuant(t,
                    # TODO(laurenzo): Plumb quant_t_has_nans through state and report.
                    quant_t_has_nans = tf.math.is_nan(quant_t)
                    quant_t = tf.where(quant_t_has_nans, t, quant_t)
                    '%s/%s_%d' % (, t_name, i), t)
            return ts_out
Example #9
    def FProp(self, theta, current_step):
        # NOTE(review): the final `py_utils.PiecewiseConstant(` call is cut off
        # in this snippet; its last argument(s) and closing `)` are missing.
        p = self.params
        current_step = tf.cast(current_step, tf.int64)
        # Interval i starts at interval_starts[i]; the first one starts at 0.
        interval_starts = [0] + p.boundaries
        values = []
        for interval_start, schedule, schedule_theta in zip(
                interval_starts, self.schedules, theta.schedules):
            # Each sub-schedule sees the step relative to the start of its own
            # interval, clamped at 0.
            relative_step = tf.maximum(
                tf.cast(0, current_step.dtype),
                current_step - tf.cast(interval_start, current_step.dtype))
            values.append(schedule.FProp(schedule_theta, relative_step))

        return py_utils.PiecewiseConstant(current_step, p.boundaries, values,
Example #10
  def FProp(self, theta, inputs, paddings):
    """Apply global spatial pooling to inputs.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      inputs: The inputs tensor. It is expected to be of shape [batch, time,
        frequency, channel]. The time dimension corresponds to the height
        dimension as in images and the frequency dimension corresponds to the
        width dimension as in images.
      paddings: The paddings tensor. It is expected to be of shape [batch,
        time]. May be None, which means there are no paddings.

    Returns:
      outputs, out_paddings pair.
       - outputs: has shape [batch, 1, 1, channel].
       - out_paddings: None or has shape [batch, 1].
    """
    p = self.params
    assert p.pooling_type in ['MAX', 'AVG'], p.pooling_type
    b, t, f = py_utils.GetShape(inputs, ndims=3)

    if paddings is not None:
      paddings = py_utils.HasShape(paddings, [b, t])
      mask = 1.0 - paddings[..., tf.newaxis, tf.newaxis]
    else:
      # Without paddings every time step is valid.
      mask = tf.ones([b, t, 1, 1], p.dtype)

    if p.pooling_type == 'AVG':
      global_sum = tf.reduce_sum(inputs * mask, axis=[1, 2], keepdims=True)
      f = tf.cast(tf.convert_to_tensor(f), p.dtype)
      # The mask has a size-1 frequency axis, so the number of pooled elements
      # is (number of valid time steps) * frequency.
      count = f * tf.reduce_sum(mask, axis=[1, 2], keepdims=True)
      # Avoid dividing by zero for fully padded examples.
      out_feature = global_sum / tf.maximum(1.0, count)
    elif p.pooling_type == 'MAX':
      # Padded positions are replaced by a very negative value so that they
      # never win the max.
      large_negative = (
          tf.ones_like(inputs) * p.dtype.max * tf.constant(-0.7, dtype=p.dtype))
      padded_inputs = tf.where_v2(mask > 0.0, inputs, large_negative)
      out_feature = tf.reduce_max(padded_inputs, axis=[1, 2], keepdims=True)

    if paddings is None:
      out_paddings = None
    else:
      # An example is padded in the output only if all its time steps are.
      out_paddings = tf.reduce_min(paddings, axis=1, keepdims=True)
      out_feature *= 1.0 - out_paddings[..., tf.newaxis, tf.newaxis]
    return out_feature, out_paddings
Example #11
 def Proc(record):
     """Parses a serialized tf.Example record."""
     # NOTE(review): this snippet is damaged. The `outputs = [` list literal
     # has lost its contents, the right-hand side of `features =` is missing
     # its call (only `, dict(outputs))` survives), and the `tf.cast(` below
     # has lost its dtype argument and closing `)`.
     outputs = [
     features =, dict(outputs))
     # Replace each parsed feature by its `.values` attribute.
     for k, v in six.iteritems(features):
         features[k] = v.values
     # The bucket key is the larger of the source and target non-padded
     # lengths.
     bucket_key = tf.cast(
         tf.maximum(tf.reduce_sum(1.0 - features['source_padding']),
                    tf.reduce_sum(1.0 - features['target_padding'])),
     return [features[k] for k, _ in outputs], bucket_key
Example #12
 def QuantizeWeight(self, w):
     """Fake-quantizes the weight tensor `w` over its own min/max range.

     Args:
       w: The weight tensor to quantize.

     Returns:
       In eval mode, the fake-quantized weights. In training mode, the
       fake-quantized weights with every NaN entry replaced by the matching
       entry of `w`.
     """
     p = self.params
     w_min = tf.reduce_min(w)
     w_max = tf.reduce_max(w)
     # NOTE: We force a small, non-zero range because otherwise, zero weights
     # can cause downstream inference engines to blow up.
     w_min = tf.minimum(w_min, -p.quantize_weight_epsilon)
     w_max = tf.maximum(w_max, p.quantize_weight_epsilon)
     quant_w = self._MaybeFakeQuant(w, w_min, w_max, num_bits=p.bits)
     if self.do_eval:
         return quant_w

     # If quantizing during training, skip quantization if it produces
     # NANs. Sometimes early in the training process, things are unstable
     # and ranges can produce numerical instability that makes it
     # impossible to perform a fake_quant.
     quant_w_has_nans = tf.math.is_nan(quant_w)
     return tf.where(quant_w_has_nans, w, quant_w)
def SequenceTrimLastToken(x, x_paddings):
    """Trims the last token off of sequence `x`, and set trimmed elements to 0.

    Args:
      x: A sequence of tokens of shape [batch_size, x_len_max].
      x_paddings: The paddings of `x`.

    Returns:
      A tuple.
        - The new sequence, Tensor of shape [batch_size, x_len_max].
        - The new paddings, Tensor of shape [batch_size, x_len_max].
    """
    x_len = tf.reduce_sum(1 - x_paddings, 1)
    x_len_max = py_utils.GetShape(x)[1]
    # Sequences that are already empty stay at length 0.
    x_trimmed_len = tf.maximum(x_len - 1, 0)
    # Despite its name this is a keep-mask: 1 for the positions that remain
    # after trimming and 0 elsewhere. It is turned into paddings on return.
    # NOTE(review): the dtype argument was cut off in the original snippet. A
    # numeric dtype is required because the mask is subtracted from 1 below;
    # `x_paddings.dtype` is assumed -- confirm against the upstream source.
    x_trimmed_paddings = tf.sequence_mask(x_trimmed_len, x_len_max,
                                          x_paddings.dtype)
    x_trimmed = x * tf.cast(x_trimmed_paddings, x.dtype)
    return x_trimmed, 1 - x_trimmed_paddings
Example #14
    def GetState(self, theta):
        """Gets the state from theta.

        Args:
          theta: Object whose `global_step` attribute is read.

        Returns:
          A float32 zeros tensor of shape [1] when `p.is_inference` is set;
          otherwise `tf.stack([clip_ratio, fq_ratio])`.
        """
        p = self.params
        if p.is_inference:
            # State is not used for inference. Just return dummy.
            return tf.zeros([1], tf.float32)

        # Calculations/vars need to be float but these can be ints in the params.
        clip_end_step = tf.cast(p.clip_end_step, tf.float32)
        clip_start_step = tf.cast(p.clip_start_step, tf.float32)
        quant_start_step = tf.cast(p.quant_start_step, tf.float32)
        global_step = tf.cast(theta.global_step, tf.float32)

        # Will be negative if before clipping starts.
        clip_ratio = (tf.minimum(clip_end_step - clip_start_step,
                                 global_step - clip_start_step) /
                      tf.maximum(1.0, clip_end_step - clip_start_step))
        # Currently fq is either on (1.0) or off (-1.0). Progressive quantization
        # may later occupy 0..1.0.
        fq_ratio = tf.where(global_step < quant_start_step, -1.0, 1.0)

        return tf.stack([clip_ratio, fq_ratio])
Example #15
    def _RecordTensor(self, t_name):
        # NOTE(review): the returned list at the end of this snippet is damaged;
        # the outer calls that wrap each `0., <var> - ...` expression are
        # missing, as is the closing `]`.
        p = self.params
        # Nothing is recorded in eval mode.
        if self.do_eval:
            return []

        accumulator_name = self._GetAccumulatorNameForTensor(t_name)
        accumulator = self.accumulators[accumulator_name]
        min_var = self._GetQStateVar(t_name, 'min')
        max_var = self._GetQStateVar(t_name, 'max')

        # Unpack state tensor.
        current_value = accumulator.GetValue()
        count = current_value[0]
        min_value = current_value[1]
        max_value = current_value[2]

        def Ema(variable, value):
            # Returns the scaled difference (1 - ema_decay) * (variable - value).
            # The expressions below subtract it from the variable.
            return (1.0 - p.ema_decay) * (variable - value)

        # Note that small floating point issues can cause ranges that naturally
        # begin or end at zero to move slightly past, causing hard failures
        # downstream (checks that all ranges straddle zero). We therefore repeat
        # the straddling constraint here.
        # The `tf.where(count > 0., ..., 0.)` terms leave a variable unchanged
        # when the accumulated count is zero.
        return [
                    0., min_var -
                    tf.where(count > 0., Ema(min_var, min_value), 0.))),
                    0., max_var -
                    tf.where(count > 0., Ema(max_var, max_value), 0.))),
    def _ConstructWarpMatrix(self, batch_size, matrix_size, origin,
                             destination, choose_range, dtype):
        """Returns warp matrices according to origin, destination and choose_range.

    This function constructs a batch of warp matrices which maps the batch
    of origin points to the batch of destination points with fixed boundary
    coordinates at 0 and choose_range.

    The warping function, defined by the origin anchor point `origin`,
    the destination of the origin anchor point `destination` and the
    length of the domain in the warping axis `choose_range` is a piecewise
    linear map that fixes the points 0 and `choose_range` and maps
    `origin` to `destination`.

    For the warping matrix to be non-singular, destination must lie in the
    range 1<= destination <= choose_range - 1, so a destination
    out of this range is adjusted to be in this range before the warping
    matrix is constructed.

    The warping map can be explicitly written by first defining the slopes:
      1) slope_0 = origin / destination.
      2) slope_1 = (choose_range - origin) / (choose_range - destination).
      3) slope_2 = 1.0.

    Then the origin point orig_i of the mapped coordinate i is given by:
      1) i < destination: orig_i = slope_0 * i.
      2) destination <= i < choose_range:
         orig_i = slope_1 * i - (slope_1 - slope_0) * destination.
      3) i >= choose_range: orig_i = i.

    Denoting n_i = ceil(orig_i), the warp matrix element warp[i][j] is given by:
      1) j = n_i: 1 - n_i + orig_i.
      2) j = n_i - 1: n_i - orig_i.
      3) Otherwise: 0.

    Applying the warp matrix to an array of pixels, i.e.,
    warped_pixel[i] = sum_j warp[i][j] * pixel[j], one would get
    warped_pixel[i] = (n_i-orig_i) pixel[n_i-1] + (1-n_i+orig_i) pixel[n_i].

    Args:
      batch_size: Batch size. Integer number.
      matrix_size: Dimension of the vector space the warp matrix is applied to.
        Integer number.
      origin: Origin anchor point for warping. Tensor of shape (batch_size,) and
        data type dtype.
      destination: Destination of the origin anchor point upon warping. Tensor
        of shape (batch_size,) and data type dtype.
      choose_range: Range within which the warp reference points must lie.
        Tensor of shape (batch_size,) data type dtype.
      dtype: Data type of origin, destination, choose_range and the output warp
        matrix.

    Returns:
      warp_matrix: An array of fixed size warp matrices with shape
      (batch_size, matrix_size, matrix_size).
    """
        p = self.params

        # Entries of destination must be in the range
        # 1 <= destination <= choose_range - 1
        # for warp matrix to have non-singular values.
        destination = tf.minimum(tf.maximum(destination, 1.0),
                                 choose_range - 1.0)

        # Construct piece-wise linear function fixing boundary points
        # specified by zero, choose_range and matrix size and maps
        # the origin anchor point to the destination.
        destination_bc = tf.broadcast_to(destination,
                                         (matrix_size, batch_size))
        destination_bc = tf.transpose(destination_bc)
        choose_range_bc = tf.broadcast_to(choose_range,
                                          (matrix_size, batch_size))
        choose_range_bc = tf.transpose(choose_range_bc)

        # Slopes of piece-wise linear function.
        slope_0 = origin / destination
        slope_1 = (choose_range - origin) / (choose_range - destination)
        slope_2 = 1.0

        # x is a batch of origin matrices.
        # The origin matrix is the matrix such that
        # origin[i][j] = Origin coordinate of coordinate i for the warp map.
        # Denoting the destination of the origin anchor point in the
        # warp map as "dest," the origin coordinate of point i is given by:
        # 1) i < dest: slope_0 * i.
        # 2) dest <= i < choose_range: slope_1 * i - (slope_1 - slope_0) * dest.
        # 3) i >= choose_range: i.
        x = tf.broadcast_to(tf.cast(tf.range(matrix_size), dtype=dtype),
                            (batch_size, matrix_size))
        x = (self.EinsumBBmBm(slope_0, x) + self.EinsumBBmBm(
            slope_1 - slope_0, tf.nn.relu(x - destination_bc)) +
             self.EinsumBBmBm(slope_2 - slope_1,
                              tf.nn.relu(x - choose_range_bc)))
        x = tf.broadcast_to(x, (matrix_size, batch_size, matrix_size))
        x = tf.transpose(x, perm=[1, 2, 0])

        # y is a batch of coordinate matrices.
        # A coordinate matrix is a matrix such that
        # coordinate[i][j] = j.
        y = tf.broadcast_to(tf.cast(tf.range(matrix_size), dtype=dtype),
                            (batch_size, matrix_size, matrix_size))
        # Warp matrix is obtained by applying hat function element-wise to (x-y).
        # Denoting the origin point of i under the warp map as orig_i,
        # and n_i = ceil(orig_i), the warp matrix element warp[i][j] is given by:
        # 1) j = n_i: 1 - n_i + orig_i.
        # 2) j = n_i - 1: n_i - orig_i.
        # 3) Otherwise: 0.
        # Applying the warp matrix to pixels, i.e.,
        # warped_pixel[i] = sum_j warp[i][j] * original_pixel[j], one would get
        # warped_pixel[i] = (n_i - orig_i) * original_pixel[n_i-1]
        #                   + (1 - n_i + orig_i) * original_pixel[n_i].
        warp_matrix = x - y
        warp_matrix = _hat(warp_matrix)
        if p.fprop_dtype is not None and p.fprop_dtype != dtype:
            warp_matrix = tf.cast(warp_matrix, p.fprop_dtype)

        return warp_matrix
    def _GetWarpMatrix(self,
        # NOTE(review): this snippet is damaged. The rest of the parameter list
        # is missing, the docstring below is never closed (its Args/Returns
        # headers are also gone), an `else:` is missing in the seed selection
        # and in the shift sampling, and several calls further down are cut
        # off mid-argument-list. Restore from the upstream source before
        # relying on this code.
        """Returns warp matrices starting from random positions.

    In this function when max_warp_frames != None:
      1) Sample random warp displacements from the interval
         [-max_warp_frames, max_warp_frames) to yield shift tensor
         with shape (batch_size,).
      2) Truncate lengths to a maximum magnitude of (choose_range * max_ratio),
         so that each shift is fully contained within the
         corresponding sequence.
      3) Random sample origin points of shape (batch_size, multiplicity)
         with in [shift, choose_range - shift).
      4) Return a batch of 1-D linear maps that fix the boundary points and
         shift the origin point by the shift.

    When max_warp_frames == None:
      1) Sample random warp displacements with magnitudes less than
         (choose_range * max_ratio) to yield shift tensor with
         shape (batch_size,).
      2) Proceed through steps 3), 4).

      batch_size: Batch size. Integer number.
      choose_range: Range within which the warp reference points must lie.
        Tensor of shape (batch_size,).
      matrix_size: Dimension of vector space warp matrix is applied to. Integer
      global_seed: an integer seed tensor for stateless random ops.
      max_warp_frames: Upper-bound on the warp distance. Integer or None.
      dtype: Data type.
      max_ratio: Maximum ratio between the shift distance and choose_range.
        Float number.

      warp_matrix: An array of fixed size warp matrices with shape
      (batch_size, matrix_size, matrix_size).
        p = self.params
        # Non-empty random seed values are only used for testing or when using
        # stateless random ops. seed_3, seed_4, and seed_5 are set separately to
        # avoid correlation of warp magnitude and origin position.
        if p.use_input_dependent_random_seed:
            seed_3 = global_seed + 3
            seed_4 = global_seed + 4
            seed_5 = global_seed + 5
        elif p.random_seed:
            seed_3 = p.random_seed - 1
            seed_4 = p.random_seed - 1
            seed_5 = 2 * p.random_seed + 1
            # NOTE(review): an `else:` appears to be missing before the three
            # assignments below; as written they overwrite the values above.
            seed_3 = p.random_seed
            seed_4 = p.random_seed
            seed_5 = p.random_seed

        choose_range_dtype = tf.cast(choose_range, dtype=dtype)
        # NOTE(review): the dtype argument of this cast is cut off.
        length_upper_bound = tf.cast(max_ratio * choose_range_dtype,
        # Set shift length.

        random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)

        if max_warp_frames and max_warp_frames > 0:
            # NOTE(review): the remaining arguments of the sampling calls and
            # of the cast below are cut off.
            shift = random_uniform(shape=(batch_size, ),
                                   minval=-1 * max_warp_frames,
                                   maxval=max_warp_frames + 1,
            random_ratio = random_uniform(shape=(batch_size, ),
            shift = tf.cast(
                random_ratio * tf.cast(length_upper_bound, dtype=dtype),
        # Make sure the sampled length was smaller than max_ratio * length_bound.
        # Note that sampling in this way is biased.
        # (Shorter sequences may be over-masked.)
        final_shift = tf.maximum(-length_upper_bound,
                                 tf.minimum(shift, length_upper_bound))
        # Choose origin anchor point.
        # NOTE(review): the cast result is immediately overwritten by the next
        # assignment, which reads `choose_range` rather than `mid_range`.
        mid_range = tf.cast(choose_range, dtype=tf.int32)
        mid_range = tf.maximum(choose_range - 2, 0)
        # NOTE(review): the two calls below are cut off.
        random_origin = random_uniform(shape=(batch_size, ),
        origin_with_in_valid_range = random_origin * tf.cast(mid_range,
        origin = tf.cast(origin_with_in_valid_range, tf.int32) + 1
        # Set destination point of the origin anchor point under the warp map.
        destination = origin + final_shift
        # Cast origin and destination.
        origin = tf.cast(origin, dtype=dtype)
        destination = tf.cast(destination, dtype=dtype)

        # NOTE(review): the remaining keyword arguments are cut off.
        return self._ConstructWarpMatrix(batch_size=batch_size,
    def _GetMask(self,
        # NOTE(review): this snippet is damaged. The rest of the parameter list
        # is missing, the docstring below is never closed (its Args/Returns
        # headers are also gone), several `else:` lines are missing, and a
        # number of calls are cut off mid-argument-list. Restore from the
        # upstream source before relying on this code.
        """Returns fixed size multi-masks starting from random positions.

    A multi-mask is a mask obtained by applying multiple masks.

    This function when max_length is given:
      1) Sample random mask lengths less than max_length with shape
         (batch_size, multiplicity).
      2) Truncate lengths to a max of (choose_range * max_ratio),
         so that each mask is fully contained within the corresponding sequence.
      3) Random sample start points of shape (batch_size, multiplicity)
         with in (choose_range - lengths).
      4) For each batch, multiple masks (whose number is given by the
         multiplicity) are constructed.
      5) Return a mask of shape (batch_size, mask_size) where masks are
         obtained by composing the masks constructed in step 4).
         If masks_per_frame > 0, the number is given by
         min(masks_per_frame * choose_range, multiplicity).
         If not, all the masks are composed. The masked regions are set to zero.

    This function when max_length is not given:
      1) Sample random mask lengths less than (choose_range * max_ratio)
         with shape (batch_size, multiplicity).
      2) Proceed to steps 3), 4) and 5) of the above.

      batch_size: Batch size. Integer number.
      choose_range: Range within which the masked entries must lie. Tensor of
        shape (batch_size,).
      mask_size: Size of the mask. Integer number.
      global_seed: an integer seed tensor for stateless random ops.
      max_length: Maximum number of allowed consecutive masked entries. Integer
        number or None.
      masks_per_frame: Number of masks per frame. Float number. If > 0, the
        multiplicity of the mask is set to be masks_per_frame * choose_range.
      multiplicity: Maximum number of total masks. Integer number.
      dtype: Data type.
      max_ratio: Maximum portion of the entire range allowed to be masked. Float

      mask: a fixed size multi-mask starting from a random position with shape
      (batch_size, mask_size).
        p = self.params
        # Non-empty random seed values are only used for testing or when using
        # stateless random ops. seed_1 and seed_2 are set separately to avoid
        # correlation of mask size and mask position.
        if p.use_input_dependent_random_seed:
            seed_1 = global_seed + 1
            seed_2 = global_seed + 2
        elif p.random_seed:
            seed_1 = p.random_seed + 1
            seed_2 = 2 * p.random_seed
            # NOTE(review): an `else:` appears to be missing before the two
            # assignments below; as written they overwrite the values above.
            seed_1 = p.random_seed
            seed_2 = p.random_seed
        # Sample lengths for multiple masks.
        if max_length and max_length > 0:
            max_length = tf.broadcast_to(tf.cast(max_length, dtype),
                                         (batch_size, ))
            # NOTE(review): an `else:` appears to be missing before the next
            # assignment; as written it overwrites the broadcast value.
            max_length = tf.cast(choose_range, dtype=dtype) * max_ratio
        random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)
        # NOTE(review): the remaining arguments of this call are cut off.
        masked_portion = random_uniform(shape=(batch_size, multiplicity),
        masked_frame_size = self.EinsumBBmBm(max_length, masked_portion)
        masked_frame_size = tf.cast(masked_frame_size, dtype=tf.int32)
        # Make sure the sampled length was smaller than max_ratio * length_bound.
        # Note that sampling in this way is biased
        # (shorter sequences may be over-masked.)
        choose_range = tf.expand_dims(choose_range, -1)
        choose_range = tf.tile(choose_range, [1, multiplicity])
        length_bound = tf.cast(choose_range, dtype=dtype)
        length_bound = tf.cast(max_ratio * length_bound, dtype=tf.int32)
        # The bound is raised to at least 1 before it caps the sampled length.
        length = tf.minimum(masked_frame_size, tf.maximum(length_bound, 1))

        # Choose starting point.
        # NOTE(review): the remaining arguments of this call are cut off.
        random_start = random_uniform(shape=(batch_size, multiplicity),
        start_with_in_valid_range = random_start * tf.cast(
            (choose_range - length + 1), dtype=dtype)
        start = tf.cast(start_with_in_valid_range, tf.int32)
        end = start + length - 1

        # Shift starting and end point by small value.
        # With the shifts, the strict comparisons below include the integer
        # positions start and end themselves.
        delta = tf.constant(0.1)
        start = tf.expand_dims(tf.cast(start, dtype) - delta, -1)
        start = tf.tile(start, [1, 1, mask_size])
        end = tf.expand_dims(tf.cast(end, dtype) + delta, -1)
        end = tf.tile(end, [1, 1, mask_size])

        # Construct pre-mask of shape (batch_size, multiplicity, mask_size).
        diagonal = tf.expand_dims(
            tf.expand_dims(tf.cast(tf.range(mask_size), dtype=dtype), 0), 0)
        diagonal = tf.tile(diagonal, [batch_size, multiplicity, 1])
        # NOTE(review): the dtype argument and closing `)` of this cast are
        # cut off.
        pre_mask = tf.cast(tf.math.logical_and(diagonal < end,
                                               diagonal > start),

        # Sum masks with appropriate multiplicity.
        if masks_per_frame > 0:
            multiplicity_weights = tf.tile(
                tf.expand_dims(tf.range(multiplicity, dtype=dtype), 0),
                [batch_size, 1])
            # NOTE(review): the dtype argument and closing `)` of this cast
            # are cut off.
            multiplicity_tensor = masks_per_frame * tf.cast(choose_range,
            multiplicity_weights = tf.cast(
                multiplicity_weights < multiplicity_tensor, dtype=dtype)
            pre_mask = self.EinsumBmtBmBt(pre_mask, multiplicity_weights)
            # NOTE(review): an `else:` appears to be missing before the
            # reduce_sum below.
            pre_mask = tf.reduce_sum(pre_mask, 1)
        # Positions covered by at least one mask become 0, all others 1.
        mask = tf.cast(1.0 - tf.cast(pre_mask > 0, dtype=dtype), dtype=dtype)

        if p.fprop_dtype is not None and p.fprop_dtype != p.dtype:
            mask = tf.cast(mask, p.fprop_dtype)

        return mask
Example #19
 def FProp(self, theta, current_step):
     """Returns `multiplier * rsqrt(max(current_step, warmup_steps))`.

     Args:
       theta: Unused by this method.
       current_step: The step to evaluate the schedule at; cast to float32.
     """
     params = self.params
     step = tf.cast(current_step, tf.float32)
     # The value stays constant until `step` exceeds warmup_steps.
     clipped_step = tf.maximum(step, params.warmup_steps)
     return tf.math.rsqrt(clipped_step) * params.multiplier
 def PadToTargetSeqLen(tensor, constant):
     """Right-pads dimension 1 of a rank-2 `tensor` with `constant`.

     The amount of padding is `p.beam_search.target_seq_len` minus the current
     size of dimension 1, or zero if that difference is negative.
     """
     cur_len = tf.shape(tensor)[1]
     num_missing = tf.maximum(0, p.beam_search.target_seq_len - cur_len)
     pad_spec = [[0, 0], [0, num_missing]]
     return tf.pad(tensor, pad_spec, constant_values=constant)
def beam_search_step(in_scores,
                     in_atten_probs,
                     in_best_scores,
                     in_cumulative_scores,
                     in_histories,
                     cur_step,
                     eos_id,
                     num_beams,
                     beam_size,
                     num_hyps_per_beam,
                     valid_eos_max_logit_delta=5.0,
                     local_eos_threshold=-100.0,
                     merge_paths=False,
                     is_last_chunk=None,
                     eoc_id=0):
    """A single step of beam search.

    Let "b" be the number of beams, "k" be the number hyps in each beam. This
    function supports values with dtypes tf.float32 or tf.bfloat16.

    The following data structures are allocated before the first decoding step
    and are passed along from cur step to the next step:

    Args:
      in_scores: A tensor of shape [b * k, vocab_size], where [i, ...] is the
        token score of the j-th hyps of the n-th beam. j = (i / k), and
        n = i % k
      in_atten_probs: A tensor of shape [b*k, s_len], where
        in_atten_probs[i, ...] is the attention probabilities over the source
        words of the j-th hyps of n-th beam (where j, and n are derived as
        above).
      in_best_scores: A vector of size [b], best scores of terminated hyps so
        far in each of the beams.
      in_cumulative_scores: A vector of size [b * k]. The cumulative score of
        each active hyp before the current step.
      in_histories: An int32 vector of size [b * k] containing hashes of the
        histories of each active hyp. If 'merge_paths' is enabled, the
        histories are used to identify hypotheses that are identical modulo
        epsilons (e.g. "a <eps> b" and "a b <eps>") and merge them. See
        'update_histories' docstring for details.
      cur_step: Current step id.
      eos_id: Token id of the special end of sequence token.
      num_beams: Number of beams.
      beam_size: Search terminates if the delta between the scores of the
        active hyps in a beam and the best scores exceeds this value.
      num_hyps_per_beam: Number of hyps in a beam.
      valid_eos_max_logit_delta: We allow </s> to terminate a hyp only if its
        logit is no more than 'valid_eos_max_logit_delta' away from the logit
        of the best candidate.
      local_eos_threshold: We allow </s> to terminate a hyp if the local score
        for </s> is greater than local_eos_threshold.
      merge_paths: If true, hyps which are identical when epsilons are removed
        will be combined into a single hyp.  The probability for that combined
        hyp will be the sum of the probabilities of the component hyps.  This
        can only be applied for epsilon-emitting models (RNN-T and NT).
      is_last_chunk: A tensor of shape [b * k, 1]. Used by neural transducer,
        determines whether the current hypothesis reaches the last chunk and
        should treat the next end-of-chunk symbol as end-of-sentence.
      eoc_id: int, the id of the end of chunk (a.k.a epsilon) token used by
        neural transducer models. Only relevant if 'merge_paths' is True or
        'is_last_chunk' is provided.

    Returns:
      out_best_scores: A tensor of shape [b] of updated best scores for each of
        the beams.
      out_cumulative_scores: A tensor of shape [b * k]. The cumulative score of
        the new hyps after the current decoding step.
      out_scores: A tensor of shape [b * k] with scores of the token selected.
      out_eos_scores: A tensor of shape [b * k] with token scores for the EOS,
        in case the hyp was terminated, otherwise 0.0.
      out_hyps: A tensor of shape [b * k] with ids of the token selected.
      out_prev_hyps: A tensor of shape [b * k] with index to the previous hyps
        which was selected.
      out_done_hyps: A boolean tensor of shape [b * k] where value indicates
        if hyps was terminated.
      out_atten_probs: A tensor of shape [b * k, seq_len] which contain the
        attention probabilities over the source words against word in the
        previous hyps.
      out_eos_atten_probs: A tensor of shape [b * k, seq_len] which contains
        the attention probabilities over the source against word in the
        current hyp which was terminated.
      out_all_done: A scalar, whether decoding should terminate for all beams.
      out_histories: A tensor of shape [b * k] containing new history hashes
        for the active hypotheses. See 'update_histories' docstring for
        details.

    Raises:
      ValueError: if inputs are invalid.
    """
    # NOTE(review): this listing had lost the signature continuation lines and
    # the tails of several statements. Parameter names and order follow the
    # docstring, and `is_last_chunk=None` is implied by the `is None` check
    # below. The remaining default values and every spot marked NOTE(review)
    # below were reconstructed -- confirm against the upstream implementation.
    num_hyps_per_beam = int(num_hyps_per_beam)

    if num_hyps_per_beam <= 0:
        raise ValueError("num_hyps_per_beam = {} and must be > 0.".format(
            num_hyps_per_beam))

    in_scores = tf.convert_to_tensor(in_scores)
    num_classes = in_scores.get_shape()[1]

    in_atten_probs = tf.convert_to_tensor(in_atten_probs)

    in_best_scores = tf.convert_to_tensor(in_best_scores)

    in_cumulative_scores = tf.convert_to_tensor(in_cumulative_scores)

    in_histories = tf.convert_to_tensor(in_histories)

    with tf.name_scope("beam_search_step"):
        # For k = num_hyps_per_beam
        # First step of beam search is to find the top tokens based on its score.
        # Normally we select k+1, where the extra +1 is to make sure we have k
        # non-eos tokens to select if EOS token is in the top-k. If path merging is
        # on, we actually need to select k+2; this ensures there are k+1 tokens left
        # after the merge, at least k of which are not EOS.
        # TODO(b/118644069): Avoid casts when there is a XLA op available that takes
        # in bfloat16.
        num_candidates_per_input_hyp = (num_hyps_per_beam + 2 if merge_paths
                                        else num_hyps_per_beam + 1)
        # [b * k, num_candidates_per_input_hyp]
        local_score_values, local_indices = xla_ops.top_k_with_unique(
            tf.cast(in_scores, tf.float32), k=num_candidates_per_input_hyp)
        local_score_values = tf.cast(local_score_values, in_scores.dtype)

        # Compute the global score which is sum of the local score, and the
        # cumulative scores for each of the hyps.
        # [b * k, num_candidates_per_input_hyp]
        global_score_values = local_score_values + tf.expand_dims(
            in_cumulative_scores, 1)

        values_dtype = local_score_values.dtype
        is_first_step = tf.cast(tf.equal(cur_step, 0), values_dtype)

        # Preprocessing to reorder the tensor from `mod` sharding to `div` so that
        # we can use matrix/vector operations to complete the beam search.
        # [b * k, num_candidates_per_input_hyp]
        global_score_values = reorder_tensor("mod_to_div", global_score_values,
                                             num_beams, num_hyps_per_beam)
        local_score_values = reorder_tensor("mod_to_div", local_score_values,
                                            num_beams, num_hyps_per_beam)
        local_indices = reorder_tensor("mod_to_div",
                                       local_indices,
                                       num_beams,
                                       num_hyps_per_beam,
                                       max_value=num_classes - 1)
        # [b * k, 1]
        histories = reorder_tensor("mod_to_div",
                                   tf.expand_dims(in_histories, 1), num_beams,
                                   num_hyps_per_beam)
        if is_last_chunk is None:
            # Must be boolean: it is AND-ed with a tf.equal() result below.
            is_last_chunk = tf.zeros([num_beams * num_hyps_per_beam, 1],
                                     tf.bool)
        else:
            is_last_chunk = tf.cast(
                reorder_tensor(
                    "mod_to_div",
                    tf.reshape(is_last_chunk,
                               [num_beams * num_hyps_per_beam, 1]), num_beams,
                    num_hyps_per_beam), tf.bool)

        # For the first step mask everything but the first row.
        # [num_hyps_per_beam]
        per_example_mask = tf.concat([
            tf.constant([1.0], dtype=values_dtype),
            tf.zeros([num_hyps_per_beam - 1], dtype=values_dtype)
        ], 0)
        # [num_hyps_per_beam, num_beams] => [b*k, 1]
        mask = tf.reshape(
            tf.tile(per_example_mask, tf.expand_dims(num_beams, 0)),
            [-1, 1]) * is_first_step + (1.0 - is_first_step)
        local_score_values *= mask
        global_score_values *= mask

        # We add a large negative value for the unmasked values.
        # NOTE(review): the fill value was lost in this listing;
        # BEST_SCORES_INIT is the large negative constant used for the same
        # purpose further down -- confirm.
        per_example_additive_mask = tf.concat([
            tf.constant([0.0], dtype=values_dtype),
            tf.constant(BEST_SCORES_INIT,
                        shape=[num_hyps_per_beam - 1],
                        dtype=values_dtype)
        ], 0)
        additive_mask = tf.reshape(
            tf.tile(per_example_additive_mask, tf.expand_dims(num_beams, 0)),
            [-1, 1]) * is_first_step
        local_score_values += additive_mask
        global_score_values += additive_mask

        if merge_paths:
            with tf.name_scope("merge_paths"):
                # Compute new history hashes for each hypothesis + new token.
                # [b * k, num_candidates_per_input_hyp]
                # NOTE(review): only `histories` survived as an argument in
                # this listing; the remaining arguments are reconstructed --
                # confirm against update_histories' signature.
                histories = update_histories(histories,
                                             local_indices,
                                             mask,
                                             epsilon_id=eoc_id)
                global_score_values, histories = merge_hyps(
                    global_score_values, histories, mask, num_beams,
                    num_hyps_per_beam)

        # As we keep num_candidates_per_input_hyp, we have a total of
        # num_candidates_per_input_hyp * k hyps active per example.
        num_candidate_hyps = num_candidates_per_input_hyp * num_hyps_per_beam
        batch_shape = [-1, num_candidate_hyps]

        # Reshape score values so that each row corresponds to a particular example.
        # [num_beams, num_candidate_hyps]
        global_score_values_batch = tf.reshape(global_score_values,
                                               batch_shape)

        # First for each beam: Find the top 2 * num_hyps_per_beam candidates.
        # The factor of 2 is to be able to process non EOS token ids in the case
        # where top scoring token for each hyps is EOS token.
        # [num_beams, 2 * num_hyps_per_beam]
        _, candidates_indices_in_top_k = xla_ops.top_k_with_unique(
            tf.cast(global_score_values_batch, tf.float32),
            k=2 * num_hyps_per_beam)
        # Find the previous hyps of the candidate. We divide here by the number
        # of candidates kept per input hyp to identify which hyps this token
        # came from.
        hyps_id = candidates_indices_in_top_k // num_candidates_per_input_hyp

        # Add in offset so that we can get the candidate index in the [b * k] space.
        offset = tf.expand_dims(tf.range(num_beams) * num_candidate_hyps, 1)
        flat_candidates_indices_in_top_k = tf.reshape(
            candidates_indices_in_top_k + offset, [-1])

        flat_local_indices = tf.reshape(local_indices, [1, -1])
        flat_token_scores = tf.reshape(local_score_values, [-1, 1])
        flat_global_scores = tf.reshape(global_score_values, [-1, 1])

        # Gather the token scores for each of 2*k candidates. We use tf.one_hot()
        # followed by a tf.matmul() to speedup gather on TPUs.
        total_num_candidates = num_beams * num_candidate_hyps
        token_scores_for_beam = tf.reshape(
            fast_gather(flat_token_scores, flat_candidates_indices_in_top_k,
                        total_num_candidates),
            [num_beams, 2 * num_hyps_per_beam])
        token_scores_for_beam_shape = tf.shape(token_scores_for_beam)

        global_scores_for_beam = tf.reshape(
            fast_gather(flat_global_scores, flat_candidates_indices_in_top_k,
                        total_num_candidates), token_scores_for_beam_shape)

        # Local indices value's are between [0, vocab_size-1], hence we use the
        # slower version of gather.
        token_ids_for_beam = tf.reshape(
            fast_gather(flat_local_indices,
                        flat_candidates_indices_in_top_k,
                        total_num_candidates,
                        max_value=num_classes - 1,
                        axis=1), token_scores_for_beam_shape)

        # We have access to 2*num_hyps_per_beam hyps per beam.
        # We shrink back to num_hyps_per_beam that does not include EOS, and move
        # EOS that occurs in top-num_hyps_per_beam to the EOS done matrix.

        # To determine the threshold at which eos is allowed to terminate a hyp,
        # we need to know the maximum global score for that hyp with any additional
        # token. If path merging is *not* enabled, the global_score_values are
        # by construction in sorted order, so we can just look at its 0th column. If
        # path merging is enabled, the global scores of deleted (merged) hyps break
        # the sorted order, which means we have to do a full reduce_max.
        if merge_paths:
            # keepdims so both branches yield a [b * k, 1] tensor.
            max_global_score_per_input_hyp = tf.reduce_max(global_score_values,
                                                           axis=1,
                                                           keepdims=True)
        else:
            max_global_score_per_input_hyp = global_score_values[:, 0:1]
        # [num_beams * num_hyps_per_beam, 1]
        global_eos_threshold = (max_global_score_per_input_hyp -
                                valid_eos_max_logit_delta)
        local_eos_threshold_tensor = local_eos_threshold * tf.ones_like(
            global_eos_threshold)

        # Find EOS in top num_hyps_per_beam token ids. We also treat EOC as EOS if
        # the model has indicated this is the last chunk.
        local_index_is_eos = tf.equal(local_indices, eos_id)
        local_index_is_last_chunk_eoc = tf.math.logical_and(
            tf.equal(local_indices, eoc_id), is_last_chunk)
        # NOTE(review): only the two tiled-threshold fragments and the final
        # mask operand of this expression survived, and their indentation does
        # not line up cleanly with the nesting restored here. The conjunction
        # below (EOS-like token, global score above the global threshold, local
        # score above the local threshold, row still valid) follows the
        # docstring; the strict `greater` comparisons are reconstructed --
        # confirm.
        eos_mask = tf.math.logical_and(
            tf.math.logical_and(
                tf.math.logical_and(
                    tf.math.logical_or(local_index_is_eos,
                                       local_index_is_last_chunk_eoc),
                    tf.math.greater(
                        global_score_values,
                        tf.tile(global_eos_threshold,
                                [1, num_candidates_per_input_hyp]))),
                tf.math.greater(
                    local_score_values,
                    tf.tile(local_eos_threshold_tensor,
                            [1, num_candidates_per_input_hyp]))),
            tf.cast(mask, tf.bool))
        end_hyps_bool_mask = tf.reshape(tf.reduce_any(eos_mask, 1), [-1, 1])

        end_hyps_bool_mask = reorder_tensor("div_to_mod", end_hyps_bool_mask,
                                            num_beams, num_hyps_per_beam)

        eos_atten_probs = in_atten_probs * tf.cast(end_hyps_bool_mask,
                                                   in_atten_probs.dtype)
        eos_atten_probs = tf.reshape(eos_atten_probs,
                                     [num_beams * num_hyps_per_beam, -1])
        # A boolean tensor of shape [b * k] where value indicates if hyps was
        # terminated.
        out_done_hyps = tf.reshape(end_hyps_bool_mask, [-1])

        # Scores for EOS token.
        eos_float_mask = tf.cast(eos_mask, values_dtype)
        eos_local_scores = eos_float_mask * local_score_values
        eos_additive_float_mask = (1.0 - eos_float_mask) * BEST_SCORES_INIT
        eos_local_scores += eos_additive_float_mask
        out_eos_scores = tf.reshape(tf.reduce_max(eos_local_scores, 1),
                                    [-1, 1])
        out_eos_scores = tf.reshape(
            reorder_tensor("div_to_mod", out_eos_scores, num_beams,
                           num_hyps_per_beam), [-1])
        # A tensor of shape [b] of updated best scores for each of the beams.
        eos_global_scores = eos_float_mask * global_score_values
        eos_global_scores += eos_additive_float_mask
        best_scores = tf.reduce_max(
            tf.reshape(eos_global_scores, [num_beams, -1]), 1)

        # Following operations are to find the top num_hyps_per_beam that are
        # active.

        # Active ones are the ones that do not correspond to EOS termination.
        # We keep num_hyps_per_beam * 2 in case every hyps is terminated by EOS id.
        # Top K with eos removed.
        non_eos_mask = tf.not_equal(token_ids_for_beam, eos_id)
        # From here on num_candidate_hyps counts the 2 * k survivors of every
        # beam, i.e. the number of elements of token_scores_for_beam.
        num_candidate_hyps = num_hyps_per_beam * 2 * num_beams
        # Non-EOS candidates keep their flat position; EOS candidates get the
        # out-of-range sentinel num_candidate_hyps so reduce_min skips them.
        index = tf.where(
            non_eos_mask,
            tf.reshape(tf.range(num_candidate_hyps, dtype=tf.int32),
                       token_scores_for_beam_shape),
            num_candidate_hyps *
            tf.ones(dtype=tf.int32, shape=token_scores_for_beam_shape))

        # Unrolled TopK.
        sorted_indices = []
        # Finds the first num_hyps_per_beam unmasked indexes and stores them in
        # concated_index (shape: [num_beams, num_candidate_hyps])
        # This is done by iteratively record the min index in each row, and reset
        # it to the max, so that next iteration reduce_min returns the 2nd minimum
        # index.
        for _ in range(num_hyps_per_beam):
            min_index = tf.reshape(tf.reduce_min(index, [1]), [num_beams, 1])
            sorted_indices.append(min_index)
            # Replace position with num_candidate_hyps value.
            index = tf.where(
                tf.equal(index, min_index),
                num_candidate_hyps *
                tf.ones(dtype=tf.int32, shape=token_scores_for_beam_shape),
                index)

        # Post processing ops to output expected tensors.
        concated_sorted_indices = tf.concat(sorted_indices, 1)
        flat_sorted_indices = tf.reshape(concated_sorted_indices, [-1])

        # A tensor of shape [b * k] with scores of the token selected.
        out_scores = tf.reshape(
            fast_gather(tf.reshape(token_scores_for_beam, [-1, 1]),
                        flat_sorted_indices, num_candidate_hyps), [-1, 1])
        out_scores = tf.reshape(
            reorder_tensor("div_to_mod", out_scores, num_beams,
                           num_hyps_per_beam), [-1])

        # Gather the updated histories of selected hypotheses if path merging is
        # enabled. Otherwise, the histories are unused, so just output in_histories.
        if merge_paths:
            flat_histories = tf.reshape(histories, [-1, 1])
            # [num_beams, 2 * num_hyps_per_beam]
            histories_for_beam = tf.reshape(
                fast_gather(flat_histories, flat_candidates_indices_in_top_k,
                            total_num_candidates), token_scores_for_beam_shape)
            out_histories = tf.reshape(
                fast_gather(tf.reshape(histories_for_beam, [-1, 1]),
                            flat_sorted_indices, num_candidate_hyps), [-1, 1])
            out_histories = tf.reshape(
                reorder_tensor("div_to_mod", out_histories, num_beams,
                               num_hyps_per_beam), [-1])
        else:
            out_histories = in_histories

        # NOTE(review): the `max_value` hints for the previous-hyp ids (here
        # and in the reorder_tensor call below) were lost in this listing and
        # are reconstructed -- confirm.
        prev_hyps_ids = tf.reshape(
            tf.reshape(
                fast_gather(tf.reshape(hyps_id, [1, -1]),
                            flat_sorted_indices,
                            num_candidate_hyps,
                            max_value=num_hyps_per_beam,
                            axis=1), [num_beams, -1]) * num_beams +
            tf.expand_dims(tf.range(num_beams), 1), [-1, 1])

        prev_hyps_ids = reorder_tensor("div_to_mod",
                                       prev_hyps_ids,
                                       num_beams,
                                       num_hyps_per_beam,
                                       max_value=num_hyps_per_beam)
        # A tensor of shape [b * k] with index to the previous hyps which was
        # selected.
        out_prev_hyps = tf.reshape(prev_hyps_ids, [-1])

        # A tensor of shape [b * k, seq_len] which contain the attention
        # probabilities over the source words against word in the previous hyps.
        out_atten_probs = tf.reshape(
            fast_gather(in_atten_probs, out_prev_hyps,
                        num_beams * num_hyps_per_beam),
            [num_beams * num_hyps_per_beam, -1])

        sorted_top_k_ids = fast_gather(tf.reshape(token_ids_for_beam, [1, -1]),
                                       flat_sorted_indices,
                                       num_candidate_hyps,
                                       max_value=num_classes - 1,
                                       axis=1)
        sorted_top_k_ids = reorder_tensor("div_to_mod",
                                          sorted_top_k_ids,
                                          num_beams,
                                          num_hyps_per_beam,
                                          max_value=num_classes - 1)

        # A tensor of shape [b * k] with ids of the token selected.
        out_hyps = tf.reshape(sorted_top_k_ids, [-1])

        # A tensor of shape [b * k]. The cumulative score of the selected hyps after
        # the current decoding step.
        out_cumulative_scores = tf.reshape(
            fast_gather(tf.reshape(global_scores_for_beam, [-1, 1]),
                        flat_sorted_indices, num_candidate_hyps), [-1, 1])

        out_cumulative_scores = tf.reshape(
            reorder_tensor("div_to_mod", out_cumulative_scores, num_beams,
                           num_hyps_per_beam), [-1])
        out_best_scores = tf.maximum(best_scores, in_best_scores)

        # A scalar, whether decoding should terminate for all beams. Decoding
        # is done once no active hyp scores above (best score - beam_size).
        out_all_done = tf.reshape(
            tf.math.logical_not(
                tf.reduce_any(
                    tf.greater(
                        out_cumulative_scores,
                        tf.reshape(
                            tf.tile(
                                tf.reshape(out_best_scores - beam_size,
                                           [-1, 1]), [1, num_hyps_per_beam]),
                            [-1])))), [])

        return (out_best_scores, out_cumulative_scores, out_scores,
                out_eos_scores, out_hyps, out_prev_hyps, out_done_hyps,
                out_atten_probs, eos_atten_probs, out_all_done, out_histories)
def merge_hyps(global_score_values, histories_in, mask, num_beams,
               num_hyps_per_beam):
    """Merges candidate hypotheses with identical histories.

    This function takes a set of candidate hypotheses, represented as Tensors of
    scores and histories, and merges all pairs of hypotheses that have identical
    history hashes. When two hypotheses are merged, the hyp with lower global
    score gets "deleted" and has its probability mass added to the higher
    scoring one. Hypotheses are "deleted" by giving them empty history and a
    large negative global score. The function output is a tuple of new
    (global_score_values, histories) Tensors.

    All input Tensors are assumed to be in "div" hypothesis ordering. That is,
    element [i, ...] corresponds to the j-th hyp of the n-th beam, where
    j = i % k and n = i / k.

    Example:
      Suppose num_beams = 1, num_hyps_per_beam = 2, candidates_per_hyp = 5,
      global_score_values is
        [[11 12 13 14 15],
         [17 16 10 19 20]]
      and histories_in is
        [[1 2 3 4 5],
         [5 6 3 7 8]].

      There are two pairs of hypotheses with identical histories that should
      be merged -- two with hash value 3 and two with hash 5. In each pair, the
      one with lower score will be deleted and merged into the one with higher
      score.

      The output is a new set of global_score_values,
        [[ 11     12 13.04 14 -1e34 ],
         [ 17.13  16 -1e34 19 20    ]]
      and new histories
        [[1 2 3 4 0],
         [5 6 0 7 8]].
      Hypotheses deleted in the merge now have zero history and a large negative
      score. The destination of each merge now has additional probability mass.
      (Note _log_sum_exp(13, 10) ~= 13.04 and _log_sum_exp(15, 17) ~= 17.13.)

    Args:
      global_score_values: Tensor of shape [b * k, candidates_per_hyp], the
        global scores of each candidate hypothesis.
      histories_in: int32 Tensor of shape [b * k, candidates_per_hyp], the
        histories of each candidate hypothesis.
      mask: Tensor of shape [b * k, 1] indicating which entries in
        global_score_values and histories_in are valid.
      num_beams: int, the number of beams (b above).
      num_hyps_per_beam: int, the number of hypotheses per beam (k above).

    Returns:
      A tuple of new (global_score_values, histories) updated so that input
      hypotheses with identical histories are now merged. Hypotheses deleted in
      the merge have a new global score of BEST_SCORES_INIT and a history of 0.
    """
    values_dtype = global_score_values.dtype
    candidates_per_hyp = histories_in.get_shape()[1]
    k = num_hyps_per_beam

    # High-level strategy: To detect hyps to merge, we'll permute the hypotheses
    # within each beam so that their histories are in sorted order. We can then
    # in parallel check whether each history is equal to its left or right
    # neighbor (i.e. whether the hyps should be merged), and if so, which of them
    # has the higher global score (the direction of the merge). When two hyps need
    # to be merged, we'll "delete" the one with lower score (by giving it a large
    # negative score and empty history) and add its probability mass to the other.
    # Note we only have to do pair-wise merging once per beam search step, because
    # (ignoring hash collisions) there are at most two candidate hypotheses with
    # any particular history. This follows from the fact that hypotheses are
    # unique at the start of the beam search step, as are the top K non-epsilon
    # extensions of those hypotheses. Thus, if there are two paths with
    # identical histories, they must have the form
    #   h_i <eps> == h_j s  (for some i != j, s != eps),
    # where h_i and h_j are distinct input hypotheses, and s is some non-epsilon
    # symbol.

    # Reshape inputs to [b, num_hyps_per_beam * candidates_per_hyp] so they're
    # grouped by beam.
    histories = histories_in
    orig_scores_shape = tf.shape(global_score_values)
    histories = tf.reshape(histories, [num_beams, -1])
    # Cast to the score dtype: the validity flags are multiplied into dup_mask.
    histories_valid = tf.cast(
        tf.reshape(tf.tile(mask, [1, candidates_per_hyp]), [num_beams, -1]),
        values_dtype)
    # Compute the permutation of hyps within each beam that put the histories in
    # sorted order, and the one that permutates the sorted hyps back to the
    # original order.
    sorted_history_indices = tf.argsort(histories, axis=1)
    inverse_indices = tf.argsort(sorted_history_indices, axis=1)

    def to_flat_indices(column_indices_per_row):
        """Turns per-beam column indices into indices of the flattened tensor."""
        flat_indices = (column_indices_per_row +
                        num_hyps_per_beam * candidates_per_hyp *
                        tf.reshape(tf.range(num_beams), [num_beams, 1]))
        return tf.reshape(flat_indices, [-1])

    # Convert to linear indices so we can use fast_gather.
    sorted_history_indices_flat = to_flat_indices(sorted_history_indices)
    inverse_indices_flat = to_flat_indices(inverse_indices)

    def history_sort(values):
        """Permutes `values` into history-sorted order, one row per beam."""
        return tf.reshape(
            fast_gather(tf.reshape(values,
                                   [-1, 1]), sorted_history_indices_flat,
                        num_beams * k * candidates_per_hyp),
            [num_beams, k * candidates_per_hyp])

    def history_unsort(values):
        """Undoes history_sort and restores the original scores shape."""
        return tf.reshape(
            fast_gather(tf.reshape(values, [-1, 1]), inverse_indices_flat,
                        num_beams * k * candidates_per_hyp), orig_scores_shape)

    sorted_histories = history_sort(histories)
    sorted_histories_valid = history_sort(histories_valid)

    # Indicators of whether each hypothesis is a duplicate of its left/right
    # neighbors.
    # [num_batches, k * candidates_per_hyp - 1]
    dup_mask = tf.cast(
        tf.equal(sorted_histories[:, 1:], sorted_histories[:, :-1]),
        values_dtype) * (sorted_histories_valid[:, 1:] *
                         sorted_histories_valid[:, :-1])
    padding = tf.zeros([num_beams, 1], dtype=values_dtype)
    is_dup_of_left = tf.concat([padding, dup_mask], axis=1)
    is_dup_of_right = tf.concat([dup_mask, padding], axis=1)

    # Examine global scores to see which hyps should be merged, and within those
    # cases, which hyps get deleted/retained in the merge.
    sorted_global_scores = history_sort(global_score_values)
    # Global scores of each hyp's left and right neighbors.
    right_global_scores = tf.concat([sorted_global_scores[:, 1:], padding],
                                    axis=1)
    left_global_scores = tf.concat([padding, sorted_global_scores[:, :-1]],
                                   axis=1)

    # Masks indicating whether each candidate hyp is better or worse than its
    # left or right neighbor. The comparisons are deliberately asymmetric
    # (>= against the right neighbor, > against the left) so that a tie between
    # two neighbors has exactly one winner: the left one.
    is_better_than_right = tf.cast(
        tf.greater_equal(sorted_global_scores, right_global_scores),
        values_dtype)
    is_worse_than_right = 1.0 - is_better_than_right
    is_better_than_left = tf.cast(
        tf.greater(sorted_global_scores, left_global_scores), values_dtype)
    is_worse_than_left = 1.0 - is_better_than_left

    # Determine which hypotheses need to be merged.
    is_merge_source = tf.minimum(
        is_dup_of_left * is_worse_than_left +
        is_dup_of_right * is_worse_than_right, 1.0)
    is_left_merge_dest = is_dup_of_left * is_better_than_left
    is_right_merge_dest = is_dup_of_right * is_better_than_right
    is_merge_dest = tf.minimum(is_left_merge_dest + is_right_merge_dest, 1.0)
    # Mask of hyps unaffected by merging.
    is_unchanged = tf.maximum(1.0 - is_merge_source - is_merge_dest, 0.0)

    sorted_global_scores = (
        is_unchanged * sorted_global_scores +
        is_merge_source * BEST_SCORES_INIT + is_left_merge_dest *
        _log_sum_exp(left_global_scores, sorted_global_scores) +
        is_right_merge_dest *
        _log_sum_exp(right_global_scores, sorted_global_scores))
    # Set histories of deleted (merge source) hyps to zero.
    sorted_histories *= tf.cast(1.0 - is_merge_source, sorted_histories.dtype)

    # Put everything back in its original order and rank.
    global_score_values_out = history_unsort(sorted_global_scores)
    histories_out = history_unsort(sorted_histories)
    return global_score_values_out, histories_out
def _log_sum_exp(a, b):
    """Returns log(exp(a) + exp(b)) elementwise.

    The elementwise maximum of the two operands is subtracted before
    exponentiating and added back afterwards, so neither exponent is ever
    positive and exp() cannot overflow.
    """
    largest = tf.maximum(a, b)
    shifted_exp_sum = tf.exp(a - largest) + tf.exp(b - largest)
    return largest + tf.math.log(shifted_exp_sum)