def EinsumBxycBzxBzyc(self, a, b, name=None):
     """Portable replacement for tf.einsum('bxyc,bzx->bzyc', a, b)."""
     expanded_a = tf.expand_dims(a, axis=1)
     expanded_b = tf.expand_dims(tf.expand_dims(b, axis=-1), axis=-1)
     return tf.reduce_sum(tf.math.multiply(expanded_a,
                                           expanded_b,
                                           name=name),
                          axis=2)
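
# A minimal, self-contained check (illustrative shapes, not from the original
# source) that the expand/multiply/reduce formulation above agrees with the
# native einsum:
import tensorflow as tf

a = tf.random.normal([2, 3, 4, 5])  # [b, x, y, c]
b = tf.random.normal([2, 6, 3])     # [b, z, x]
expected = tf.einsum('bxyc,bzx->bzyc', a, b)
portable = tf.reduce_sum(
    tf.expand_dims(a, 1) * tf.expand_dims(tf.expand_dims(b, -1), -1), axis=2)
# Both results are [b, z, y, c] = [2, 6, 4, 5] and numerically close.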
    def FProp(self, theta, input_batch):
        p = self.params
        with tf.name_scope(p.name):
            inputs = py_utils.with_dependencies([
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            [-1, -1]),
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            tf.shape(input_batch.paddings))
            ], tf.transpose(input_batch.ids))
            paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2)
            if p.packed_input:
                src_segment_id = tf.expand_dims(
                    tf.transpose(input_batch.segment_ids), 2)
            else:
                src_segment_id = None
            xs = self.emb.EmbLookup(theta.emb, inputs)
            xs = self.ApplyClipping(theta, xs)
            summary_utils.histogram('input_emb', xs)
            xs = self.dropout.FProp(theta.dropout, xs)
            ps = paddings
            # Now the rnn layers.
            outputs_list = []
            for i in range(0, p.num_lstm_layers):
                layer = self.rnn[i]
                ys = layer.FProp(theta.rnn[i],
                                 xs,
                                 ps,
                                 segment_id=src_segment_id)
                ys = self.dropout.FProp(theta.dropout, ys)
                if i >= p.residual_start:
                    xs += ys  # Residual skip
                    xs = self.ApplyClipping(theta, xs)
                else:
                    xs = ys
                outputs_list.append(xs)
                summary_utils.histogram('layer_out_%s' % i, xs)

            if p.is_transparent:
                xs = self.transparent_merger.FProp(theta.transparent_merger,
                                                   outputs_list)

            if p.lstm_cell_size * 2 != p.encoder_out_dim:
                # Project to the right depth.
                xs = self.final_proj.FProp(theta.final_proj, xs, ps)
                summary_utils.histogram('final_proj_out', xs)

            if src_segment_id is not None:
                src_segment_id = tf.squeeze(src_segment_id, [2])

            return py_utils.NestedMap(encoded=xs,
                                      padding=tf.squeeze(ps, [2]),
                                      segment_id=src_segment_id)
def SequenceConcat(x, x_paddings, y, y_paddings, pad=0):
    """Concats sequence `x` with sequence `y`.

  This function is length-aware (based on the paddings).

  Args:
    x: A sequence of tokens of shape [batch_size, x_len_max].
    x_paddings: The paddings of `x`.
    y: A sequence of tokens of shape [batch_size, y_len_max].
    y_paddings: The paddings of `y`.
    pad: The <pad> token to fill the concatenated sequence (of type integer).

  Returns:
    A tuple.
      - Concatenation of `x` and `y` of shape
        [batch_size, x_len_max + y_len_max].
      - Paddings of the concatenation of shape
        [batch_size, x_len_max + y_len_max].
  """
    # Get the length (w/ eos).
    x_len = tf.cast(tf.round(tf.reduce_sum(1 - x_paddings, 1)), tf.int32)
    y_len = tf.cast(tf.round(tf.reduce_sum(1 - y_paddings, 1)), tf.int32)

    batch_size = py_utils.GetShape(x)[0]
    y_len_max = py_utils.GetShape(y)[1]

    # Pad `x` with necessary <pad>.
    x = tf.concat([x, tf.fill(py_utils.GetShape(y), pad)], 1)
    # Replace all <pad> with 0.
    x = tf.where(tf.not_equal(x, pad), x, tf.fill(py_utils.GetShape(x), 0))

    # Compute the write indices of `y` in `xy`.
    indices = tf.stack([
        tf.tile(tf.expand_dims(tf.range(batch_size), 1), [1, y_len_max]),
        (tf.tile(tf.expand_dims(tf.range(y_len_max), 0), [batch_size, 1]) +
         tf.expand_dims(x_len, 1)),
    ], 2)

    xy = x + tf.scatter_nd(indices, y, py_utils.GetShape(x))

    # We need to remap all <pad> to `pad`.
    xy = tf.where(
        tf.less(tf.expand_dims(tf.range(py_utils.GetShape(xy)[1]), 0),
                tf.expand_dims(x_len + y_len, 1)), xy,
        tf.fill(py_utils.GetShape(xy), pad))
    xy_paddings = 1 - tf.sequence_mask(x_len + y_len,
                                       py_utils.GetShape(xy)[1],
                                       x_paddings.dtype)
    return xy, xy_paddings
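
# A small usage sketch (toy values; assumes the Lingvo py_utils used above is
# importable): concatenating two padded sequences of true lengths 2 and 1.
import tensorflow as tf

x = tf.constant([[1, 2, 0]])
x_paddings = tf.constant([[0., 0., 1.]])
y = tf.constant([[3, 0]])
y_paddings = tf.constant([[0., 1.]])
xy, xy_paddings = SequenceConcat(x, x_paddings, y, y_paddings, pad=0)
# xy          -> [[1, 2, 3, 0, 0]]
# xy_paddings -> [[0., 0., 0., 1., 1.]]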
Example #4
def ComputeConvOutputPadding(paddings, window, stride,
                             padding_algorithm='SAME'):
  """Computes paddings for convolution and pooling output.

  out_padding[i] == 1 iff any in_padding corresponding to that output is 1.

  Args:
    paddings: The paddings tensor. It is expected to be of shape [batch, time].
    window: The size of the windows.
    stride: The time-stride between adjacent windows.
    padding_algorithm: 'SAME' or 'VALID'.

  Returns:
    out_padding: The new padding tensor of shape [batch, ceil(time / stride)].
  """
  if stride == 1:
    return paddings

  # Pad so input_length divides stride.
  input_length = py_utils.GetShape(paddings)[1]
  pad_len = (input_length + stride - 1) // stride * stride - input_length
  paddings = tf.pad(paddings, [[0, 0], [0, pad_len]], constant_values=1.0)
  out_padding = tf.nn.pool(
      tf.expand_dims(paddings, -1),
      [window],
      'MAX',
      padding=padding_algorithm,
      strides=[stride],
  )
  return tf.squeeze(out_padding, -1)
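
# A small usage sketch (toy values; relies on the definition above and its
# py_utils dependency): with window=2 and stride=2, an output frame is padded
# iff any input frame in its window is padded.
import tensorflow as tf

paddings = tf.constant([[0., 0., 0., 1., 1., 1.]])  # [batch=1, time=6]
out = ComputeConvOutputPadding(paddings, window=2, stride=2)
# out -> [[0., 1., 1.]]  (windows [0, 1], [2, 3], [4, 5])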
 def Step(recurrent_theta, state0, inputs):
     """Computes one decoder step."""
     del inputs
     with tf.name_scope('single_sampler_step'):
         # Compute logits and states.
         bs_result, bs_state1 = pre_step_callback(
             recurrent_theta.theta,
             recurrent_theta.encoder_outputs,
             tf.expand_dims(state0.ids, 1),  # [batch, 1].
             state0.bs_state,
             num_hyps_per_beam=1)
         batch = tf.shape(bs_result.log_probs)[0]
         state1 = py_utils.NestedMap(timestep=state0.timestep + 1)
         state1.logits = bs_result.log_probs
         # Sample ids from logits. [batch].
         state1.ids = tf.reshape(
             tf.random.stateless_categorical(
                 state1.logits / p.temperature,
                 num_samples=1,
                 seed=tf.stack(
                     [recurrent_theta.random_seed, state0.timestep]),
                 dtype=state0.ids.dtype,
                 name='sample_next_id'), [batch])
         if 'is_last_chunk' in bs_result and p.target_eoc_id >= 0:
             state1.ids = tf.where(
                 tf.math.logical_and(
                     bs_result.is_last_chunk,
                     tf.equal(state1.ids, p.target_eoc_id)),
                 tf.fill(tf.shape(state1.ids), p.target_eos_id),
                 state1.ids)
         state1.bs_state = post_step_callback(
             recurrent_theta.theta, recurrent_theta.encoder_outputs,
             state1.ids, bs_state1)
     return state1, py_utils.NestedMap()
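
# A minimal sketch (toy logits, assumed seed values) of the stateless sampling
# used in Step above: the same (seed, timestep) pair always yields the same
# sample, which keeps decoding reproducible.
import tensorflow as tf

logits = tf.math.log(tf.constant([[0.1, 0.6, 0.3]]))
ids = tf.random.stateless_categorical(
    logits, num_samples=1, seed=tf.constant([42, 0]), dtype=tf.int32)
# Re-running with seed [42, 0] returns the identical `ids`.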
    def FProp(self, theta, input_batch):
        """Encodes source as represented by `inputs` and `paddings`.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      input_batch: A `.NestedMap` with fields:
        - ids: The inputs tensor. It is expected to be of shape [batch, time].
        - paddings: The paddings tensor. Expected shape [batch, time].

    Returns:
      A NestedMap containing:

      - encoded: The encoded features, a tensor of shape [time, batch, depth]
      - padding: of shape [time, batch]
      - segment_id: [time, batch] if packed inputs are supported by the model
        (and all layers), or None otherwise.
    """
        p = self.params
        src_segment_id = None
        with tf.name_scope(p.name):
            # Now the rnn layers.
            inputs = py_utils.with_dependencies([
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            [-1, -1]),
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            tf.shape(input_batch.paddings))
            ], tf.transpose(input_batch.ids))
            paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2)
            xs = self.emb.EmbLookup(theta.emb, inputs)
            xs = self.ApplyClipping(theta, xs)
            self._emb_out = xs
            ps = paddings
            # When cc_schedule is specified, make sure lstm_tpl is QuantizedLSTMCell
            # with the same cc_schedule so that the RNN layer output is within
            # clipping range.
            xs = self.rnn[0].FProp(theta.rnn[0], xs, ps)
            xs = self.dropout.FProp(theta.dropout, xs)
            for i in range(1, p.num_lstm_layers):
                layer = self.rnn[i]
                ys, _ = layer.FProp(theta.rnn[i], xs, ps)
                ys = self.dropout.FProp(theta.dropout, ys)
                if hasattr(layer.params, 'cell'):
                    layer_params = layer.params.cell
                else:
                    layer_params = layer.params
                if layer_params.num_input_nodes == layer_params.num_output_nodes:
                    xs += ys  # Residual skip
                    xs = self.ApplyClipping(theta, xs)
                else:
                    # When cc_schedule is specified, make sure lstm_tpl is
                    # QuantizedLSTMCell with the same cc_schedule so that the RNN layer
                    # output is within clipping range.
                    xs = ys
            return py_utils.NestedMap(encoded=xs,
                                      padding=tf.squeeze(ps, [2]),
                                      segment_id=src_segment_id)
    def FProp(self, theta, inputs, paddings, domain_ids=None):
        """Applies data augmentation by randomly mask spectrum in inputs.

    Args:
      theta: A NestedMap object containing weights' values of this layer and its
        children layers.
      inputs: A tensor of shape [batch, time, freq, num_channels].
      paddings: A 0/1 tensor of shape [batch, time].
      domain_ids: input domain_ids of shape [batch, time].

    Returns:
      A pair of tensors:

      - augmented_inputs: A tensor of shape [batch, time, freq, num_channels].
      - paddings: A 0/1 tensor of shape [batch, time].
    """
        p = self.params

        global_seed = None  # A tensor seed in case stateless random ops are needed.
        if p.use_input_dependent_random_seed:
            global_seed = _global_seed_from_inputs(inputs)

        batch_size, series_length, _, _ = py_utils.GetShape(inputs)
        if len(p.domain_ids) > 1:
            augmented_inputs = tf.zeros_like(inputs)
            original_inputs = inputs
            for i, domain_id in enumerate(p.domain_ids):
                augmented_domain = self._AugmentationNetwork(
                    series_length,
                    inputs,
                    paddings,
                    global_seed=global_seed,
                    domain_id_index=i)
                target_domain = tf.cast(tf.expand_dims(
                    tf.tile([domain_id], [batch_size]), -1),
                                        dtype=p.dtype)
                # [batch, time].
                domain_mask = tf.cast(tf.equal(domain_ids, target_domain),
                                      dtype=p.dtype)
                augmented_domain = self.EinsumBxycBxBxyc(
                    augmented_domain, domain_mask, name='einsum_domainmasking')
                original_inputs = self.EinsumBxycBxBxyc(
                    original_inputs,
                    1.0 - domain_mask,
                    name='einsum_domainmasking2')
                augmented_inputs = augmented_domain + augmented_inputs
            augmented_inputs = original_inputs + augmented_inputs
        else:
            augmented_inputs = self._AugmentationNetwork(
                series_length,
                inputs,
                paddings,
                global_seed=global_seed,
                domain_id_index=0)
        return augmented_inputs, paddings
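
# A simplified sketch (illustrative shapes, two domains assumed) of the
# per-domain mixing above: frames whose domain matches take the augmented
# values, all other frames keep the original values.
import tensorflow as tf

inputs = tf.random.normal([2, 5, 4, 1])       # [batch, time, freq, channels]
augmented = tf.random.normal([2, 5, 4, 1])
domain_ids = tf.constant([[0] * 5, [1] * 5])  # [batch, time]
mask = tf.cast(tf.equal(domain_ids, 0), tf.float32)  # 1.0 where domain == 0
mask = mask[:, :, tf.newaxis, tf.newaxis]            # broadcast to 4D
mixed = augmented * mask + inputs * (1.0 - mask)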
            def ApplyBias():
                """Bias and update log_probs and consistent."""
                def TileForBeamAndFlatten(tensor):
                    tensor = tf.reshape(tensor, [1, -1])  # [1, src_batch]
                    tensor = tf.tile(tensor,
                                     [num_hyps_per_beam, 1
                                      ])  # [num_hyps_per_beam, src_batch]
                    tgt_batch = tf.shape(step_ids)[
                        0]  # num_hyps_per_beam*src_batch
                    return tf.reshape(tensor, [tgt_batch])

                # Consistent if step_ids == labels from previous step
                # TODO(navari): Consider updating consistent only if weights > 0. Then
                # re-evaluate the need for bias_only_if_consistent=True.
                # Note that prev_label is incorrect for step 0 but is overridden later.
                prev_label = TileForBeamAndFlatten(
                    tf.gather(labels, tf.maximum(time_step - 1, 0), axis=1))
                is_step0 = tf.equal(time_step, 0)
                local_consistence = tf.math.logical_or(
                    is_step0, tf.equal(prev_label, tf.squeeze(step_ids, 1)))
                consistent = tf.math.logical_and(states.consistent,
                                                 local_consistence)

                # get label, weight slices corresponding to current time_step
                label = TileForBeamAndFlatten(
                    tf.gather(labels, time_step, axis=1))
                weight = TileForBeamAndFlatten(
                    tf.gather(weights, time_step, axis=1))
                if p.bias_only_if_consistent:
                    weight = weight * tf.cast(consistent, p.dtype)

                # convert from dense label to sparse label probs
                vocab_size = tf.shape(bs_results.log_probs)[1]
                uncertainty = tf.constant(
                    1e-10,
                    p.dtype)  # avoid 0 probs which may cause issues with log
                label_probs = tf.one_hot(
                    label,
                    vocab_size,
                    on_value=1 - uncertainty,
                    off_value=uncertainty / tf.cast(vocab_size - 1, p.dtype),
                    dtype=p.dtype)  # [tgt_batch, vocab_size]
                pred_probs = tf.exp(bs_results.log_probs)

                # interpolate predicted probs and label probs
                weight = tf.expand_dims(weight, 1)
                probs = py_utils.with_dependencies([
                    py_utils.assert_less_equal(weight, 1.),
                    py_utils.assert_greater_equal(weight, 0.)
                ], (1.0 - weight) * pred_probs + weight * label_probs)
                return tf.math.log(probs), consistent
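
# A minimal sketch (toy values) of the interpolation in ApplyBias: predicted
# probabilities are mixed with a smoothed one-hot label distribution using a
# per-example weight in [0, 1].
import tensorflow as tf

log_probs = tf.math.log(tf.constant([[0.7, 0.2, 0.1]]))  # [batch, vocab]
label = tf.constant([2])
weight = tf.constant([[0.5]])
uncertainty = 1e-10
label_probs = tf.one_hot(
    label, 3, on_value=1.0 - uncertainty, off_value=uncertainty / 2)
probs = (1.0 - weight) * tf.exp(log_probs) + weight * label_probs
biased_log_probs = tf.math.log(probs)  # never log(0) thanks to uncertainty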
 def GetEncoderEmbeddingsDefaultTheta(self, input_ids, task_ids=None):
     p = self.params
     time_dim = 0 if p.batch_dim else 1
     seq_len = tf.shape(input_ids)[time_dim]
     input_embs = self.src_token_emb.EmbLookup(self.theta.src_token_emb,
                                               input_ids)
     pos_embs = tf.expand_dims(
         self.src_pos_emb.FProp(self.theta.src_pos_emb, seq_len),
         p.batch_dim)
     input_embs += pos_embs
     if task_ids is not None and p.enc_task_emb:
         input_embs += self.src_task_emb.EmbLookup(self.theta.src_task_emb,
                                                   task_ids)
     input_embs = self.src_dropout.FProp(self.theta.src_dropout, input_embs)
     return input_embs
    def FProp(self, theta, input_batch, state0=None):
        p = self.params
        src_segment_id = None
        with tf.name_scope(p.name):
            # Reshape to [t, b]
            inputs = py_utils.with_dependencies([
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            [-1, -1]),
                py_utils.assert_shape_match(tf.shape(input_batch.ids),
                                            tf.shape(input_batch.paddings))
            ], tf.transpose(input_batch.ids))
            paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2)

            # Setup streaming states.
            if not state0:
                state0 = self.zero_state(theta, tf.shape(inputs)[1])
            state1 = py_utils.NestedMap(rnn=[None] * p.num_lstm_layers)

            xs = self.emb.EmbLookup(theta.emb, inputs)
            xs = self.ApplyClipping(theta, xs)
            summary_utils.histogram('input_emb', xs)
            xs = self.dropout.FProp(theta.dropout, xs)
            ps = paddings
            # Now the rnn layers.
            outputs_list = []
            for i in range(0, p.num_lstm_layers):
                layer = self.rnn[i]
                ys, state1.rnn[i] = layer.FProp(theta.rnn[i],
                                                xs,
                                                ps,
                                                state0=state0.rnn[i])
                ys = self.dropout.FProp(theta.dropout, ys)
                if i >= p.residual_start:
                    xs += ys  # Residual skip
                    xs = self.ApplyClipping(theta, xs)
                else:
                    xs = ys
                outputs_list.append(xs)
                summary_utils.histogram('layer_out_%s' % i, xs)

            if p.is_transparent:
                xs = self.transparent_merger.FProp(theta.transparent_merger,
                                                   outputs_list)

            return py_utils.NestedMap(encoded=xs,
                                      padding=tf.squeeze(ps, [2]),
                                      segment_id=src_segment_id,
                                      state=state1)
    def GetEmbeddings(self, emb_theta, emb, pos_emb_theta, pos_emb,
                      dropout_theta, dropout, input_ids, input_pos_ids,
                      task_emb_theta, task_emb, task_ids):
        p = self.params
        time_dim = 0 if p.batch_dim else 1
        seq_len = tf.shape(input_ids)[time_dim]
        input_embs = emb.EmbLookup(emb_theta, input_ids)
        if p.packed_input:  # Packed inputs.
            pos_embs = pos_emb.FPropWithPosition(pos_emb_theta, input_pos_ids)
        else:
            pos_embs = tf.expand_dims(pos_emb.FProp(pos_emb_theta, seq_len),
                                      p.batch_dim)

        input_embs += pos_embs
        if task_emb:
            input_embs += task_emb.EmbLookup(task_emb_theta, task_ids)
        input_embs = dropout.FProp(dropout_theta, input_embs)
        return input_embs
 def GetDecoderEmbeddingsDefaultTheta(self,
                                      input_ids,
                                      task_ids=None,
                                      t=None):
     p = self.params
     input_embs = self.tgt_token_emb.EmbLookup(self.theta.tgt_token_emb,
                                               input_ids)
     if t is None:
         time_dim = 0 if p.batch_dim else 1
         seq_len = tf.shape(input_ids)[time_dim]
         pos_embs = tf.expand_dims(
             self.tgt_pos_emb.FProp(self.theta.tgt_pos_emb, seq_len),
             p.batch_dim)
     else:  # Support decoding.
         pos_embs = tf.slice(
             self.tgt_pos_emb.FProp(self.theta.tgt_pos_emb, p.max_seq_len),
             [t, 0], [1, p.token_emb.embedding_dim])
     input_embs += pos_embs
     if task_ids is not None and p.dec_task_emb:
         input_embs += self.tgt_task_emb.EmbLookup(self.theta.tgt_task_emb,
                                                   task_ids)
     input_embs = self.tgt_dropout.FProp(self.theta.tgt_dropout, input_embs)
     return input_embs
Example #13
 def _MergeOneToken(tokens, i):
   return tf.expand_dims(
       self._MergeTokens((tokens[i], tokens[i + 1])), axis=-1)
    def FProp(self,
              theta,
              x,
              x_paddings=None,
              eos_id=1,
              force_sample_last_token=True):
        """Applies SymbolInsertionLayer.

    We take in `x`, which represents the groundtruth sequence (i.e., the
    English sequence). We return a sampled rollin (observed) canvas (i.e., a
    random subset of the English sequence), as well as the target indices for
    an insertion-based model (i.e., the targets given the random observed
    subset).

    Args:
      theta: Ignored, this can be None.
      x: The symbol ids of shape `[batch_size, time_dim]`.
      x_paddings: The paddings (1 or 0) of shape `[batch_size, time_dim]` where
        0 is valid and 1 is invalid.
      eos_id: The <eos> token id to represent end-of-slot.
      force_sample_last_token: Set True to force sample the last token of `x`.

    Returns:
      A `NestedMap`.
        - canvas: The canvas (based on the `rollin_policy`) of shape
          [batch_size, c_dim]. Note that `c_dim` <= `time_dim`; they need not
          be equal.
        - canvas_indices: The canvas indices (into `x`).
        - canvas_paddings: The paddings of `canvas_indices`.
        - target_indices: The target indices of shape [num_targets, 3].
          `num_targets` is the number of total targets in the entire batch.
          [:, 0] captures the batch, [:, 1] captures the slot, and [:, 2]
          captures the token. Each row [batch, slot, vocab] represents the
          indices of the target -- i.e., the batch, slot and vocab combination
          of the target. Typical usage of these indices is to tf.gather_nd
          the log-probs (from the softmax layer).
        - target_weights: The target weights.

    Raises:
      ValueError: If invalid params.
    """
        p = self.params

        batch_size = py_utils.GetShape(x)[0]
        time_dim = py_utils.GetShape(x)[1]

        if x_paddings is None:
            x_paddings = tf.zeros([batch_size, time_dim], tf.float32)

        oracle_policy = p.oracle_policy
        rollin_policy = (oracle_policy
                         if p.rollin_policy == 'oracle' else p.rollin_policy)

        if rollin_policy != 'uniform':
            raise ValueError('Unknown or unsupported rollin policy: %s' %
                             rollin_policy)
        if oracle_policy != 'uniform':
            raise ValueError('Unknown or unsupported oracle policy: %s' %
                             oracle_policy)

        x_len = tf.cast(tf.round(tf.reduce_sum(1 - x_paddings, 1)), tf.int32)

        # Compute the desired length per example in the batch.
        ratio = tf.random.uniform([batch_size], 0.0, 1.0, seed=p.random_seed)
        if force_sample_last_token:
            c_len = tf.minimum(
                tf.cast(ratio * tf.cast(x_len, tf.float32), tf.int32),
                x_len - 1) + 1
        else:
            c_len = tf.minimum(
                tf.cast(ratio * tf.cast(x_len + 1, tf.float32), tf.int32),
                x_len)
        # Compute the maximum length across the batch.
        c_len_max = tf.reduce_max(c_len)

        # Grab subset of random valid indices per example.
        z_logits = tf.cast(
            tf.expand_dims(tf.range(time_dim), 0) >= tf.expand_dims(x_len, 1),
            tf.float32) * -1e9
        if force_sample_last_token:
            # Force sample the last token -- i.e., as indexed by `x_len - 1`. We can
            # accomplish this by adding +LARGE_NUMBER to the logits.
            z_logits += tf.cast(
                tf.equal(tf.expand_dims(tf.range(time_dim), 0),
                         tf.expand_dims(x_len - 1, 1)), tf.float32) * 1e9
        # Gumbel-max trick to sample (we only sample valid positions per sample in
        # the batch).
        z = -tf.math.log(-tf.math.log(
            tf.random.uniform([batch_size, time_dim], seed=p.random_seed)))
        unused_c_values, c_indices = tf.nn.top_k(z_logits + z, time_dim)

        # Trim everything > c_len_max.
        c_indices = c_indices[:, :c_len_max]

        # Invalidate any indices >= c_len, we use the last index as the default
        # invalid index.
        c_indices = tf.where(
            tf.expand_dims(tf.range(c_len_max), 0) < tf.expand_dims(c_len, 1),
            c_indices, tf.fill(py_utils.GetShape(c_indices), time_dim - 1))

        # Materialize the canvas.
        c_indices = tf.sort(c_indices)
        c = tf.gather_nd(
            x,
            tf.stack([
                tf.reshape(
                    tf.tile(tf.expand_dims(tf.range(batch_size), 1),
                            [1, c_len_max]), [-1]),
                tf.reshape(c_indices, [-1])
            ], 1))
        c = tf.reshape(c, [batch_size, c_len_max])

        # Compute the paddings.
        c_paddings = 1 - tf.sequence_mask(
            c_len, c_len_max, dtype=x_paddings.dtype)
        c *= tf.cast(1 - c_paddings, tf.int32)

        indices = tf.concat([
            tf.reshape(
                tf.tile(tf.expand_dims(tf.range(batch_size), 1),
                        [1, c_len_max]), [batch_size * c_len_max, 1]),
            tf.reshape(c_indices, [batch_size * c_len_max, 1])
        ], 1)
        x_token_is_observed = tf.scatter_nd(
            indices, tf.ones([batch_size * c_len_max], tf.int32),
            py_utils.GetShape(x))
        # `x_segments` captures which slot each `x` belongs to (both observed and
        # tokens that need to be observed).
        x_segments = tf.cumsum(x_token_is_observed, 1, exclusive=True)

        x_token_is_observed = tf.cast(x_token_is_observed, tf.bool)
        prev_x_token_is_observed = tf.pad(x_token_is_observed[:, :-1],
                                          [[0, 0], [1, 0]],
                                          constant_values=True)
        x_token_is_observed = tf.reshape(x_token_is_observed, [-1])
        prev_x_token_is_observed = tf.reshape(prev_x_token_is_observed, [-1])
        x_is_valid = tf.cast(1 - x_paddings, tf.bool)
        x_is_valid = tf.reshape(x_is_valid, [-1])

        # Remap all the observed to <eos>, note some of these need a zero weight
        # (or else there would be <eos> and valid token in the same slot).
        target_indices = tf.cast(tf.reshape(x, [-1, 1]), tf.int32)
        target_indices = tf.where(
            x_token_is_observed,
            tf.fill(py_utils.GetShape(target_indices), eos_id), target_indices)

        # TODO(williamchan): We give uniform 1.0 weight, however, math suggests
        # we may want to weigh this term by the original sequence length.
        target_weights = tf.ones_like(target_indices, tf.float32)

        # We need to set all the weights for <eos> which actually have valid tokens
        # in the slot to zero.
        target_weights = tf.where(
            x_token_is_observed & ~prev_x_token_is_observed,
            tf.zeros_like(target_weights), target_weights)

        # TODO(williamchan): Consider dropping the entries w/ weight zero.

        # Add the batch and slot indices.
        target_indices = tf.concat([
            tf.reshape(
                tf.tile(tf.expand_dims(tf.range(batch_size), 1),
                        [1, time_dim]), [batch_size * time_dim, 1]),
            tf.reshape(x_segments, [-1, 1]), target_indices
        ], 1)

        # Select only the valid indices. The selected valid ones include slots w/
        # <eos>.
        target_indices = target_indices[x_is_valid]
        target_weights = target_weights[x_is_valid]

        return py_utils.NestedMap(canvas=c,
                                  canvas_indices=c_indices,
                                  canvas_paddings=c_paddings,
                                  target_indices=target_indices,
                                  target_weights=target_weights)
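
# A standalone sketch (illustrative values) of the Gumbel-max trick used above
# to sample a random subset of valid positions per row: add Gumbel noise to
# masked logits and take the top-k indices.
import tensorflow as tf

batch, time_dim, k = 2, 6, 3
x_len = tf.constant([6, 4])
valid = tf.cast(
    tf.range(time_dim)[tf.newaxis, :] < x_len[:, tf.newaxis], tf.float32)
logits = (1.0 - valid) * -1e9  # invalid positions get -inf-like logits
gumbel = -tf.math.log(-tf.math.log(tf.random.uniform([batch, time_dim])))
_, idx = tf.nn.top_k(logits + gumbel, k)  # k distinct valid indices per row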
 def EinsumBxyBxBxy(self, a, b, name=None):
     """Portable replacement for tf.einsum('bxy,bx->bxy', a, b)."""
     return tf.math.multiply(a, tf.expand_dims(b, axis=2), name=name)
    def _TimeMask(self,
                  inputs,
                  seq_lengths,
                  global_seed,
                  noisify=False,
                  gaussian_noise=False,
                  dtype=tf.float32,
                  domain_id_index=0):
        """Applies time masking with given degree to inputs.

    Args:
      inputs: Batch of input features of shape (batch_size, time_length,
        num_freq, channels).
      seq_lengths: The actual sequence lengths, based on which the masks are
        sampled, of shape (batch_size,).
      global_seed: an integer seed tensor for stateless random ops.
      noisify: Whether to noisify the masked out regions.
      gaussian_noise: Whether to use gaussian noise when noisifying.
      dtype: Data type.
      domain_id_index: domain id index.

    Returns:
      Inputs with random time masking applied.
    """
        p = self.params

        # Get time masking parameters.
        time_mask_max_frames = p.time_mask_max_frames[domain_id_index]
        time_masks_per_frame = p.time_masks_per_frame[domain_id_index]
        use_dynamic_time_mask_max_frames = (
            p.use_dynamic_time_mask_max_frames[domain_id_index])
        multiplicity = p.time_mask_count[domain_id_index]
        max_ratio = p.time_mask_max_ratio[domain_id_index]

        # If maximum mask length is zero, do nothing.
        if ((time_mask_max_frames == 0
             and not use_dynamic_time_mask_max_frames) or max_ratio <= 0.0):
            return inputs
        if multiplicity == 0:
            return inputs
        seq_lengths = tf.cast(seq_lengths, tf.int32)
        batch_size, time_length, _, _ = py_utils.GetShape(inputs)

        # When using dynamic time mask size, discard upper-bound on
        # maximum allowed frames for time mask.
        if use_dynamic_time_mask_max_frames:
            time_mask_max_frames = None
        # Create masks in time direction and apply.
        block_arrays = self._GetMask(batch_size,
                                     choose_range=seq_lengths,
                                     mask_size=time_length,
                                     global_seed=global_seed,
                                     max_length=time_mask_max_frames,
                                     masks_per_frame=time_masks_per_frame,
                                     multiplicity=multiplicity,
                                     dtype=dtype,
                                     max_ratio=max_ratio)

        # Non-empty random seed values are only used for testing or when using
        # stateless random ops. seed_6 and seed_7 are set separately to avoid
        # correlation of warp magnitude and origin position.
        if p.use_input_dependent_random_seed:
            seed_6 = global_seed + 6
            seed_7 = global_seed + 7
        else:
            seed_6 = p.random_seed
            seed_7 = p.random_seed

        outputs = self.EinsumBxycBxBxyc(inputs,
                                        block_arrays,
                                        name='einsum_formasking')
        if noisify:
            # Sample noise with standard deviation factor * 0.1 + 0.0001.
            # TODO(ngyuzh): Make sure this won't affect EOS.
            if gaussian_noise:
                stddev = 1.0
            else:
                random_uniform = _random_uniform_op(
                    p.use_input_dependent_random_seed)
                factor = random_uniform(shape=(),
                                        minval=1.0,
                                        maxval=2.0,
                                        dtype=dtype,
                                        seed=seed_6)
                stddev = factor * 0.1 + 0.0001
            random_normal = _random_normal_op(
                p.use_input_dependent_random_seed)
            noise = random_normal(shape=[
                tf.shape(inputs)[0],
                tf.shape(inputs)[1],
                tf.shape(inputs)[2]
            ],
                                  stddev=stddev,
                                  seed=seed_7)
            if p.fprop_dtype is not None and p.fprop_dtype != p.dtype:
                noise = tf.cast(noise, p.fprop_dtype)
            outputs_mask = self.EinsumBxyBxBxy(noise,
                                               1.0 - block_arrays,
                                               name='einsum_fornoisymasking')
            outputs = outputs + tf.expand_dims(outputs_mask, -1)

        return outputs
Example #17
    def BuildDataSource(self, data_source_from_file_pattern_fn):
        """Read and return input batch from a p.file_pattern list.

    `p.file_patterns` is a list of file patterns, and `p.weights` contains
    weights for each file pattern.  If provided, `p.bprop_variable_filters`
    includes a bprop_variable_filter for each file pattern.

    Args:
      data_source_from_file_pattern_fn: a function that takes file_pattern as an
        argument and returns an input batch.

    Returns:
      A NestedMap containing:
        data: a tuple of tf.Tensor or `.NestedMap` of tf.Tensor
        source_selected: a tensor of size [batch_size, number of data sources]
        selected_bprop: a tensor of size [number of data sources]
        bprop_variable_filters: a list of bprop_variable filters, one for each
          source.

    Raises:
      ValueError: If unknown token type.
    """
        p = self.params

        def _MakeDataSourceFromFilePatternFunc(
                data_source_from_file_pattern_fn, file_pattern):
            # It's important to invoke self._DataSourceFromFilePattern() inside the
            # lambda to make sure that the record is drawn from data source
            # only if it will be used. Weights are handled by MixByWeight, not the
            # data_source_from_file_pattern_fn.
            return lambda: data_source_from_file_pattern_fn(file_pattern)

        if len(p.weights) != len(p.file_patterns):
            raise ValueError(
                'Expected p.file_patterns and p.weights to be the same length. '
                'Found %d file_patterns, and %d weights' %
                (len(p.file_patterns), len(p.weights)))
        if not all(isinstance(x, six.string_types) for x in p.file_patterns):
            raise ValueError(
                'Expected all elements of p.file_patterns to be strings')

        # TODO(rosenberg) replace this with functools.partial
        inputs = [
            _MakeDataSourceFromFilePatternFunc(
                data_source_from_file_pattern_fn, file_pattern)
            for file_pattern in p.file_patterns
        ]
        weights = p.weights
        if not p.bprop_variable_filters:
            bprop_variable_filters = [''] * len(inputs)
        else:
            bprop_variable_filters = p.bprop_variable_filters

        data_source, selected_bprop = py_utils.MixByWeight(inputs,
                                                           weights,
                                                           seed=p.random_seed)
        # TODO(neerajgaur): Remove _bprop_onehot and change code that uses it to
        # use source_selected from input_batch.
        batch_size = py_utils.GetShape(tf.nest.flatten(data_source)[0])[0]
        ret = py_utils.NestedMap()
        ret.data = data_source
        ret.bprop_variable_filters = bprop_variable_filters
        ret.selected_bprop = selected_bprop
        ret.source_selected = tf.tile(tf.expand_dims(selected_bprop, 0),
                                      [batch_size, 1])
        return ret
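
# A rough tf.data analogue (not the Lingvo API) of the weighted mixing done by
# py_utils.MixByWeight above: each output element is drawn from one of the
# input datasets with the given probabilities.
import tensorflow as tf

ds_a = tf.data.Dataset.from_tensors(0).repeat()
ds_b = tf.data.Dataset.from_tensors(1).repeat()
mixed = tf.data.experimental.sample_from_datasets(
    [ds_a, ds_b], weights=[0.8, 0.2], seed=123)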
    def FProp(self, theta, input_batch):
        """Embeds source ids and transforms with TransformerStack.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      input_batch: A `.NestedMap` object containing:
        - ids: The inputs tensor of shape [batch, time].
        - paddings: The ids' paddings of shape [batch, time].

    Returns:
      A `.NestedMap` object containing:
        encoded - The encoded features of shape [time, batch, dim] or [batch,
          time, dim], depending on p.output_data_format.
        padding - The encoded features' padding of shape [time, batch] or
          [batch, time].
        segment_id - The segmentation of packed inputs of shape [time, batch] or
          [batch, time] if it is supported by the model, or None otherwise.
        embedded_inputs - The embedded inputs tokens without positional
          encodings of shape [time, batch, dim] or [batch, time, dim].
    """

        p = self.params
        with tf.name_scope(p.name):
            # [batch, time]
            input_ids = input_batch.ids
            # [batch, time]
            paddings = input_batch.paddings

            # [batch, time]
            segment_ids = input_batch.segment_ids if p.packed_input else None

            batch = py_utils.GetShape(input_ids)[0]
            time = py_utils.GetShape(input_ids)[1]

            # Embedding layer.
            # [batch, time, dim]
            if not p.shared_emb:
                input_embs = self.token_emb.EmbLookup(theta.token_emb,
                                                      input_ids)
            else:
                input_embs = self.softmax.EmbLookup(theta.softmax, input_ids)
            orig_input_embs = input_embs

            # [1, time, dim]
            if p.packed_input:
                positions = input_batch.segment_pos
                position_embs = tf.expand_dims(
                    self.position_emb.FPropWithPosition(
                        theta.position_emb, positions), 0)
            else:
                position_embs = tf.expand_dims(
                    self.position_emb.FProp(theta.position_emb, time), 0)

            # [batch, time, dim]
            input_embs += tf.cast(position_embs, tf.bfloat16)

            if p.input_dropout_tpl.fprop_dtype:
                input_embs = tf.cast(input_embs,
                                     p.input_dropout_tpl.fprop_dtype)
                paddings = tf.cast(paddings, p.input_dropout_tpl.fprop_dtype)

            input_embs = self.input_dropout.FProp(theta.input_dropout,
                                                  input_embs)
            # [batch, time, dim]
            transformer_input = input_embs
            # Explicitly set the input shape of Transformer layers, to avoid
            # unknown shape error occurred to tf.einsum on nonTPU devices.
            transformer_input = tf.reshape(transformer_input,
                                           [batch, time, p.model_dim])

            # Compute self-attention segment mask once.
            if p.packed_input:
                segment_mask = batch_major_attention.SegmentMask(
                    segment_ids, segment_ids, dtype=transformer_input.dtype)
            else:
                segment_mask = tf.zeros([batch, 1, time, time])

            encoded, padding = self.transformer_stack.FProp(
                theta.transformer_stack, transformer_input, paddings,
                segment_mask)

            if p.final_layer_norm:
                encoded = self.final_ln.FProp(theta.final_ln, encoded)

            seq_lengths = tf.cast(tf.reduce_sum(1. - padding, axis=1),
                                  tf.int32)

            if p.output_data_format == 'TBC':
                encoded = tf.transpose(encoded,
                                       [1, 0, 2])  # [time, batch, dim]
                padding = tf.transpose(padding)  # [time, batch]
                segment_ids = tf.transpose(
                    segment_ids) if p.packed_input else None
                orig_input_embs = tf.transpose(orig_input_embs, [1, 0, 2])

            return py_utils.NestedMap(
                encoded=encoded,
                padding=padding,
                seq_lengths=seq_lengths,  # used by beam_search_helper.
                segment_id=segment_ids,
                embedded_inputs=orig_input_embs)
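
# A hedged sketch of a packed-input segment mask (the actual SegmentMask
# helper may differ in convention): positions in different segments get a
# large negative additive bias so they cannot attend to each other.
import tensorflow as tf

segment_ids = tf.constant([[1., 1., 2., 2.]])  # [batch, time]
same_segment = tf.cast(
    tf.equal(segment_ids[:, :, tf.newaxis], segment_ids[:, tf.newaxis, :]),
    tf.float32)                                # [batch, time, time]
segment_mask = (1.0 - same_segment)[:, tf.newaxis, :, :] * -1e9  # [b,1,t,t]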
    def _GetMask(self,
                 batch_size,
                 choose_range,
                 mask_size,
                 global_seed,
                 max_length=None,
                 masks_per_frame=0.0,
                 multiplicity=1,
                 dtype=tf.float32,
                 max_ratio=1.0):
        """Returns fixed size multi-masks starting from random positions.

    A multi-mask is a mask obtained by applying multiple masks.

    This function when max_length is given:
      1) Sample random mask lengths less than max_length with shape
         (batch_size, multiplicity).
      2) Truncate lengths to a max of (choose_range * max_ratio),
         so that each mask is fully contained within the corresponding sequence.
      3) Randomly sample start points of shape (batch_size, multiplicity)
         within (choose_range - lengths).
      4) For each batch, multiple masks (whose number is given by the
         multiplicity) are constructed.
      5) Return a mask of shape (batch_size, mask_size) where masks are
         obtained by composing the masks constructed in step 4).
         If masks_per_frame > 0, the number is given by
         min(masks_per_frame * choose_range, multiplicity).
         If not, all the masks are composed. The masked regions are set to zero.

    This function when max_length is not given:
      1) Sample random mask lengths less than (choose_range * max_ratio)
         with shape (batch_size, multiplicity).
      2) Proceed to steps 3), 4) and 5) of the above.

    Args:
      batch_size: Batch size. Integer number.
      choose_range: Range within which the masked entries must lie. Tensor of
        shape (batch_size,).
      mask_size: Size of the mask. Integer number.
      global_seed: an integer seed tensor for stateless random ops.
      max_length: Maximum number of allowed consecutive masked entries. Integer
        number or None.
      masks_per_frame: Number of masks per frame. Float number. If > 0, the
        multiplicity of the mask is set to be masks_per_frame * choose_range.
      multiplicity: Maximum number of total masks. Integer number.
      dtype: Data type.
      max_ratio: Maximum portion of the entire range allowed to be masked. Float
        number.

    Returns:
      mask: a fixed size multi-mask starting from a random position with shape
      (batch_size, mask_size).
    """
        p = self.params
        # Non-empty random seed values are only used for testing or when using
        # stateless random ops. seed_1 and seed_2 are set separately to avoid
        # correlation of mask size and mask position.
        if p.use_input_dependent_random_seed:
            seed_1 = global_seed + 1
            seed_2 = global_seed + 2
        elif p.random_seed:
            seed_1 = p.random_seed + 1
            seed_2 = 2 * p.random_seed
        else:
            seed_1 = p.random_seed
            seed_2 = p.random_seed
        # Sample lengths for multiple masks.
        if max_length and max_length > 0:
            max_length = tf.broadcast_to(tf.cast(max_length, dtype),
                                         (batch_size, ))
        else:
            max_length = tf.cast(choose_range, dtype=dtype) * max_ratio
        random_uniform = _random_uniform_op(p.use_input_dependent_random_seed)
        masked_portion = random_uniform(shape=(batch_size, multiplicity),
                                        minval=0.0,
                                        maxval=1.0,
                                        dtype=dtype,
                                        seed=seed_1)
        masked_frame_size = self.EinsumBBmBm(max_length, masked_portion)
        masked_frame_size = tf.cast(masked_frame_size, dtype=tf.int32)
        # Make sure the sampled length is at most max_ratio * choose_range.
        # Note that sampling in this way is biased
        # (shorter sequences may be over-masked).
        choose_range = tf.expand_dims(choose_range, -1)
        choose_range = tf.tile(choose_range, [1, multiplicity])
        length_bound = tf.cast(choose_range, dtype=dtype)
        length_bound = tf.cast(max_ratio * length_bound, dtype=tf.int32)
        length = tf.minimum(masked_frame_size, tf.maximum(length_bound, 1))

        # Choose starting point.
        random_start = random_uniform(shape=(batch_size, multiplicity),
                                      maxval=1.0,
                                      seed=seed_2)
        start_with_in_valid_range = random_start * tf.cast(
            (choose_range - length + 1), dtype=dtype)
        start = tf.cast(start_with_in_valid_range, tf.int32)
        end = start + length - 1

        # Shift starting and end point by small value.
        delta = tf.constant(0.1)
        start = tf.expand_dims(tf.cast(start, dtype) - delta, -1)
        start = tf.tile(start, [1, 1, mask_size])
        end = tf.expand_dims(tf.cast(end, dtype) + delta, -1)
        end = tf.tile(end, [1, 1, mask_size])

        # Construct pre-mask of shape (batch_size, multiplicity, mask_size).
        diagonal = tf.expand_dims(
            tf.expand_dims(tf.cast(tf.range(mask_size), dtype=dtype), 0), 0)
        diagonal = tf.tile(diagonal, [batch_size, multiplicity, 1])
        pre_mask = tf.cast(tf.math.logical_and(diagonal < end,
                                               diagonal > start),
                           dtype=dtype)

        # Sum masks with appropriate multiplicity.
        if masks_per_frame > 0:
            multiplicity_weights = tf.tile(
                tf.expand_dims(tf.range(multiplicity, dtype=dtype), 0),
                [batch_size, 1])
            multiplicity_tensor = masks_per_frame * tf.cast(choose_range,
                                                            dtype=dtype)
            multiplicity_weights = tf.cast(
                multiplicity_weights < multiplicity_tensor, dtype=dtype)
            pre_mask = self.EinsumBmtBmBt(pre_mask, multiplicity_weights)
        else:
            pre_mask = tf.reduce_sum(pre_mask, 1)
        mask = tf.cast(1.0 - tf.cast(pre_mask > 0, dtype=dtype), dtype=dtype)

        if p.fprop_dtype is not None and p.fprop_dtype != p.dtype:
            mask = tf.cast(mask, p.fprop_dtype)

        return mask
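
# A simplified sketch (single mask per row, toy values) of the interval
# construction in steps 3)-5) above: positions strictly between the shifted
# start and end points are masked to zero.
import tensorflow as tf

mask_size = 8
start = tf.constant([[1.], [4.]]) - 0.1  # [batch, 1], shifted by delta
end = tf.constant([[3.], [6.]]) + 0.1
pos = tf.cast(tf.range(mask_size), tf.float32)[tf.newaxis, :]
inside = tf.cast((pos > start) & (pos < end), tf.float32)
mask = 1.0 - inside  # 0.0 inside each [start, end] interval, 1.0 elsewhere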
Example #20
 def FProp(self, theta, inputs, paddings):
   paddings_expanded = tf.expand_dims(tf.expand_dims(paddings, -1), -1)
   bned = super(ConvBatchNormLayer, self).FProp(
       theta, inputs, paddings_expanded)
   return bned, paddings
Example #21
 def _ApplyPadding(tensor_in, padding_in):
   padding_expanded = tf.expand_dims(tf.expand_dims(padding_in, -1), -1)
   return tensor_in * (1.0 - padding_expanded)
Example #22
 def FProp(self, theta, inputs, paddings):
   paddings_expanded = tf.expand_dims(tf.expand_dims(paddings, -1), -1)
   return inputs * (1.0 - paddings_expanded), paddings
 def EinsumBxycBxBxyc(self, a, b, name=None):
     """Portable replacement for tf.einsum('bxyc,bx->bxyc', a, b)."""
     expanded_b = tf.expand_dims(tf.expand_dims(b, axis=2), axis=3)
     return tf.math.multiply(a, expanded_b, name=name)
Example #24
    def _CreateCanvasAndTargets(self, batch):
        # pyformat: disable
        """Create the canvas and targets.

    Args:
      batch: A `.NestedMap`.

        - src: A `.NestedMap`.
          - ids: The source ids, ends in <eos>.
          - paddings: The source paddings.

        - tgt: A `.NestedMap`.
          - ids: The target ids, ends in <eos>.
          - paddings: The target paddings.

    Returns:
      A `NestedMap`.
        - canvas: The canvas (based on the `rollin_policy`) of shape
          [batch_size, c_dim].
        - canvas_paddings: The paddings of `canvas_indices`.
        - target_indices: The target indices (i.e., use these indices to
          tf.gather_nd the log-probs). Optional, only during training.
        - target_weights: The target weights. Optional, only during training.
    """
        # pyformat: enable
        p = self.params

        if not self.do_eval:
            # Sample our src and tgt canvas.
            src_descriptor = self._SampleCanvasAndTargets(
                batch.src.ids, batch.src.paddings)
            tgt_descriptor = self._SampleCanvasAndTargets(
                batch.tgt.ids, batch.tgt.paddings)

            # Offset the src ids (to unshare embeddings between src/tgt). Note, we
            # only offset the canvas ids, but we do not offset the vocab ids. This
            # will result in unshared embeddings, but shared softmax. This is due to
            # GPU/TPU memory limitations, empirically it is known that unsharing
            # everything results in better performance.
            vocab_size = p.decoder.softmax.num_classes
            src_descriptor.canvas = tf.where(
                tf.equal(src_descriptor.canvas_paddings, 0),
                src_descriptor.canvas + vocab_size, src_descriptor.canvas)

            # Offset the tgt indices (need shift according to src length).
            batch_size = py_utils.GetShape(batch.src.ids)[0]
            # `target_batch` is a [num_targets, batch_size] tensor where each row
            # identifies which batch the target belongs to. Note that
            # tf.reduce_sum(target_batch, 1) == 1 for every row.
            target_batch = tf.cast(
                tf.equal(
                    tf.expand_dims(tf.range(batch_size), 0),
                    tf.expand_dims(tgt_descriptor.target_indices[:, 0], 1)),
                tf.int32)
            src_lens = tf.cast(
                tf.reduce_sum(1 - src_descriptor.canvas_paddings, 1), tf.int32)
            # `tgt_offset` is shape [num_targets] where each entry corresponds to the
            # offset needed for that target (due to the source length).
            tgt_offset = tf.matmul(target_batch, tf.expand_dims(src_lens, 1))
            # We shift the tgt slot without touching the batch or vocab.
            tgt_descriptor.target_indices += tf.concat([
                tf.zeros_like(tgt_offset), tgt_offset,
                tf.zeros_like(tgt_offset)
            ], 1)

            # The canvas is simply the sequence-level concat of the src and tgt.
            canvas, canvas_paddings = insertion.SequenceConcat(
                src_descriptor.canvas, src_descriptor.canvas_paddings,
                tgt_descriptor.canvas, tgt_descriptor.canvas_paddings)
            target_indices = tf.concat(
                [src_descriptor.target_indices, tgt_descriptor.target_indices],
                0)
            target_weights = tf.concat(
                [src_descriptor.target_weights, tgt_descriptor.target_weights],
                0)

            return py_utils.NestedMap(canvas=canvas,
                                      canvas_paddings=canvas_paddings,
                                      target_indices=target_indices,
                                      target_weights=target_weights)
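
# A minimal sketch (toy values) of the offset computation above: a one-hot
# [num_targets, batch] matrix multiplied by per-example source lengths gives
# each target the offset of its own batch element.
import tensorflow as tf

target_batch_ids = tf.constant([0, 0, 1])          # batch id per target
src_lens = tf.constant([[3], [5]])                 # [batch, 1]
target_batch = tf.one_hot(target_batch_ids, 2, dtype=tf.int32)
tgt_offset = tf.matmul(target_batch, src_lens)     # [[3], [3], [5]]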
def MergeBeamSearchOutputs(max_hyps_per_beam, beam_search_outputs):
    """Merges beam search hyps from multiple decoders.

  Args:
    max_hyps_per_beam: the number of top hyps in the merged results. Must be
      less than or equal to total number of input hyps.
    beam_search_outputs: a list of BeamSearchDecodeOutput objects. Must share
      the same source_batch and max sequence length.

  Returns:
    A BeamSearchDecodeOutput object containing max_hyps_per_beam hypotheses per
    beam.
  """
    source_batch = tf.shape(beam_search_outputs[0].topk_hyps)[0]
    value_dict = {}
    for output in beam_search_outputs:
        hyps_per_beam = py_utils.with_dependencies([
            py_utils.assert_equal(source_batch,
                                  tf.shape(output.topk_hyps)[0]),
        ], tf.shape(output.topk_hyps)[1])
        for k, v in six.iteritems(output._asdict()):
            if v is None:
                continue
            if k == 'done_hyps':
                v = tf.transpose(v)
            if k not in value_dict:
                value_dict[k] = []
            value_dict[k].append(
                tf.reshape(v, [source_batch, hyps_per_beam, -1]))

    # Concatenate the tensors along the 'num_hyps_per_beam' dimension.
    concatenated = {}
    for k, values in six.iteritems(value_dict):
        if len(values) != len(beam_search_outputs):
            raise ValueError('Incomplete values for %s: %s' %
                             (k, beam_search_outputs))
        concatenated[k] = tf.concat(values, axis=1)

    scores = concatenated['topk_scores']
    scores = tf.where(tf.equal(concatenated['topk_lens'], 0),
                      tf.fill(tf.shape(scores), -1e6), scores)
    scores = tf.squeeze(scores, -1)

    # Select top max_hyps_per_beam indices per beam.
    _, top_indices = tf.nn.top_k(scores, max_hyps_per_beam)
    batch_ids = tf.tile(tf.expand_dims(tf.range(source_batch), -1),
                        [1, max_hyps_per_beam])
    # [source_batch, max_hyps_per_beam, 2]
    gather_indices = tf.stack([batch_ids, top_indices], axis=-1)

    # Gather the merged top hyps according to 'gather_indices'.
    top = beam_search_outputs[0]._asdict()
    total_hyps = source_batch * max_hyps_per_beam
    for k, v in six.iteritems(concatenated):
        v = tf.gather_nd(v, gather_indices)
        if k == 'done_hyps':
            v = tf.transpose(tf.reshape(v, [total_hyps, -1]))
        elif k == 'topk_hyps':
            v = tf.reshape(v, [source_batch, max_hyps_per_beam])
        elif k == 'topk_ids':
            v = tf.reshape(v, [total_hyps, -1])
        elif k in ('topk_lens', 'topk_scores', 'topk_decoded'):
            v = tf.reshape(v, [total_hyps])
        else:
            raise ValueError('Unexpected field: %s' % k)
        top[k] = v
    return BeamSearchDecodeOutput(**top)
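
# A minimal sketch (toy scores) of the per-beam top-k gather used above: pair
# batch ids with top-k hyp indices and gather_nd the merged tensors.
import tensorflow as tf

scores = tf.constant([[0.1, 0.9, 0.5],
                      [0.4, 0.2, 0.8]])            # [source_batch, hyps]
_, top_indices = tf.nn.top_k(scores, k=2)
batch_ids = tf.tile(tf.range(2)[:, tf.newaxis], [1, 2])
gather_indices = tf.stack([batch_ids, top_indices], axis=-1)
top_scores = tf.gather_nd(scores, gather_indices)  # [[0.9, 0.5], [0.8, 0.4]]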
def AddAttentionSummaryBatchMajor(name,
                                  attention_tensors,
                                  src_paddings,
                                  tgt_paddings,
                                  transcripts=None,
                                  max_outputs=3):
  """Adds an image summary showing the attention probability matrix and state.

  As opposed to AddAttentionSummary(), this function takes all tensors with the
  batch dimension in axis 0.

  Args:
    name: Summary name.
    attention_tensors: A list of 3D tensors shaped [batch_size, target_len,
      source_len] where attention[b, i, j] is the probability for the i-th
      output attending to the j-th input for element b in the batch.
    src_paddings: A tensor of binary paddings shaped [batch, source_len] for
      the source sequence, or a list of such tensors of the same length as
      attention_tensors, with separate paddings for each entry in
      attention_tensors.
    tgt_paddings: A tensor of binary paddings shaped [batch, target_len] for
      the target sequence, or a list of such tensors of the same length as
      attention_tensors, with separate paddings for each entry in
      attention_tensors.
    transcripts: Optional, transcripts shaped [batch, source_len] for the source
      sequence.
    max_outputs: Integer maximum number of elements of the batch to plot.
  """

  def VerifyLen(paddings):
    length = len(paddings) if isinstance(paddings, list) else 1
    if length != 1 and length != len(attention_tensors):
      raise ValueError('Bad length of paddings list {}'.format(length))

  VerifyLen(src_paddings)
  VerifyLen(tgt_paddings)

  if not _ShouldAddSummary():
    return

  def ToLengths(paddings):
    paddings = paddings if isinstance(paddings, list) else [paddings]
    return [SequenceLength(p) for p in paddings]

  def Get(lengths, i):
    return lengths[0 if len(lengths) == 1 else i]

  src_lens = ToLengths(src_paddings)
  tgt_lens = ToLengths(tgt_paddings)

  with plot.MatplotlibFigureSummary(
      name + '/Attention',
      max_outputs=max_outputs,
      gridspec_kwargs={'hspace': 0.3}) as fig:
    for n, atten in enumerate(attention_tensors):
      # Diagnostic metric that decreases as attention picks up.
      max_entropy = tf.math.log(tf.cast(Get(src_lens, n), tf.float32))
      max_entropy = tf.expand_dims(tf.expand_dims(max_entropy, -1), -1)
      atten_normalized_entropy = -atten * tf.math.log(atten +
                                                      1e-10) / max_entropy
      scalar(name + '/Attention/average_normalized_entropy/%d' % n,
             tf.reduce_mean(atten_normalized_entropy))
      args = [atten, Get(src_lens, n), Get(tgt_lens, n)]
      if transcripts is not None and n == 0:
        args.append(transcripts)
      fig.AddSubplot(
          args,
          TrimPaddingAndPlotAttention,
          title=name,
          xlabel='Input',
          ylabel='Output')
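# A self-contained sketch of the normalized-entropy diagnostic computed above.
# `_NormalizedAttentionEntropySketch` is illustrative only and not part of the
# library; the summary above averages the element-wise terms, while this
# sketch computes the per-row entropy explicitly. For a uniform row over s
# source positions the entropy is log(s), so the normalized value is 1.0; it
# approaches 0 as attention peaks.
def _NormalizedAttentionEntropySketch():
  # [batch=1, target_len=2, source_len=4]; each row sums to 1.
  atten = tf.constant([[[0.25, 0.25, 0.25, 0.25],
                        [0.97, 0.01, 0.01, 0.01]]])
  max_entropy = tf.math.log(4.0)
  entropy = -tf.reduce_sum(atten * tf.math.log(atten + 1e-10), axis=-1)
  return entropy / max_entropy  # ~[[1.0, 0.12]]: diffuse vs. peaked rows.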
def beam_search_step(in_scores,
                     in_atten_probs,
                     in_best_scores,
                     in_cumulative_scores,
                     in_histories,
                     cur_step,
                     eos_id,
                     num_beams,
                     beam_size,
                     num_hyps_per_beam,
                     valid_eos_max_logit_delta=5.0,
                     local_eos_threshold=-100.0,
                     merge_paths=False,
                     is_last_chunk=None,
                     eoc_id=0):
    """A single step of beam search.

  Let "b" be the number of beams, "k" be the number hyps in each beam. This
  function supports values with dtypes tf.float32 or tf.bfloat16.

  The following data structures are allocated before the first decoding step
  and are passed along from one decoding step to the next:

  Args:
    in_scores: A tensor of shape [b * k, vocab_size], where row i holds the
      token scores of the j-th hyp of the n-th beam, with j = i // b and
      n = i % b (mod sharding).
    in_atten_probs: A tensor of shape [b * k, s_len], where in_atten_probs[i, ...]
      are the attention probabilities over the source words for the j-th hyp of
      the n-th beam (with j and n derived as above).
    in_best_scores: A vector of size [b], best scores of terminated hyps so far
      in each of the beams.
    in_cumulative_scores: A vector of size [b * k]. The cumulative score of each
      active hyp before the current step.
    in_histories: An int32 vector of size [b * k] containing hashes of the
      histories of each active hyp. If 'merge_paths' is enabled, the histories
      are used to identify hypotheses that are identical modulo epsilons (e.g.
      "a <eps> b" and "a b <eps>") and merge them. See 'update_histories'
      docstring for details.
    cur_step: Current step id.
    eos_id: Token id of the special end of sequence token.
    num_beams: Number of beams.
    beam_size: The pruning tolerance. Search terminates when the scores of all
      active hyps in a beam fall more than this delta below the beam's best
      terminated score.
    num_hyps_per_beam: Number of hyps in a beam.
    valid_eos_max_logit_delta: We allow </s> to terminate a hyp only if its
      logit is no more than 'valid_eos_max_logit_delta' away from the logit of
      the best candidate.
    local_eos_threshold: We allow </s> to terminate a hyp if the local score for
      </s> is greater than local_eos_threshold.
    merge_paths: If true, hyps which are identical when epsilons are removed
      will be combined into a single hyp.  The probability for that combined hyp
      will be the sum of the probabilities of the component hyps.  This can only
      be applied for epsilon-emitting models (RNN-T and NT).
    is_last_chunk: A tensor of shape [b * k, 1]. Used by neural transducer
      models to determine whether the current hypothesis has reached the last
      chunk and should treat the next end-of-chunk symbol as end-of-sentence.
    eoc_id: int, the id of the end of chunk (a.k.a epsilon) token used by neural
      transducer models. Only relevant if 'merge_paths' is True or
      'is_last_chunk' is provided.

  Returns:
    out_best_scores: A tensor of shape [b] of updated best scores for each of
      the beams.
    out_cumulative_scores: A tensor of shape [b * k]. The cumulative score of
      the new hyps after the current decoding step.
    out_scores: A tensor of shape [b * k] with scores of the token selected.
    out_eos_scores: A tensor of shape [b * k] with token scores for the EOS, in
      case the hyp was terminated, otherwise 0.0.
    out_hyps: A tensor of shape [b * k] with the ids of the selected tokens.
    out_prev_hyps: A tensor of shape [b * k] with the index of the previous hyp
      from which each token was selected.
    out_done_hyps: A boolean tensor of shape [b * k] indicating whether each
      hyp was terminated.
    out_atten_probs: A tensor of shape [b * k, seq_len] containing the
      attention probabilities over the source words, gathered from the previous
      hyp of each selected token.
    out_eos_atten_probs: A tensor of shape [b * k, seq_len] containing the
      attention probabilities over the source words for hyps terminated at this
      step (zeros otherwise).
    out_all_done: A scalar, whether decoding should terminate for all beams.
    out_histories: A tensor of shape [b * k] containing new history hashes for
      the active hypotheses. See 'update_histories' docstring for details.
  Raises:
    ValueError: if inputs are invalid.
  """
    num_hyps_per_beam = int(num_hyps_per_beam)

    if num_hyps_per_beam <= 0:
        raise ValueError("num_hyps_per_beam = {} and must be > 0.".format(
            num_hyps_per_beam))

    in_scores = tf.convert_to_tensor(in_scores)
    in_scores.shape.assert_has_rank(2)
    num_classes = in_scores.get_shape()[1]

    in_atten_probs = tf.convert_to_tensor(in_atten_probs)
    in_atten_probs.shape.assert_has_rank(2)

    in_best_scores = tf.convert_to_tensor(in_best_scores)
    in_best_scores.shape.assert_has_rank(1)

    in_cumulative_scores = tf.convert_to_tensor(in_cumulative_scores)
    in_cumulative_scores.shape.assert_has_rank(1)

    in_histories = tf.convert_to_tensor(in_histories)
    in_histories.shape.assert_has_rank(1)

    with tf.name_scope("beam_search_step"):
        # For k = num_hyps_per_beam
        # The first stage of each step is to find the top tokens by score.
        # Normally we select k+1, where the extra +1 is to make sure we have k
        # non-eos tokens to select if EOS token is in the top-k. If path merging is
        # on, we actually need to select k+2; this ensures there are k+1 tokens left
        # after the merge, at least k of which are not EOS.
        # TODO(b/118644069): Avoid casts when there is a XLA op available that takes
        # in bfloat16.
        num_candidates_per_input_hyp = (num_hyps_per_beam + 2 if merge_paths
                                        else num_hyps_per_beam + 1)
        # [b * k, num_candidates_per_input_hyp]
        local_score_values, local_indices = xla_ops.top_k_with_unique(
            tf.cast(in_scores, tf.float32), k=num_candidates_per_input_hyp)
        local_score_values = tf.cast(local_score_values, in_scores.dtype)

        # Compute the global score which is sum of the local score, and the
        # cumulative scores for each of the hyps.
        # [b * k, num_candidates_per_input_hyp]
        global_score_values = local_score_values + tf.expand_dims(
            in_cumulative_scores, 1)

        values_dtype = local_score_values.dtype
        is_first_step = tf.cast(tf.equal(cur_step, 0), values_dtype)

        # Preprocessing to reorder the tensor from `mod` sharding to `div` so that
        # we can use matrix/vector operations to complete the beam search.
        # [b * k, num_candidates_per_input_hyp]
        global_score_values = reorder_tensor("mod_to_div", global_score_values,
                                             num_beams, num_hyps_per_beam)
        local_score_values = reorder_tensor("mod_to_div", local_score_values,
                                            num_beams, num_hyps_per_beam)
        local_indices = reorder_tensor("mod_to_div",
                                       local_indices,
                                       num_beams,
                                       num_hyps_per_beam,
                                       max_value=num_classes - 1)
        # [b * k, 1]
        histories = reorder_tensor("mod_to_div",
                                   tf.expand_dims(in_histories, 1), num_beams,
                                   num_hyps_per_beam)
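        # Layout illustration for num_beams=2, num_hyps_per_beam=3
        # (row index -> (beam, hyp)):
        #   mod: [(0,0), (1,0), (0,1), (1,1), (0,2), (1,2)]
        #   div: [(0,0), (0,1), (0,2), (1,0), (1,1), (1,2)]
        # i.e. mod assigns row i to beam i % num_beams, div to beam
        # i // num_hyps_per_beam.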
        if is_last_chunk is None:
            is_last_chunk = tf.zeros([num_beams * num_hyps_per_beam, 1],
                                     tf.bool)
        else:
            is_last_chunk = tf.cast(
                reorder_tensor(
                    "mod_to_div",
                    tf.reshape(is_last_chunk,
                               [num_beams * num_hyps_per_beam, 1]), num_beams,
                    num_hyps_per_beam), tf.bool)

        # For the first step mask everything but the first row.
        # [num_hyps_per_beam]
        per_example_mask = tf.concat([
            tf.constant([1.0], dtype=values_dtype),
            tf.zeros([num_hyps_per_beam - 1], dtype=values_dtype)
        ], 0)
        # Tile to [b * k] (div order), then reshape to [b * k, 1].
        mask = tf.reshape(
            tf.tile(per_example_mask, tf.expand_dims(num_beams, 0)),
            [-1, 1]) * is_first_step + (1.0 - is_first_step)
        local_score_values *= mask
        global_score_values *= mask

        # We add a large negative value for the unmasked values.
        per_example_additive_mask = tf.concat([
            tf.constant([0.0], dtype=values_dtype),
            tf.constant(BEST_SCORES_INIT,
                        shape=[num_hyps_per_beam - 1],
                        dtype=values_dtype)
        ], 0)
        additive_mask = tf.reshape(
            tf.tile(per_example_additive_mask, tf.expand_dims(num_beams, 0)),
            [-1, 1]) * is_first_step
        local_score_values += additive_mask
        global_score_values += additive_mask
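        # E.g. for num_beams=2, num_hyps_per_beam=3 (div order), the
        # multiplicative mask at step 0 is [1, 0, 0, 1, 0, 0]^T and the
        # additive mask pushes the masked rows down to BEST_SCORES_INIT, so
        # only hyp 0 of each beam proposes candidates on the first step.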

        if merge_paths:
            with tf.name_scope("merge_paths"):
                # Compute new history hashes for each hypothesis + new token.
                # [b * k, num_candidates_per_input_hyp]
                histories = update_histories(histories,
                                             local_indices,
                                             mask,
                                             epsilon_id=eoc_id)
                global_score_values, histories = merge_hyps(
                    global_score_values, histories, mask, num_beams,
                    num_hyps_per_beam)

        # As we keep num_candidates_per_input_hyp, we have a total of
        # num_candidates_per_input_hyp * k hyps active per example.
        num_candidate_hyps = num_candidates_per_input_hyp * num_hyps_per_beam
        batch_shape = [-1, num_candidate_hyps]

        # Reshape score values so that each row corresponds to a particular example.
        # [num_beams, num_candidate_hyps]
        global_score_values_batch = tf.reshape(global_score_values,
                                               batch_shape)

        # First, for each beam, find the top 2 * num_hyps_per_beam candidates.
        # The factor of 2 lets us still select non-EOS token ids even when the
        # top-scoring token of every hyp is the EOS token.
        # [num_beams, 2 * num_hyps_per_beam]
        _, candidates_indices_in_top_k = xla_ops.top_k_with_unique(
            tf.cast(global_score_values_batch, tf.float32),
            k=2 * num_hyps_per_beam)
        # Find the previous hyp of each candidate. We divide by
        # num_candidates_per_input_hyp (k+1, or k+2 with path merging) to
        # identify which hyp each token came from.
        hyps_id = candidates_indices_in_top_k // num_candidates_per_input_hyp

        # Add in offset so that we can get the candidate index in the [b * k] space.
        offset = tf.expand_dims(tf.range(num_beams) * num_candidate_hyps, 1)
        flat_candidates_indices_in_top_k = tf.reshape(
            candidates_indices_in_top_k + offset, [-1])

        flat_local_indices = tf.reshape(local_indices, [1, -1])
        flat_token_scores = tf.reshape(local_score_values, [-1, 1])
        flat_global_scores = tf.reshape(global_score_values, [-1, 1])

        # Gather the token scores for each of the 2*k candidates. We use
        # tf.one_hot() followed by tf.matmul() to speed up the gather on TPUs.
        total_num_candidates = num_beams * num_candidate_hyps
        token_scores_for_beam = tf.reshape(
            fast_gather(flat_token_scores, flat_candidates_indices_in_top_k,
                        total_num_candidates),
            [num_beams, 2 * num_hyps_per_beam])
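        # For ids in [0, n), fast_gather(values, ids, n) here behaves like
        # tf.matmul(tf.one_hot(ids, n), values): a dense one-hot matmul in
        # place of a dynamic gather.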
        token_scores_for_beam_shape = tf.shape(token_scores_for_beam)

        global_scores_for_beam = tf.reshape(
            fast_gather(flat_global_scores, flat_candidates_indices_in_top_k,
                        total_num_candidates), token_scores_for_beam_shape)

        # Local index values lie in [0, vocab_size - 1], hence we use the
        # slower variant of the gather.
        token_ids_for_beam = tf.reshape(
            fast_gather(flat_local_indices,
                        flat_candidates_indices_in_top_k,
                        total_num_candidates,
                        max_value=num_classes - 1,
                        axis=1), token_scores_for_beam_shape)

        # We have access to 2*num_hyps_per_beam hyps per beam.
        # We shrink back to num_hyps_per_beam that does not include EOS, and move
        # EOS that occurs in top-num_hyps_per_beam to the EOS done matrix.

        # To determine the threshold at which eos is allowed to terminate a hyp,
        # we need to know the maximum global score for that hyp with any additional
        # token. If path merging is *not* enabled, the global_score_values are
        # by construction in sorted order, so we can just look at its 0th column. If
        # path merging is enabled, the global scores of deleted (merged) hyps break
        # the sorted order, which means we have to do a full reduce_max.
        if merge_paths:
            max_global_score_per_input_hyp = tf.reduce_max(global_score_values,
                                                           axis=1,
                                                           keepdims=True)
        else:
            max_global_score_per_input_hyp = global_score_values[:, 0:1]
        # [num_beams * num_hyps_per_beam, 1]
        global_eos_threshold = (max_global_score_per_input_hyp -
                                valid_eos_max_logit_delta)
        local_eos_threshold_tensor = local_eos_threshold * tf.ones_like(
            global_eos_threshold)

        # Find EOS in top num_hyps_per_beam token ids. We also treat EOC as EOS if
        # the model has indicated this is the last chunk.
        local_index_is_eos = tf.equal(local_indices, eos_id)
        local_index_is_last_chunk_eoc = tf.math.logical_and(
            tf.equal(local_indices, eoc_id), is_last_chunk)
        eos_mask = tf.math.logical_and(
            tf.math.logical_and(
                tf.math.logical_and(
                    tf.greater(
                        local_score_values,
                        tf.tile(local_eos_threshold_tensor,
                                [1, num_candidates_per_input_hyp])),
                    tf.greater(
                        global_score_values,
                        tf.tile(global_eos_threshold,
                                [1, num_candidates_per_input_hyp]))),
                tf.math.logical_or(local_index_is_eos,
                                   local_index_is_last_chunk_eoc)),
            tf.cast(mask, tf.bool))
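        # I.e. a candidate may terminate its hyp iff (1) its local score
        # clears local_eos_threshold, (2) its global score is within
        # valid_eos_max_logit_delta of the best candidate for that hyp, and
        # (3) the token is EOS (or EOC on the last chunk), subject to the
        # first-step mask.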
        end_hyps_bool_mask = tf.reshape(tf.reduce_any(eos_mask, 1), [-1, 1])

        end_hyps_bool_mask = reorder_tensor("div_to_mod", end_hyps_bool_mask,
                                            num_beams, num_hyps_per_beam)

        eos_atten_probs = in_atten_probs * tf.cast(end_hyps_bool_mask,
                                                   in_atten_probs.dtype)
        eos_atten_probs = tf.reshape(eos_atten_probs,
                                     [num_beams * num_hyps_per_beam, -1])
        # A boolean tensor of shape [b * k] indicating whether each hyp was
        # terminated.
        out_done_hyps = tf.reshape(end_hyps_bool_mask, [-1])

        # Scores for EOS token.
        eos_float_mask = tf.cast(eos_mask, values_dtype)
        eos_local_scores = eos_float_mask * local_score_values
        eos_additive_float_mask = (1.0 - eos_float_mask) * BEST_SCORES_INIT
        eos_local_scores += eos_additive_float_mask
        out_eos_scores = tf.reshape(tf.reduce_max(eos_local_scores, 1),
                                    [-1, 1])
        out_eos_scores = tf.reshape(
            reorder_tensor("div_to_mod", out_eos_scores, num_beams,
                           num_hyps_per_beam), [-1])
        # A tensor of shape [b] of updated best scores for each of the beams.
        eos_global_scores = eos_float_mask * global_score_values
        eos_global_scores += eos_additive_float_mask
        best_scores = tf.reduce_max(
            tf.reshape(eos_global_scores, [num_beams, -1]), 1)

        # The following operations find the top num_hyps_per_beam hyps that
        # are still active.

        # Active hyps are those that were not terminated by EOS. We keep
        # num_hyps_per_beam * 2 candidates in case every hyp's top token is
        # the EOS id. Top-k with EOS removed.
        non_eos_mask = tf.not_equal(token_ids_for_beam, eos_id)
        num_candidate_hyps = num_hyps_per_beam * 2 * num_beams
        index = tf.where(
            non_eos_mask,
            tf.reshape(tf.range(num_candidate_hyps, dtype=tf.int32),
                       token_scores_for_beam_shape),
            num_candidate_hyps *
            tf.ones(dtype=tf.int32, shape=token_scores_for_beam_shape))

        # Unrolled TopK.
        sorted_indices = []
        # Find the first num_hyps_per_beam unmasked indices and store them in
        # concated_sorted_indices (shape: [num_beams, num_hyps_per_beam]).
        # This is done by iteratively recording the minimum index in each row
        # and resetting it to the sentinel num_candidate_hyps, so that the
        # next reduce_min returns the next-smallest index.
        for _ in range(num_hyps_per_beam):
            min_index = tf.reshape(tf.reduce_min(index, [1]), [num_beams, 1])
            sorted_indices.append(min_index)
            # Replace position with num_candidate_hyps value.
            index = tf.where(
                tf.equal(index, min_index),
                num_candidate_hyps *
                tf.ones(dtype=tf.int32, shape=token_scores_for_beam_shape),
                index)
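        # E.g. with a single row [7, 3, 9, 5] and num_hyps_per_beam=2: pass 1
        # records the min 3 and overwrites it with the sentinel; pass 2
        # records 5, so concatenation yields [[3, 5]].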

        # Post processing ops to output expected tensors.
        concated_sorted_indices = tf.concat(sorted_indices, 1)
        flat_sorted_indices = tf.reshape(concated_sorted_indices, [-1])

        # A tensor of shape [b * k] with scores of the token selected.
        out_scores = tf.reshape(
            fast_gather(tf.reshape(token_scores_for_beam, [-1, 1]),
                        flat_sorted_indices, num_candidate_hyps), [-1, 1])
        out_scores = tf.reshape(
            reorder_tensor("div_to_mod", out_scores, num_beams,
                           num_hyps_per_beam), [-1])

        # Gather the updated histories of selected hypotheses if path merging is
        # enabled. Otherwise, the histories are unused, so just output in_histories.
        if merge_paths:
            flat_histories = tf.reshape(histories, [-1, 1])
            # [num_beams, 2 * num_hyps_per_beam]
            histories_for_beam = tf.reshape(
                fast_gather(flat_histories, flat_candidates_indices_in_top_k,
                            total_num_candidates), token_scores_for_beam_shape)
            out_histories = tf.reshape(
                fast_gather(tf.reshape(histories_for_beam, [-1, 1]),
                            flat_sorted_indices, num_candidate_hyps), [-1, 1])
            out_histories = tf.reshape(
                reorder_tensor("div_to_mod", out_histories, num_beams,
                               num_hyps_per_beam), [-1])
        else:
            out_histories = in_histories

        prev_hyps_ids = tf.reshape(
            tf.reshape(
                fast_gather(tf.reshape(hyps_id, [1, -1]),
                            flat_sorted_indices,
                            num_candidate_hyps,
                            max_value=num_hyps_per_beam,
                            axis=1), [num_beams, -1]) * num_beams +
            tf.expand_dims(tf.range(num_beams), 1), [-1, 1])

        prev_hyps_ids = reorder_tensor("div_to_mod",
                                       prev_hyps_ids,
                                       num_beams,
                                       num_hyps_per_beam,
                                       max_value=num_hyps_per_beam)
        # A tensor of shape [b * k] with the index of the previous hyp from
        # which each token was selected.
        out_prev_hyps = tf.reshape(prev_hyps_ids, [-1])

        # A tensor of shape [b * k, seq_len] containing the attention
        # probabilities over the source words, gathered from the previous hyps.
        out_atten_probs = tf.reshape(
            fast_gather(in_atten_probs, out_prev_hyps,
                        num_beams * num_hyps_per_beam),
            [num_beams * num_hyps_per_beam, -1])

        sorted_top_k_ids = fast_gather(tf.reshape(token_ids_for_beam, [1, -1]),
                                       flat_sorted_indices,
                                       num_candidate_hyps,
                                       max_value=num_classes - 1,
                                       axis=1)
        sorted_top_k_ids = reorder_tensor("div_to_mod",
                                          sorted_top_k_ids,
                                          num_beams,
                                          num_hyps_per_beam,
                                          max_value=num_classes - 1,
                                          axis=1)

        # A tensor of shape [b * k] with ids of the token selected.
        out_hyps = tf.reshape(sorted_top_k_ids, [-1])

        # A tensor of shape [b * k]. The cumulative score of the selected hyps after
        # the current decoding step.
        out_cumulative_scores = tf.reshape(
            fast_gather(tf.reshape(global_scores_for_beam, [-1, 1]),
                        flat_sorted_indices, num_candidate_hyps), [-1, 1])

        out_cumulative_scores = tf.reshape(
            reorder_tensor("div_to_mod", out_cumulative_scores, num_beams,
                           num_hyps_per_beam), [-1])
        out_best_scores = tf.maximum(best_scores, in_best_scores)

        # A scalar, whether decoding should terminate for all beams.
        out_all_done = tf.reshape(
            tf.math.logical_not(
                tf.reduce_any(
                    tf.greater(
                        out_cumulative_scores,
                        tf.reshape(
                            tf.tile(
                                tf.reshape(out_best_scores - beam_size,
                                           [-1, 1]), [1, num_hyps_per_beam]),
                            [-1])))), [])
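        # Equivalently: all beams are done when no active hyp's cumulative
        # score remains within `beam_size` of its beam's best terminated
        # score.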

        return (out_best_scores, out_cumulative_scores, out_scores,
                out_eos_scores, out_hyps, out_prev_hyps, out_done_hyps,
                out_atten_probs, eos_atten_probs, out_all_done, out_histories)
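# A minimal sketch of the `mod` -> `div` reordering used throughout
# beam_search_step, assuming a rank-2 input; the library's reorder_tensor also
# handles other ranks and the one-hot `max_value` path. Illustrative only.
def _ReorderModToDivSketch(t, num_beams, num_hyps_per_beam):
    """Reorders [num_beams * num_hyps_per_beam, d] rows from mod to div.

    In mod order row i belongs to beam i % num_beams; in div order row i
    belongs to beam i // num_hyps_per_beam.
    """
    d = tf.shape(t)[-1]
    t = tf.reshape(t, [num_hyps_per_beam, num_beams, d])
    t = tf.transpose(t, [1, 0, 2])
    return tf.reshape(t, [num_beams * num_hyps_per_beam, d])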
def EinsumBBmBm(self, a, b, name=None):
     """Portable replacement for tf.einsum('b,bm->bm', a, b)."""
     return tf.math.multiply(tf.expand_dims(a, axis=-1), b, name=name)
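# Sanity sketch: with a = [2., 3.] and b = [[1., 2.], [3., 4.]], both
# EinsumBBmBm and tf.einsum('b,bm->bm', a, b) yield [[2., 4.], [9., 12.]].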