Example #1
  def FProp(self, theta, input_batch):
    p = self.params
    with tf.name_scope(p.name):
      inputs = py_utils.with_dependencies([
          py_utils.assert_shape_match(tf.shape(input_batch.ids), [-1, -1]),
          py_utils.assert_shape_match(
              tf.shape(input_batch.ids), tf.shape(input_batch.paddings))
      ], tf.transpose(input_batch.ids))
      paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2)
      if p.packed_input:
        src_segment_id = tf.expand_dims(
            tf.transpose(input_batch.segment_ids), 2)
      else:
        src_segment_id = None
      xs = self.emb.EmbLookup(theta.emb, inputs)
      xs = self.ApplyClipping(theta, xs)
      summary_utils.histogram('input_emb', xs)
      xs = self.dropout.FProp(theta.dropout, xs)
      ps = paddings
      # Now the rnn layers.
      outputs_list = []
      for i in range(0, p.num_lstm_layers):
        layer = self.rnn[i]
        ys = layer.FProp(theta.rnn[i], xs, ps, segment_id=src_segment_id)
        ys = self.dropout.FProp(theta.dropout, ys)
        if i >= p.residual_start:
          xs += ys  # Residual skip
          xs = self.ApplyClipping(theta, xs)
        else:
          xs = ys
        outputs_list.append(xs)
        summary_utils.histogram('layer_out_%s' % i, xs)

      if p.is_transparent:
        xs = self.transparent_merger.FProp(theta.transparent_merger,
                                           outputs_list)

      if p.lstm_cell_size * 2 != p.encoder_out_dim:
        # Project to the right depth.
        xs = self.final_proj.FProp(theta.final_proj, xs, ps)
        summary_utils.histogram('final_proj_out', xs)

      if src_segment_id is not None:
        src_segment_id = tf.squeeze(src_segment_id, [2])

      return py_utils.NestedMap(
          encoded=xs, padding=tf.squeeze(ps, [2]), segment_id=src_segment_id)
Example #2
    def _update_mask(self, weights, threshold):
        """Updates the mask for a given weight tensor.

    This function first computes the cdf of the weight tensor, and estimates
    the threshold value such that 'desired_sparsity' fraction of weights
    have magnitude less than the threshold.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step
      new_mask: A numpy array of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below
        the threshold

    Raises:
      ValueError: if sparsity is not defined
    """
        if self._sparsity is None:
            raise ValueError('Sparsity variable undefined')

        sparsity = self._get_sparsity(weights.op.name)
        with tf.name_scope(weights.op.name + '_pruning_ops'):
            abs_weights = tf.abs(weights)
            k = tf.cast(
                tf.round(
                    tf.cast(tf.size(abs_weights), tf.float32) *
                    (1 - sparsity)), tf.int32)
            # Sort the entire array
            values, _ = tf.nn.top_k(tf.reshape(abs_weights, [-1]),
                                    k=tf.size(abs_weights))
            # Grab the (k-1) th value
            current_threshold = tf.gather(values, k - 1)
            smoothed_threshold = tf.add_n([
                tf.multiply(current_threshold, 1 - self._spec.threshold_decay),
                tf.multiply(threshold, self._spec.threshold_decay)
            ])

            new_mask = tf.cast(
                tf.greater_equal(abs_weights, smoothed_threshold), tf.float32)

        return smoothed_threshold, new_mask
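Below is a small standalone sketch of the same thresholding idea in plain TensorFlow 2.x (toy weights; the function name and the `desired_sparsity`/`decay` values are illustrative, not part of the pruning library above):

import tensorflow as tf


def estimate_threshold(weights, desired_sparsity, old_threshold, decay=0.9):
  """Returns an EMA-smoothed magnitude threshold and the resulting 0/1 mask."""
  abs_w = tf.reshape(tf.abs(weights), [-1])
  # Keep the top (1 - sparsity) fraction of weights by magnitude.
  k = tf.cast(
      tf.round(tf.cast(tf.size(abs_w), tf.float32) * (1.0 - desired_sparsity)),
      tf.int32)
  values, _ = tf.math.top_k(abs_w, k=tf.size(abs_w))  # sorted descending
  current = tf.gather(values, k - 1)                   # k-th largest magnitude
  smoothed = (1.0 - decay) * current + decay * old_threshold
  mask = tf.cast(tf.abs(weights) >= smoothed, tf.float32)
  return smoothed, mask


w = tf.random.normal([4, 4])
# decay=0.0 disables smoothing, so the one-shot mask hits the target sparsity.
thr, mask = estimate_threshold(w, desired_sparsity=0.5, old_threshold=0.0,
                               decay=0.0)
print(1.0 - float(tf.reduce_mean(mask)))  # ~0.5 of the weights are masked out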
Example #3
  def _FPropLm(self, theta, state0, ids, paddings, misc=None):
    """LM FProp.

    Works for single step or entire seq.

    Args:
      theta: A NestedMap object containing weights for the layer and its
        children.
      state0: A NestedMap of states (specific to the layer).
      ids: Target ids, of shape [batch_size] for single step unrolling or
        [seq_len, batch_size] for the entire sequence.
      paddings: Target paddings, of the same shape as 'ids'.
      misc: NestedMap of miscellaneous items, which might be needed during
        training.

    Returns:
      (lm_output, state1):

      - lm_output: A NestedMap containing lm output. If 'ids' is 1-D, then
        lm_output should have shape [batch_size, dim]; if it is 2-D then the
        shape should be [seq_len, batch_size, dim].
      - state1: A NestedMap of updated states.
    """
    state1 = state0.DeepCopy()
    if isinstance(ids.shape, tf.TensorShape):
      is_single_step = (ids.shape.rank == 1)
    else:
      is_single_step = len(ids.shape) == 1
    if is_single_step:
      seq_len = 1
    else:
      seq_len = tf.shape(ids)[0]

    self._ModifyLmBeforeFProp(theta, state0, ids, paddings, misc)

    with tf.name_scope('lm'):
      ids = tf.reshape(ids, [seq_len, -1], name='reshape_ids')
      paddings = tf.reshape(paddings, [seq_len, -1], name='reshape_paddings')
      lm_output, state1.lm_states = self.lm.FProp(theta.lm, ids, paddings,
                                                  state0.lm_states)

    if is_single_step:
      # lm outputs have dimension [time, batch, dim]. Since this is only one
      # step, remove time dimension.
      lm_output = lm_output.Transform(lambda v: tf.squeeze(v, axis=0))

    return lm_output, state1
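A shape-only sketch of the single-step vs. full-sequence handling above; `toy_lm` is a hypothetical stand-in for `self.lm.FProp` that just maps time-major ids to [time, batch, dim] outputs:

import tensorflow as tf


def toy_lm(ids):
  # Stand-in for the LM: time-major ids -> [time, batch, dim] outputs.
  return tf.one_hot(ids, depth=8)


def fprop_lm(ids):
  is_single_step = (ids.shape.rank == 1)
  seq_len = 1 if is_single_step else tf.shape(ids)[0]
  out = toy_lm(tf.reshape(ids, [seq_len, -1]))  # [time, batch, 8]
  if is_single_step:
    out = tf.squeeze(out, axis=0)               # drop the singleton time axis
  return out


print(fprop_lm(tf.constant([3, 5])).shape)            # (2, 8)
print(fprop_lm(tf.constant([[3, 5], [1, 2]])).shape)  # (2, 2, 8)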
Example #4
    def Inference(self):
        """Computes y = w^T x + b. Returns y and x, as outputs and inputs."""
        # Add a dummy asset file to the ASSET_FILEPATHS collection.
        filename = tf.convert_to_tensor('dummy.txt',
                                        tf.dtypes.string,
                                        name='asset_filepath')
        tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.ASSET_FILEPATHS,
                                       filename)

        with tf.name_scope('inference'):
            x = tf.placeholder(dtype=tf.float32, name='input')
            r = tf.random.stateless_uniform([3],
                                            seed=py_utils.GenerateStepSeedPair(
                                                self.params,
                                                self.theta.global_step))
            y = tf.reduce_sum((self.vars.w + r) * x) + self.vars.b
            return {'default': ({'output': y}, {'input': x})}
Example #5
def factorized_pool(input_tensor,
                    window_shape,
                    pooling_type,
                    strides,
                    padding,
                    name=None):
    """Performs m x n pooling through a combination of 1xm and 1xn pooling.

  Args:
    input_tensor: Input tensor. Must be rank 2
    window_shape: Pooling window shape
    pooling_type: Either 'MAX' or 'AVG'
    strides: The stride of the pooling window
    padding: 'SAME' or 'VALID'.
    name: Name of the op

  Returns:
    A rank 2 tensor containing the pooled output

  Raises:
    ValueError: if the input tensor is not rank 2
  """
    if input_tensor.get_shape().ndims != 2:
        raise ValueError('factorized_pool() accepts tensors of rank 2 only')

    [height, width] = input_tensor.get_shape()
    with tf.name_scope(name, 'factorized_pool'):
        input_tensor_aligned = tf.reshape(input_tensor, [1, 1, height, width],
                                          name=input_tensor.op.name +
                                          '_aligned')

        height_pooling = tf.nn.pool(input_tensor_aligned,
                                    window_shape=[1, window_shape[0]],
                                    pooling_type=pooling_type,
                                    strides=[1, strides[0]],
                                    padding=padding)
        swap_height_width = tf.transpose(height_pooling, perm=[0, 1, 3, 2])

        width_pooling = tf.nn.pool(swap_height_width,
                                   window_shape=[1, window_shape[1]],
                                   pooling_type=pooling_type,
                                   strides=[1, strides[1]],
                                   padding=padding)

    return tf.squeeze(tf.transpose(width_pooling, perm=[0, 1, 3, 2]),
                      axis=[0, 1])
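As a quick sanity check of the factorization (a sketch, not part of the library): for 'MAX' pooling with 'VALID' padding, pooling 1x2 along one axis, swapping the spatial axes, and pooling 1x2 along the other reproduces direct 2x2 pooling on a toy tensor.

import tensorflow as tf

x = tf.random.normal([6, 8])          # rank-2 input, as factorized_pool expects
x4 = tf.reshape(x, [1, 6, 8, 1])      # NHWC layout for tf.nn.pool

direct = tf.nn.pool(x4, window_shape=[2, 2], pooling_type='MAX',
                    strides=[2, 2], padding='VALID')

# Factorized: pool 1x2 along the width, swap the spatial axes, pool 1x2 again.
rows = tf.nn.pool(x4, window_shape=[1, 2], pooling_type='MAX',
                  strides=[1, 2], padding='VALID')
cols = tf.nn.pool(tf.transpose(rows, [0, 2, 1, 3]), window_shape=[1, 2],
                  pooling_type='MAX', strides=[1, 2], padding='VALID')
factorized = tf.transpose(cols, [0, 2, 1, 3])

tf.debugging.assert_near(direct, factorized)  # identical for MAX / VALID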
Example #6
 @contextlib.contextmanager
 def _SelfVariableScope(self, params=None, enter_name_scope=True):
     """Internal. Used to ensure the same variable & name scopes are used."""
     if not hasattr(self, '_self_variable_scope'):
         params = params or self.params
         self._parent_variable_scope = tf.get_variable_scope()
         with tf.variable_scope(py_utils.SanitizeScopeKey(
                 params.name)) as scope:
             self._self_variable_scope = scope
     with contextlib.ExitStack() as stack:
         stack.enter_context(
             tf.variable_scope(self._self_variable_scope,
                               auxiliary_name_scope=False))
         if enter_name_scope:
             stack.enter_context(
                 tf.name_scope(
                     self._self_variable_scope.original_name_scope))
         yield stack
Example #7
        def Step(recurrent_theta, state0, inputs):
            """Computes one decoder step."""
            del inputs
            with tf.name_scope('single_sampler_step'):
                # Compute logits and states.
                bs_result, bs_state1 = pre_step_callback(
                    decoder_theta,
                    recurrent_theta.encoder_outputs,
                    tf.expand_dims(state0.ids, 1),  # [batch, 1].
                    state0.bs_state,
                    num_hyps_per_beam=1)
                batch = tf.shape(bs_result.log_probs)[0]
                state1 = py_utils.NestedMap(timestep=state0.timestep + 1)
                state1.logits = bs_result.log_probs

                if p.top_k > 0:
                    topk_logits, topk_ids = tf.math.top_k(state1.logits,
                                                          k=p.top_k)
                    sample_logits = tf.nn.log_softmax(
                        topk_logits) if p.top_k_renormalize else topk_logits
                else:
                    sample_logits = state1.logits

                # Sample ids from logits. [batch].
                ids = tf.reshape(
                    tf.random.stateless_categorical(
                        sample_logits / p.temperature,
                        num_samples=1,
                        seed=tf.stack(
                            [recurrent_theta.random_seed, state0.timestep]),
                        dtype=state0.ids.dtype,
                        name='sample_next_id'), [batch])
                state1.ids = tf.gather(topk_ids, ids, axis=1,
                                       batch_dims=1) if p.top_k > 0 else ids

                if 'is_last_chunk' in bs_result and p.target_eoc_id >= 0:
                    state1.ids = tf.where(
                        tf.math.logical_and(
                            bs_result.is_last_chunk,
                            tf.equal(state1.ids, p.target_eoc_id)),
                        tf.fill(tf.shape(state1.ids), p.target_eos_id),
                        state1.ids)
                state1.bs_state = post_step_callback(
                    decoder_theta, recurrent_theta.encoder_outputs, state1.ids,
                    bs_state1)
            return state1, py_utils.NestedMap()
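A standalone sketch of the top-k / temperature sampling used in Step above (toy logits; the seed values and the renormalization choice are illustrative):

import tensorflow as tf

logits = tf.random.normal([4, 32])              # [batch, vocab]
top_k, temperature = 8, 0.7

topk_logits, topk_ids = tf.math.top_k(logits, k=top_k)
sample_logits = tf.nn.log_softmax(topk_logits)  # optional renormalization

# Sample an index into the top-k set, then map it back to a vocab id.
ids = tf.random.stateless_categorical(
    sample_logits / temperature, num_samples=1, seed=[1, 2])  # [batch, 1]
ids = tf.gather(topk_ids, tf.reshape(ids, [-1]), axis=1, batch_dims=1)
print(ids.shape)  # (4,)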
Example #8
 def FProp(self,
           theta,
           source_vecs,
           source_paddings,
           target_vecs,
           target_paddings,
           source_segment_id,
           target_segment_id,
           transparent_acc,
           transparent_acc_helper,
           source_task_id=None,
           target_task_id=None):
   with tf.name_scope(self.params.name):
     return _common_gpipe_transformer_encoder_fprop(
         self, GPipeEvolvedTransformerEncoderLayer, theta, source_vecs,
         source_paddings, target_vecs, target_paddings, source_segment_id,
         target_segment_id, None, None, source_task_id, target_task_id)
Example #9
def TraverseLayer(layer, fn):
    """Traverses the layer tree and invokes fn(node) on each node.

  Args:
    layer: a BaseLayer.
    fn: a function of layer -> None, invoked as fn(layer) on each node.
  """
    if isinstance(layer, (list, tuple)):
        for layer_i in layer:
            TraverseLayer(layer_i, fn)
        return

    with tf.name_scope(layer.params.name):
        fn(layer)
        # Traverse all children in alphabetical order.
        for _, child in sorted(layer.children.items()):
            TraverseLayer(child, fn)
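A toy usage sketch of the traversal (the `_ToyLayer` stand-in is hypothetical; real callers pass Lingvo BaseLayer instances, which expose `params.name` and `children` the same way):

class _ToyLayer:
  """Minimal stand-in exposing the attributes TraverseLayer reads."""

  def __init__(self, name, **children):
    self.params = type('Params', (), {'name': name})()
    self.children = children


root = _ToyLayer('encoder', emb=_ToyLayer('emb'), rnn=_ToyLayer('rnn'))
names = []
TraverseLayer(root, lambda layer: names.append(layer.params.name))
print(names)  # ['encoder', 'emb', 'rnn']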
Example #10
    def Inference(self):
        """Builds the inference graph.

    Default subgraph should return:

      predicted_bboxes: A [batch_size, num_boxes, 7] float Tensor.

      classification_scores: A [batch_size, num_boxes, num_classes] float
      Tensor.

    Returns:
      A dictionary whose values are a tuple of fetches and feeds.
    """
        p = self.params
        subgraphs = {}
        with tf.name_scope('inference'):
            input_placeholders = self._Placeholders()
            predictions = self.ComputePredictions(self.theta,
                                                  input_placeholders)
            bboxes_and_logits = self._BBoxesAndLogits(input_placeholders,
                                                      predictions)
            predicted_bboxes = bboxes_and_logits.predicted_bboxes
            classification_logits = bboxes_and_logits.classification_logits
            classification_scores = tf.sigmoid(classification_logits)

            _, per_cls_bboxes, per_cls_bbox_scores, per_cls_valid_mask = (
                detection_decoder.DecodeWithNMS(
                    predicted_bboxes,
                    classification_scores,
                    nms_iou_threshold=p.nms_iou_threshold,
                    score_threshold=p.nms_score_threshold,
                    max_boxes_per_class=p.max_nms_boxes,
                    use_oriented_per_class_nms=p.use_oriented_per_class_nms))
            per_cls_bbox_scores *= per_cls_valid_mask

            # TODO(vrv): Fix the inference graph for KITTI, since we need
            # to apply frustum clipping.  This requires customizing the
            # inference placeholders for each model.
            fetches = {
                'per_class_predicted_bboxes': per_cls_bboxes,
                'per_class_predicted_bbox_scores': per_cls_bbox_scores,
                'per_class_valid_mask': per_cls_valid_mask
            }
            subgraphs['default'] = fetches, dict(
                input_placeholders.FlattenItems())
        return subgraphs
Example #11
    def ZeroState(self, theta, prepared_inputs, batch_size):
        """Creates a zero state NestedMap for this step.

    Args:
      theta: variables used by sub-steps.
      prepared_inputs: Output from a call to PrepareExternalInputs.
      batch_size: The number of items in the batch that FProp will process.

    Returns:
      A NestedMap of ZeroState results for each sub-step.
    """
        state0 = py_utils.NestedMap()
        with tf.name_scope(self.params.name):
            for seq in self._seq:
                state0[seq.name] = seq.step.ZeroState(
                    theta[seq.name], prepared_inputs[seq.name], batch_size)
        return state0
Example #12
    def StreamStep(self, theta, inputs, paddings, state0):
        """Runs single step.

    Args:
      theta: A NestedMap of layer params.
      inputs: [b, 1, d].
      paddings: A 0/1 valued tensor of shape [b, 1].
      state0: A NestedMap of tensors of the same struct as returned by
        zero_state().

    Returns:
      output: A tensor of the same shape as the inputs.
      paddings: The same as the input paddings.
      state1: A NestedMap of tensors of the same struct as state0.
    """
        p = self.params
        assert p.is_causal

        state1 = py_utils.NestedMap()
        with tf.name_scope(f'{p.name}/StreamStep'):
            unnormalized_inputs = inputs

            inputs = self.ln.FProp(theta.ln, inputs)
            inputs = self.linear_start.FProp(theta.linear_start, inputs)

            inputs = self._GLU(inputs)

            # TODO(jamesqin): introduce depthwise conv2d with 3d inputs.
            # TODO(jamesqin): optimize DepthwiseConv1D.StreamStep()
            # [b, t, d] --> [b, t, 1, d]
            inputs = tf.expand_dims(inputs, 2)
            # [b, t, 1, d]
            inputs, paddings, conv_state1 = self.depthwise_conv1d.StreamStep(
                theta.depthwise_conv1d, inputs, paddings, state0.conv_state)
            state1.conv_state = conv_state1
            # [b, t, d]
            inputs = self._NormalizeStep(theta, inputs, paddings, state0,
                                         state1)

            inputs = self._ApplyActivation(inputs, p.conv_activation)

            inputs = self.linear_end.FProp(theta.linear_end, inputs)
            inputs = self.dropout.FProp(theta.dropout, inputs)

            output = inputs + unnormalized_inputs
            return output, paddings, state1
Example #13
  def FProp(self, theta, input_batch, state0=None):
    p = self.params
    src_segment_id = None
    with tf.name_scope(p.name):
      # Reshape to [t, b]
      inputs = py_utils.with_dependencies([
          py_utils.assert_shape_match(tf.shape(input_batch.ids), [-1, -1]),
          py_utils.assert_shape_match(
              tf.shape(input_batch.ids), tf.shape(input_batch.paddings))
      ], tf.transpose(input_batch.ids))
      paddings = tf.expand_dims(tf.transpose(input_batch.paddings), 2)

      # Setup streaming states.
      if not state0:
        state0 = self.zero_state(theta, tf.shape(inputs)[1])
      state1 = py_utils.NestedMap(rnn=[None] * p.num_lstm_layers)

      xs = self.emb.EmbLookup(theta.emb, inputs)
      xs = self.ApplyClipping(theta, xs)
      summary_utils.histogram('input_emb', xs)
      xs = self.dropout.FProp(theta.dropout, xs)
      ps = paddings
      # Now the rnn layers.
      outputs_list = []
      for i in range(0, p.num_lstm_layers):
        layer = self.rnn[i]
        ys, state1.rnn[i] = layer.FProp(
            theta.rnn[i], xs, ps, state0=state0.rnn[i])
        ys = self.dropout.FProp(theta.dropout, ys)
        if i >= p.residual_start:
          xs += ys  # Residual skip
          xs = self.ApplyClipping(theta, xs)
        else:
          xs = ys
        outputs_list.append(xs)
        summary_utils.histogram('layer_out_%s' % i, xs)

      if p.is_transparent:
        xs = self.transparent_merger.FProp(theta.transparent_merger,
                                           outputs_list)

      return py_utils.NestedMap(
          encoded=xs,
          padding=tf.squeeze(ps, [2]),
          segment_id=src_segment_id,
          state=state1)
Example #14
 def FProp(self, theta, source_vecs, source_paddings, target_vecs,
           target_paddings, source_segment_id, target_segment_id, labels,
           label_weights, transparent_acc, transparent_acc_helper):
     p = self.params
     with tf.name_scope(p.name):
         if p.has_aux_atten:  # Decoder FProp
             return _common_gpipe_transformer_decoder_fprop(
                 self, GPipeTransformerLayer, theta, source_vecs,
                 source_paddings, target_vecs, target_paddings,
                 source_segment_id, target_segment_id, labels,
                 label_weights, transparent_acc, transparent_acc_helper)
         else:  # Encoder FProp
             return _common_gpipe_transformer_encoder_fprop(
                 self, GPipeTransformerLayer, theta, source_vecs,
                 source_paddings, target_vecs, target_paddings,
                 source_segment_id, target_segment_id, labels,
                 label_weights, transparent_acc, transparent_acc_helper)
Example #15
 def Inference(self):
   if py_utils.use_tpu():
     raise NotImplementedError('TPU is not supported.')
   with tf.name_scope('inference'):
     feed1 = tf.placeholder(name='feed1_node', dtype=tf.float32, shape=[1])
     fetch1 = tf.identity(feed1, name='fetch1_node')
     return {
         'default': (
             py_utils.NestedMap({
                 'fetch1': fetch1,
                 'fetch_op': fetch1.op,  # Tests that ops are supported.
             }),
             py_utils.NestedMap({
                 'feed1': feed1,
             })),
         'unused': (py_utils.NestedMap({}), py_utils.NestedMap({})),
     }
Example #16
  def FProp(self, theta, inputs, *args):
    p = self.params
    with tf.name_scope(p.name) as scope:
      expert_dist = self._GetExpertDist(theta, inputs, *args)
      if not self.do_eval:
        summary_utils.histogram('soft_cond_{}'.format(scope), expert_dist)

      # Excludes non-variable extra_theta like global_step.
      var_set = set([key for key, _ in self.body.vars.FlattenItems()])
      values = []
      for key, value in theta.body.FlattenItems():
        if key in var_set and value is not None:
          # Weighted average for all variables created in the body layer.
          value = tf.einsum('i,i...->...', expert_dist, value)
        values.append(value)
      weighted_theta = theta.body.Pack(values)
      return self.body.FProp(weighted_theta, inputs, *args)
Example #17
    def _InferenceSubgraph_Default(self):
        """Constructs graph for offline inference.

    Returns:
      (fetches, feeds) where both fetches and feeds are dictionaries. Each
      dictionary consists of keys corresponding to tensor names, and values
      corresponding to a tensor in the graph which should be input/read from.
    """
        p = self.params
        with tf.name_scope('default'):
            # TODO(laurenzo): Once the migration to integrated frontends is complete,
            # this model should be upgraded to use the MelAsrFrontend in its
            # params vs relying on pre-computed feature generation and the inference
            # special casing.
            wav_bytes = tf.placeholder(dtype=tf.string, name='wav')
            frontend = self.frontend if p.frontend else None
            if not frontend:
                # No custom frontend. Instantiate the default.
                frontend_p = asr_frontend.MelAsrFrontend.Params()
                frontend = frontend_p.Instantiate()

            # Decode the wave bytes and use the explicit frontend.
            unused_sample_rate, audio = audio_lib.DecodeWav(wav_bytes)
            audio *= 32768
            # Remove channel dimension, since we have a single channel.
            audio = tf.squeeze(audio, axis=1)
            # Add batch.
            audio = tf.expand_dims(audio, axis=0)
            input_batch_src = py_utils.NestedMap(src_inputs=audio,
                                                 paddings=tf.zeros_like(audio))
            input_batch_src = frontend.FPropDefaultTheta(input_batch_src)

            encoder_outputs = self.encoder.FPropDefaultTheta(input_batch_src)
            decoder_outputs = self.decoder.BeamSearchDecode(encoder_outputs)
            topk = self._GetTopK(decoder_outputs)

            feeds = {'wav': wav_bytes}
            fetches = {
                'hypotheses': topk.decoded,
                'scores': topk.scores,
                'src_frames': input_batch_src.src_inputs,
                'encoder_frames': encoder_outputs.encoded
            }

            return fetches, feeds
Example #18
    def FProp(self, theta, external_inputs, step_inputs, padding, state0):
        """A single inference step for this step graph.

    Args:
      theta: variables used by sub-steps.
      external_inputs: A NestedMap containing external_inputs that were
        pre-processed by the PrepareExternalInputs method of each sub-step. The
        keys are the names of the sub-steps.
      step_inputs: A NestedMap of [batch, ...] tensors. The structure of this
        depends on the graph implementation.
      padding: A 0/1 float tensor of shape [batch_size]; 1.0 means that this
        batch element is empty in this step.
      state0: A NestedMap of state variables produced by either ZeroState or a
        previous invocation of this FProp step. The keys are the names of the
        sub-steps.

    Returns:
      (output, state1), both of which are NestedMaps.
      output is implementation-dependent and is defined by the output_signature
      parameter.
      state1 is a NestedMap where the keys are names of sub-steps and the values
      are state outputs from their FProp methods.
    """
        p = self.params
        graph_tensors = builder_layers.GraphTensors()
        graph_tensors.StoreTensor('external_inputs', external_inputs)
        graph_tensors.StoreTensor('step_inputs', step_inputs)
        state1 = py_utils.NestedMap()
        with tf.name_scope(p.name):
            for seq in self._seq:
                tf.logging.vlog(1, 'GraphStep: call %s', seq.name)
                external = None
                if seq.external_signature:
                    external = external_inputs[seq.name]
                template = py_utils.NestedMap(inputs=seq.signature.inputs)
                packed = template.Transform(graph_tensors.GetTensor)
                input_args = packed.inputs[0]
                out, seq_state1 = seq.step.FProp(theta[seq.name], external,
                                                 input_args, padding,
                                                 state0[seq.name])
                graph_tensors.StoreTensor(seq.signature.outputs[0], out)
                state1[seq.name] = seq_state1
        template = py_utils.NestedMap(inputs=self.output_signature.inputs)
        output_tensors = template.Transform(graph_tensors.GetTensor).inputs[0]
        return output_tensors, state1
Example #19
    def FProp(self, theta, inputs, paddings=None):
        """Apply batch normalization.

    Args:
      theta: A `.NestedMap` object containing weights' values of this layer and
        its children layers.
      inputs: The inputs tensor.  Shaped [..., dim].
      paddings: The paddings tensor.  Shaped [..., 1], with the same rank as the
        input tensor.

    Returns:
      Output after applying batch normalization, with the same shape as
      'inputs'.
    """
        p = self.params
        if paddings is None:
            paddings = self._GetDefaultPaddings(inputs)
        with tf.name_scope(p.name):
            norm_mean, norm_variance, beta, gamma = self.ComputeAndUpdateMoments(
                theta, inputs, paddings)
            with tf.control_dependencies([
                    py_utils.assert_greater_equal(
                        norm_variance, tf.zeros_like(norm_variance)),
                    py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                                tf.shape(norm_mean)),
                    py_utils.assert_shape_match([tf.shape(inputs)[-1]],
                                                tf.shape(norm_variance)),
            ]):
                if p.use_fused_batch_norm_for_eval and self.do_eval:
                    bn_output, _, _ = nn.fused_batch_norm(inputs,
                                                          gamma,
                                                          beta,
                                                          norm_mean,
                                                          norm_variance,
                                                          self._epsilon,
                                                          is_training=False)
                else:
                    bn_output = tf.nn.batch_normalization(
                        inputs, norm_mean, norm_variance, beta, gamma,
                        self._epsilon)

                if p.set_padded_output_to_zero:
                    bn_output *= 1.0 - paddings

            return bn_output
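For reference, the non-fused branch is just the elementwise normalization formula; a minimal sketch (toy tensor) showing that `tf.nn.batch_normalization` matches (x - mean) / sqrt(var + eps) * gamma + beta:

import tensorflow as tf

x = tf.random.normal([4, 8])
mean, var = tf.nn.moments(x, axes=[0])
gamma, beta, eps = tf.ones([8]), tf.zeros([8]), 1e-3

out = tf.nn.batch_normalization(x, mean, var, beta, gamma, eps)
manual = (x - mean) * tf.math.rsqrt(var + eps) * gamma + beta
tf.debugging.assert_near(out, manual)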
Example #20
    def StreamStep(self, theta, inputs, paddings, state0):
        """Apply a single step of convolution to input_tensor.

    Only supports 1d causal convolution. Doesn't support dilation.

    Args:
      theta: A NestedMap of layer params.
      inputs: A Tensor of shape [b, t, 1, c]
      paddings: A 0/1 valued tensor of shape [b, t].
      state0: A NestedMap of tensors of the same struct as returned by
        zero_state().

    Returns:
      outputs: A Tensor of shape [b, t, 1, c * channel_multiplier]
      padding: the same as input paddings.
      state1: A NestedMap of the same struct as input state
    """
        p = self.params
        assert p.filter_shape[1] == 1, (
            'StreamStep only supports 1d causal convolution.')
        assert p.filter_stride[0] == 1, (
            'StreamStep doesn\'t support striding')
        assert p.dilation_rate == (1, 1), (
            'StreamStep doesn\'t support dilation')

        with tf.name_scope(p.name):
            inputs = py_utils.HasShape(inputs, [-1, -1, 1, p.filter_shape[2]])
            paddings = py_utils.HasShape(paddings,
                                         py_utils.GetShape(inputs)[:2])
            q = py_utils.GetShape(paddings)[1]

            padded_inputs = py_utils.ApplyPadding(
                py_utils.AppendDims(paddings, 2), inputs)

            concat_inputs = tf.concat([state0.context, padded_inputs], axis=1)
            outputs = tf.nn.depthwise_conv2d(concat_inputs,
                                             self._GetWeight(theta),
                                             strides=(1, 1, 1, 1),
                                             dilations=(1, 1),
                                             data_format='NHWC',
                                             padding='VALID')
            if p.bias:
                outputs = tf.nn.bias_add(outputs, theta.b)
            new_context = concat_inputs[:, q:]
            return outputs, paddings, py_utils.NestedMap(context=new_context)
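A standalone sketch (toy shapes, plain TensorFlow) of why carrying the last filter_size - 1 frames as state reproduces a full causal depthwise convolution, which is what StreamStep relies on:

import tensorflow as tf

b, t, c, k = 2, 5, 3, 3                  # batch, time, channels, filter size
x = tf.random.normal([b, t, 1, c])       # [b, t, 1, c], as in StreamStep
w = tf.random.normal([k, 1, c, 1])       # depthwise filter, multiplier 1

# Full-sequence causal conv: left-pad time with (k - 1) zeros, then VALID conv.
full = tf.nn.depthwise_conv2d(
    tf.pad(x, [[0, 0], [k - 1, 0], [0, 0], [0, 0]]), w,
    strides=[1, 1, 1, 1], padding='VALID')

# Streaming: keep the last (k - 1) frames as state, one frame per call.
state = tf.zeros([b, k - 1, 1, c])
steps = []
for i in range(t):
  ctx = tf.concat([state, x[:, i:i + 1]], axis=1)  # [b, k, 1, c]
  steps.append(tf.nn.depthwise_conv2d(ctx, w, strides=[1, 1, 1, 1],
                                      padding='VALID'))
  state = ctx[:, 1:]                               # drop the oldest frame

tf.debugging.assert_near(full, tf.concat(steps, axis=1))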
Example #21
    def _FProp(self, theta, inputs, paddings):
        p = self.params

        with tf.name_scope(p.name):
            inputs = self.fflayer_start.FProp(theta.fflayer_start, inputs,
                                              paddings)
            if p.layer_order == 'mhsa_before_conv':
                inputs, paddings = self._SelfAtten(theta, inputs, paddings)
                inputs, paddings = self._LConv(theta, inputs, paddings)
            else:
                assert p.layer_order == 'conv_before_mhsa'
                inputs, paddings = self._LConv(theta, inputs, paddings)
                inputs, paddings = self._SelfAtten(theta, inputs, paddings)
            inputs = self.fflayer_end.FProp(theta.fflayer_end, inputs,
                                            paddings)

            inputs = self.final_ln.FProp(theta.final_ln, inputs)
            return inputs, paddings
Example #22
 def FProp(self, theta, source_id, source_paddings, target_id,
           target_paddings, source_segment_id, target_segment_id, labels,
           label_weights, source_pos_id, target_pos_id):
     p = self.params
     with tf.name_scope(p.name):
         source_vecs = self.GetEmbeddings(
             theta.src_token_emb, self.src_token_emb, theta.src_pos_emb,
             self.src_pos_emb, theta.src_dropout, self.src_dropout,
             source_id, source_pos_id)
         target_vecs = None
         if p.add_tgt_embedding_layer:
             target_vecs = self.GetEmbeddings(
                 theta.tgt_token_emb, self.tgt_token_emb, theta.tgt_pos_emb,
                 self.tgt_pos_emb, theta.tgt_dropout, self.tgt_dropout,
                 target_id, target_pos_id)
         return (source_vecs, source_paddings, target_vecs, target_paddings,
                 source_segment_id, target_segment_id, labels,
                 label_weights, None, None)
Example #23
    def Value(self):
        p = self.params
        with tf.name_scope(p.name):
            steps = self._best_step
            best_step = steps[0]
            last_step = steps[1]

            ref_step = tf.maximum(self.theta.ref_step, best_step)
            f = self.theta.cur_factor

            # Decay if no improvement within window.
            new_factor = tf.where(last_step - ref_step < p.window, f,
                                  tf.maximum(p.min_factor, f * p.decay))
            # Update ref_step if we decayed.
            new_step = tf.where(tf.equal(new_factor, f), ref_step, last_step)
            update_step = tf.assign(self.vars.ref_step, new_step)
            with tf.control_dependencies([update_step]):
                return tf.assign(self.vars.cur_factor, new_factor)
Example #24
  def _unrolled_fprop(self, theta, *args):
    p = self.params
    fprop_inputs = args
    with tf.name_scope(p.name):
      for layer_idx in range(p.repeat):
        if p.per_layer_vars:
          layer_theta = theta['body_iter_%05d' % layer_idx]
        else:

          def _Slice(t, idx=layer_idx):
            return t[idx]

          layer_theta = tf.nest.map_structure(_Slice, theta.body)
        fprop_outputs = self._body.FProp(layer_theta, *fprop_inputs)
        fprop_outputs = _ToTuple(fprop_outputs)
        assert len(fprop_outputs) == len(fprop_inputs)
        fprop_inputs = fprop_outputs
      return fprop_outputs[0] if len(fprop_outputs) == 1 else fprop_outputs
Example #25
    def _check_paddings(self, paddings):
        with tf.name_scope('check_paddings'):
            unpacked_paddings = tf.unstack(paddings)

            non_decr = []
            for t in unpacked_paddings:
                non_d = tf.is_non_decreasing(t)
                non_decr.append(non_d)
            all_non_decr = tf.stack(non_decr)

            paddings = py_utils.with_dependencies([
                tf.assert_equal(tf.reduce_any(tf.equal(paddings, 0.0)),
                                True,
                                message='must have at least one zero value.'),
                tf.assert_equal(
                    all_non_decr, True, message='must be non-decreasing')
            ], paddings)
            return paddings
Example #26
 def Inference(self):
     with tf.name_scope('inference'):
         feed1 = tf.placeholder(name='feed1_node',
                                dtype=tf.float32,
                                shape=[1])
         fetch1 = tf.identity(feed1, name='fetch1_node')
         feed2 = tf.placeholder(name='feed2_node',
                                dtype=tf.float32,
                                shape=[2])
         fetch2 = tf.identity(feed2, name='fetch2_node')
         inference_graph = inference_graph_pb2.InferenceGraph()
         subgraph = inference_graph.subgraphs['default']
         subgraph.feeds['feed1'] = feed1.name
         subgraph.fetches['fetch1'] = fetch1.name
         subgraph = inference_graph.subgraphs['subgraph2']
         subgraph.feeds['feed1'] = feed2.name
         subgraph.fetches['fetch1'] = fetch2.name
         return inference_graph
Example #27
    def Apply(self, lr, var_grad):
        """Applies the gradient to the variable.

    Args:
      lr: A scalar or callable that returns the base learning rate.
      var_grad: A `.NestedMap` of (var, grad) pairs.

    Returns:
      The variable update op.

    Raises:
      RuntimeError: When `lr` is not a callable in Eager mode.
    """

        # In Graph mode, always re-create the optimizer to remain consistent with
        # the old logic for the Graph trainer.
        # TODO(jiaweix): Recreating optimizers in Graph mode seems unnecessary.
        if self._optimizer is None or not py_utils.IsEagerMode():
            self._optimizer = self.GetOptimizer(lr)

        def _Apply():
            return self._optimizer.apply_gradients(
                [(g, v) for (v, g) in var_grad.Flatten()],
                name='meta_backprop')

        clear_variable_scope = self.params.clear_variable_scope
        if clear_variable_scope is None:
            clear_variable_scope = not py_utils.IsEagerMode()
        if clear_variable_scope:
            # Many optimizers, e.g., Adam, Adagrad, etc., create
            # variables. We need to ensure name scope and variable scope are
            # cleared. Otherwise, tpu.batch_parallel does not work.
            with tf.name_scope(None):
                with tf.variable_scope(
                        tf.VariableScope(use_resource=True,
                                         reuse=self.VarReuseForSlotVars())):
                    var_update_op = _Apply()
        else:
            var_update_op = _Apply()

        if self.params.add_summary_in_apply:
            lr_value = GetLrValue(lr)
            self.AddSummary(lr_value, self._optimizer, var_grad)
        return var_update_op
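The one subtle point above is that var_grad stores (var, grad) pairs while apply_gradients expects (grad, var), hence the swap in _Apply. A hedged sketch with a plain dict standing in for the NestedMap:

import tensorflow as tf

v = tf.Variable([1.0, 2.0])
var_grad = {'v': (v, tf.constant([0.1, 0.1]))}  # toy stand-in for the NestedMap

opt = tf.keras.optimizers.SGD(learning_rate=0.5)
opt.apply_gradients([(g, var) for (var, g) in var_grad.values()])
print(v.numpy())  # [0.95 1.95]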
Example #28
    def FProp(self, theta, inputs, paddings):
        """Builds FProp graph.

    Args:
      theta: A NestedMap of Tensors, see base class.
      inputs: A Tensor of shape [batch, seqlen, dim0].
      paddings: A Tensor of shape [batch, seqlen].

    Returns:
      output: A Tensor of shape [batch, seqlen, dim0].
      out_paddings: A Tensor of shape [batch, seqlen].
    """

        p = self.params
        with tf.name_scope(p.name):
            unnormalized_inputs = inputs

            inputs = self.ln.FProp(theta.ln, inputs)
            if p.split_act_gated_linear_start:
                act_inputs = self.linear_start_act.FProp(
                    theta.linear_start_act, inputs)
                gated_inputs = self.linear_start_gated.FProp(
                    theta.linear_start_gated, inputs)
            else:
                inputs = self.linear_start.FProp(theta.linear_start, inputs)
                gated_inputs, act_inputs = tf.split(inputs, 2, axis=-1)
            inputs = self._GLU(gated_inputs, act_inputs)

            # TODO(jamesqin): introduce depthwise conv2d with 3d inputs.
            # [b, t, d] --> [b, t, 1, d]
            inputs = tf.expand_dims(inputs, 2)
            theta.depthwise_conv1d.w = moe_layers.Split(
                theta.depthwise_conv1d.w, 2, p.xla_num_partitions)
            inputs, paddings = self.depthwise_conv1d.FProp(
                theta.depthwise_conv1d, inputs, paddings)
            inputs = self._Normalize(theta, inputs, paddings)

            inputs = self._ApplyActivation(inputs, p.conv_activation)

            inputs = self.linear_end.FProp(theta.linear_end, inputs)
            inputs = self.dropout.FProp(theta.dropout, inputs)

            output = inputs + unnormalized_inputs
            return output, paddings
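The gated linear unit used above splits the projection into an activation half and a gating half; a minimal sketch of the usual formulation (not necessarily the exact `_GLU` configured here, which may apply a different activation to the non-gated half):

import tensorflow as tf


def glu(gated_inputs, act_inputs):
  # Gate the activations with a sigmoid of the gating half.
  return act_inputs * tf.sigmoid(gated_inputs)


x = tf.random.normal([2, 7, 16])        # [batch, time, 2 * d]
gated, act = tf.split(x, 2, axis=-1)    # two [batch, time, d] halves
print(glu(gated, act).shape)            # (2, 7, 8)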
Example #29
            def TpuEvalStep(*args):
                """Eval a shard of a batch on a single TPU core.

        Args:
          *args: metrics values from previous steps.

        Returns:
          Summed eval metrics.
        """
                with tf.name_scope('tpu_eval'):
                    with py_utils.OpportunisticVariableReuseScope(True):
                        self._model.InstantiateVariables()
                        self._model.ConstructFPropGraph()
                    per_step_eval_metrics = self._eval_metrics.SetMetrics(
                        self._task.eval_metrics, args)
                    summed_metrics = []
                    for x, y in zip(per_step_eval_metrics, args):
                        summed_metrics.append(x + y)
                    return summed_metrics
Example #30
 def FProp(self, theta, *args):
   p = self.params
   with tf.name_scope(p.name):
     tf.logging.vlog(1, 'layer %s', self.params.name)
     if p.repeat <= 1:
       for (name, ch) in self._seq:
         th = theta[name]
         args = _ToTuple(args)
         tf.logging.vlog(1, 'SequentialLayer: call %s %s %d %s',
                         ch.params.name, ch, len(args), str(args))
         args = ch.FProp(th, *args)
     else:
       for (ch, th) in zip(self.rep, theta.rep):
         args = _ToTuple(args)
         tf.logging.vlog(1, '  call %s %s %d %s', ch.params.name, ch,
                         len(args), str(args))
         args = ch.FProp(th, *args)
     args = _ToTuple(args)
     return args[0] if len(args) == 1 else args