Example #1
    def __init__(self, master, component_spec, attr_defaults=None):
        """Initializes the ComponentBuilder from specifications.

    Args:
      master: dragnn.MasterBuilder object.
      component_spec: dragnn.ComponentSpec proto to be built.
      attr_defaults: Optional dict of component attribute defaults.  If not
          provided or if empty, attributes are not extracted.
    """
        self.master = master
        self.num_actions = component_spec.num_actions
        self.name = component_spec.name
        self.spec = component_spec
        self.moving_average = None

        # Determine if this component should apply self-normalization.
        self.eligible_for_self_norm = (
            not self.master.hyperparams.self_norm_components_filter
            or self.name
            in self.master.hyperparams.self_norm_components_filter.split(','))

        # Extract component attributes before make_network(), so the network unit
        # can access them.
        self._attrs = {}
        global_attr_defaults = {
            'locally_normalize': False,
            'output_as_probabilities': False
        }
        if attr_defaults:
            global_attr_defaults.update(attr_defaults)
        self._attrs = network_units.get_attrs_with_defaults(
            self.spec.component_builder.parameters, global_attr_defaults)
        do_local_norm = self._attrs['locally_normalize']
        self._output_as_probabilities = self._attrs['output_as_probabilities']
        with tf.variable_scope(self.name):
            self.training_beam_size = tf.constant(self.spec.training_beam_size,
                                                  name='TrainingBeamSize')
            self.inference_beam_size = tf.constant(
                self.spec.inference_beam_size, name='InferenceBeamSize')
            self.locally_normalize = tf.constant(do_local_norm,
                                                 name='LocallyNormalize')
            self._step = tf.get_variable('step', [],
                                         initializer=tf.zeros_initializer(),
                                         dtype=tf.int32)
            self._total = tf.get_variable('total', [],
                                          initializer=tf.zeros_initializer(),
                                          dtype=tf.int32)

        # Construct network variables.
        self.network = self.make_network(self.spec.network_unit)

        # Construct moving average.
        if self.master.hyperparams.use_moving_average:
            self.moving_average = tf.train.ExponentialMovingAverage(
                decay=self.master.hyperparams.average_weight,
                num_updates=self._step)
            self.avg_ops = [self.moving_average.apply(self.network.params)]

        # Used to export the cell; see add_cell_input() and add_cell_output().
        self._cell_subgraph_spec = export_pb2.CellSubgraphSpec()
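
Note the precedence in the attribute extraction above: built-in global defaults are overridden by the caller's attr_defaults, and both are overridden by whatever the ComponentSpec itself sets. A minimal sketch of that precedence using plain dicts (the spec_parameters values are hypothetical, and the coercion of string values to the defaults' types is an assumption about get_attrs_with_defaults):

global_attr_defaults = {'locally_normalize': False, 'output_as_probabilities': False}
attr_defaults = {'output_as_probabilities': True}      # hypothetical caller override
spec_parameters = {'locally_normalize': 'true'}        # proto map values are strings

merged_defaults = dict(global_attr_defaults)
merged_defaults.update(attr_defaults)
# network_units.get_attrs_with_defaults(spec_parameters, merged_defaults) would then be
# expected to return {'locally_normalize': True, 'output_as_probabilities': True}.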
Example #2
  def __init__(self, component):
    """Initializes layers.

    Args:
      component: Parent ComponentBuilderBase object.
    """
    layers = [
        network_units.Layer(self, 'lengths', -1),
        network_units.Layer(self, 'scores', -1),
        network_units.Layer(self, 'logits', -1),
        network_units.Layer(self, 'arcs', -1),
    ]
    super(MstSolverNetwork, self).__init__(component, init_layers=layers)

    self._attrs = network_units.get_attrs_with_defaults(
        component.spec.network_unit.parameters,
        defaults={
            'forest': False,
            'loss': 'softmax',
            'crf_max_dynamic_range': 20,
        })

    check.Eq(
        len(self._fixed_feature_dims.items()), 0, 'Expected no fixed features')
    check.Eq(
        len(self._linked_feature_dims.items()), 2,
        'Expected two linked features')

    check.In('lengths', self._linked_feature_dims,
             'Missing required linked feature')
    check.In('scores', self._linked_feature_dims,
             'Missing required linked feature')
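
The check.Eq and check.In calls above encode the solver's input contract: no fixed features, and exactly two linked features named 'lengths' and 'scores'. The same contract written as plain assertions, with a made-up feature-dimension dict for illustration:

def validate_mst_solver_features(fixed_feature_dims, linked_feature_dims):
  # Mirrors the check.* preconditions in MstSolverNetwork.__init__ above.
  assert not fixed_feature_dims, 'Expected no fixed features'
  assert len(linked_feature_dims) == 2, 'Expected two linked features'
  assert 'lengths' in linked_feature_dims, 'Missing required linked feature'
  assert 'scores' in linked_feature_dims, 'Missing required linked feature'

validate_mst_solver_features({}, {'lengths': -1, 'scores': -1})  # passes silently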
Example #3
    def __init__(self, master, component_spec, attr_defaults=None):
        """Initializes the ComponentBuilder from specifications.

    Args:
      master: dragnn.MasterBuilder object.
      component_spec: dragnn.ComponentSpec proto to be built.
      attr_defaults: Optional dict of component attribute defaults.  If not
          provided or if empty, attributes are not extracted.
    """
        self.master = master
        self.num_actions = component_spec.num_actions
        self.name = component_spec.name
        self.spec = component_spec
        self.moving_average = None

        # Determine if this component should apply self-normalization.
        self.eligible_for_self_norm = (
            not self.master.hyperparams.self_norm_components_filter
            or self.name
            in self.master.hyperparams.self_norm_components_filter.split(','))

        # Extract component attributes before make_network(), so the network unit
        # can access them.
        self._attrs = {}
        if attr_defaults:
            self._attrs = network_units.get_attrs_with_defaults(
                self.spec.component_builder.parameters, attr_defaults)

        with tf.variable_scope(self.name):
            self.locally_normalize = tf.constant(False,
                                                 name='LocallyNormalize')
            self._step = tf.get_variable('step', [],
                                         initializer=tf.zeros_initializer(),
                                         dtype=tf.int32)
            self._total = tf.get_variable('total', [],
                                          initializer=tf.zeros_initializer(),
                                          dtype=tf.int32)

        # Construct network variables.
        self.network = self.make_network(self.spec.network_unit)

        # Construct moving average.
        if self.master.hyperparams.use_moving_average:
            self.moving_average = tf.train.ExponentialMovingAverage(
                decay=self.master.hyperparams.average_weight,
                num_updates=self._step)
            self.avg_ops = [self.moving_average.apply(self.network.params)]
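
The eligible_for_self_norm expression reads: with an empty self_norm_components_filter every component is eligible, otherwise only the components named in the comma-separated filter are. The predicate in isolation (the function and argument names are illustrative, not part of the library):

def eligible_for_self_norm(component_name, components_filter):
  # Empty filter means "all components"; otherwise require explicit membership.
  return (not components_filter or
          component_name in components_filter.split(','))

assert eligible_for_self_norm('tagger', '')
assert eligible_for_self_norm('tagger', 'tagger,parser')
assert not eligible_for_self_norm('tagger', 'parser')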
Example #4
  def __init__(self, master, component_spec, attr_defaults=None):
    """Initializes the ComponentBuilder from specifications.

    Args:
      master: dragnn.MasterBuilder object.
      component_spec: dragnn.ComponentSpec proto to be built.
      attr_defaults: Optional dict of component attribute defaults.  If not
          provided or if empty, attributes are not extracted.
    """
    self.master = master
    self.num_actions = component_spec.num_actions
    self.name = component_spec.name
    self.spec = component_spec
    self.moving_average = None

    # Determine if this component should apply self-normalization.
    self.eligible_for_self_norm = (
        not self.master.hyperparams.self_norm_components_filter or self.name in
        self.master.hyperparams.self_norm_components_filter.split(','))

    # Extract component attributes before make_network(), so the network unit
    # can access them.
    self._attrs = {}
    if attr_defaults:
      self._attrs = network_units.get_attrs_with_defaults(
          self.spec.component_builder.parameters, attr_defaults)

    with tf.variable_scope(self.name):
      self.training_beam_size = tf.constant(
          self.spec.training_beam_size, name='TrainingBeamSize')
      self.inference_beam_size = tf.constant(
          self.spec.inference_beam_size, name='InferenceBeamSize')
      self.locally_normalize = tf.constant(False, name='LocallyNormalize')
      self._step = tf.get_variable(
          'step', [], initializer=tf.zeros_initializer(), dtype=tf.int32)
      self._total = tf.get_variable(
          'total', [], initializer=tf.zeros_initializer(), dtype=tf.int32)

    # Construct network variables.
    self.network = self.make_network(self.spec.network_unit)

    # Construct moving average.
    if self.master.hyperparams.use_moving_average:
      self.moving_average = tf.train.ExponentialMovingAverage(
          decay=self.master.hyperparams.average_weight, num_updates=self._step)
      self.avg_ops = [self.moving_average.apply(self.network.params)]
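
tf.train.ExponentialMovingAverage keeps a decayed shadow copy of each parameter in self.network.params; passing num_updates=self._step lowers the effective decay early in training (TF 1.x documents this as min(decay, (1 + num_updates) / (10 + num_updates))). A plain-Python sketch of one update, treating that formula as an assumption:

def ema_update(shadow, value, decay, num_updates):
  # Assumed TF behavior: dampen the decay while num_updates is still small.
  effective_decay = min(decay, (1.0 + num_updates) / (10.0 + num_updates))
  return effective_decay * shadow + (1.0 - effective_decay) * value

shadow = 0.0
for step in range(4):
  shadow = ema_update(shadow, value=1.0, decay=0.9999, num_updates=step)
print(shadow)  # converges toward 1.0 far faster than a fixed 0.9999 decay would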
Example #5
  def __init__(self, component, additional_attr_defaults=None):
    """Initializes the LSTM base class.

    Parameters used:
      hidden_layer_sizes: Comma-delimited number of hidden units for each layer.
      input_dropout_rate (-1.0): Input dropout rate for each layer.  If < 0.0,
          use the global |dropout_rate| hyperparameter.
      recurrent_dropout_rate (0.8): Recurrent dropout rate.  If < 0.0, use the
          global |recurrent_dropout_rate| hyperparameter.
      layer_norm (True): Whether or not to use layer norm.

    Hyperparameters used:
      dropout_rate: Input dropout rate.
      recurrent_dropout_rate: Recurrent dropout rate.

    Args:
      component: parent ComponentBuilderBase object.
      additional_attr_defaults: Additional attributes for use by derived class.
    """
    attr_defaults = additional_attr_defaults or {}
    attr_defaults.update({
        'layer_norm': True,
        'input_dropout_rate': -1.0,
        'recurrent_dropout_rate': 0.8,
        'hidden_layer_sizes': '256',
    })
    self._attrs = dragnn.get_attrs_with_defaults(
        component.spec.network_unit.parameters,
        defaults=attr_defaults)

    # Materialize the sizes so they can be indexed and reused (map() is lazy in Python 3).
    self._hidden_layer_sizes = list(
        map(int, self._attrs['hidden_layer_sizes'].split(',')))

    self._input_dropout_rate = self._attrs['input_dropout_rate']
    if self._input_dropout_rate < 0.0:
      self._input_dropout_rate = component.master.hyperparams.dropout_rate

    self._recurrent_dropout_rate = self._attrs['recurrent_dropout_rate']
    if self._recurrent_dropout_rate < 0.0:
      self._recurrent_dropout_rate = (
          component.master.hyperparams.recurrent_dropout_rate)
    if self._recurrent_dropout_rate < 0.0:
      self._recurrent_dropout_rate = component.master.hyperparams.dropout_rate

    tf.logging.info('[%s] input_dropout_rate=%s recurrent_dropout_rate=%s',
                    component.name, self._input_dropout_rate,
                    self._recurrent_dropout_rate)

    layers, context_layers = self.create_hidden_layers(component,
                                                       self._hidden_layer_sizes)
    last_layer_dim = layers[-1].dim
    layers.append(
        dragnn.Layer(component, name='last_layer', dim=last_layer_dim))
    layers.append(
        dragnn.Layer(component, name='logits', dim=component.num_actions))

    # Provide initial layers and context layers, so the base class constructor
    # can safely use accessors like get_layer_size().
    super(BaseLSTMNetwork, self).__init__(
        component, init_layers=layers, init_context_layers=context_layers)

    # Allocate parameters for the softmax.
    self._params.append(
        tf.get_variable(
            'weights_softmax', [last_layer_dim, component.num_actions],
            initializer=tf.random_normal_initializer(stddev=1e-4)))
    self._params.append(
        tf.get_variable(
            'bias_softmax', [component.num_actions],
            initializer=tf.zeros_initializer()))
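
Two conventions above are easy to miss: hidden_layer_sizes is a comma-delimited string parsed into per-layer sizes, and any dropout attribute below 0.0 falls back to the corresponding global hyperparameter (recurrent dropout falls back twice, ending at dropout_rate). A sketch of both with hypothetical values:

def resolve_rate(*candidates):
  # Return the first candidate rate that is >= 0.0 (mimics the cascaded fallbacks above).
  for rate in candidates:
    if rate >= 0.0:
      return rate
  return 1.0  # illustrative: keep everything if nothing is configured

hidden_layer_sizes = [int(d) for d in '256,128'.split(',')]  # -> [256, 128]
input_dropout = resolve_rate(-1.0, 0.7)                      # attr unset -> global dropout_rate
recurrent_dropout = resolve_rate(0.8, -1.0, 0.7)             # attr set -> 0.8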
Example #6
  def MakeAttrs(self, defaults, key=None, value=None):
    """Returns attrs based on the |defaults| and one |key|,|value| override."""
    spec = spec_pb2.RegisteredModuleSpec()
    if key and value:
      spec.parameters[key] = value
    return network_units.get_attrs_with_defaults(spec.parameters, defaults)
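
Because RegisteredModuleSpec.parameters is a string-to-string map, the |value| override always arrives as a string, and get_attrs_with_defaults is responsible for coercing it back to the type of the matching default. A proto-free sketch of how a call to this helper is expected to behave; the inline coercion step is an assumption about the library, not its actual implementation:

def make_attrs_sketch(defaults, key=None, value=None):
  # Stand-in for MakeAttrs with a plain dict in place of spec.parameters.
  parameters = {key: value} if key and value else {}
  attrs = dict(defaults)
  for k, v in parameters.items():  # assumed get_attrs_with_defaults behavior
    default = defaults[k]
    attrs[k] = (v.lower() == 'true') if isinstance(default, bool) else type(default)(v)
  return attrs

assert make_attrs_sketch({'kernel': 1}) == {'kernel': 1}
assert make_attrs_sketch({'kernel': 1}, 'kernel', '3') == {'kernel': 3}
assert make_attrs_sketch({'layer_norm': True}, 'layer_norm', 'false') == {'layer_norm': False}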
Example #7
  def __init__(self, component):
    """Initializes parameters for this Transformer unit.

    Args:
      component: parent ComponentBuilderBase object.

    Parameters used to construct the network:
      num_layers: number of transformer layers (attention + MLP)
      hidden_size: size of hidden layers in MLPs
      filter_size: filter width for each attention head
      num_heads: number of attention heads
      residual_drop: dropout keep rate for residual layers
      attention_drop: dropout keep rate for attention weights
      mlp_drop: dropout keep rate for MLP layers
      initialization: initialization scheme to use for model parameters
      bias_init: initial value for bias parameters
      scale_attention: whether to scale attention parameters by filter_size^-0.5
      layer_norm_residuals: whether to perform layer normalization on residual
        layers
      timing_signal: whether to add a position-wise timing signal to the input
      kernel: kernel width in middle MLP layers
      mlp_layers: number of MLP layers. Must be >= 2.

    Raises:
      ValueError: if mlp_layers < 2.

    The input depth of the first layer is inferred from the total concatenated
    size of the input features, minus 1 to account for the sequence lengths.

    Hyperparameters used:
      dropout_rate: The probability that an input is not dropped. This is the
          default for the |residual_drop|, |attention_drop|, and |mlp_drop|
          parameters when they are unset.
    """

    super(TransformerEncoderNetwork, self).__init__(component)
    default_dropout_rate = component.master.hyperparams.dropout_rate
    self._attrs = network_units.get_attrs_with_defaults(
        component.spec.network_unit.parameters, defaults={
            'num_layers': 4,
            'hidden_size': 256,
            'filter_size': 64,
            'num_heads': 8,
            'residual_drop': default_dropout_rate,
            'attention_drop': default_dropout_rate,
            'mlp_drop': default_dropout_rate,
            'initialization': 'xavier',
            'bias_init': 0.001,
            'scale_attention': True,
            'layer_norm_residuals': True,
            'timing_signal': True,
            'kernel': 1,
            'mlp_layers': 2})

    self._num_layers = self._attrs['num_layers']
    self._hidden_size = self._attrs['hidden_size']
    self._filter_size = self._attrs['filter_size']
    self._num_heads = self._attrs['num_heads']
    self._residual_dropout = self._attrs['residual_drop']
    self._attention_dropout = self._attrs['attention_drop']
    self._mlp_dropout = self._attrs['mlp_drop']
    self._initialization = self._attrs['initialization']
    self._bias_init = self._attrs['bias_init']
    self._scale_attn = self._attrs['scale_attention']
    self._layer_norm_res = self._attrs['layer_norm_residuals']
    self._timing_signal = self._attrs['timing_signal']
    self._kernel = self._attrs['kernel']
    self._mlp_depth = self._attrs['mlp_layers']

    if self._mlp_depth < 2:
      raise ValueError('TransformerEncoderNetwork needs mlp_layers >= 2')

    self._combined_filters = self._num_heads * self._filter_size

    self._weights = []
    self._biases = []
    self._layer_norms = {}

    # Hacky: one dimension comes from the lengths input; subtract it.
    self._concatenated_input_dim -= 1

    # Initial projection of inputs, this is mainly to project input down to the
    # right size for residual layers
    proj_shape = [1, 1, self._concatenated_input_dim, self._combined_filters]
    self._weights.append(
        network_units.add_var_initialized('init_proj', proj_shape,
                                          self._initialization))
    self._biases.append(tf.get_variable('init_bias',
                                        self._combined_filters,
                                        initializer=tf.constant_initializer(
                                            self._bias_init),
                                        dtype=tf.float32))

    for i in range(self._num_layers):
      with tf.variable_scope('transform_%d' % i):
        # Attention weights: 3 * self.combined_filters = (q, k, v)
        # We assume that q, k and v all have the same dimension
        attn_shape = [1, 1, self._combined_filters, 3 * self._combined_filters]
        self._weights.append(
            network_units.add_var_initialized('attn_weights',
                                              attn_shape,
                                              self._initialization))

        # Attention final projection weights
        proj_shape = [1, 1, self._combined_filters, self._combined_filters]
        self._weights.append(
            network_units.add_var_initialized('proj_weights',
                                              proj_shape,
                                              self._initialization))

        # MLP weights
        with tf.variable_scope('mlp'):
          ff_shape = [1, 1, self._combined_filters, self._hidden_size]
          self._weights.append(
              network_units.add_var_initialized('ff_weights_0',
                                                ff_shape,
                                                self._initialization))
          ff_shape = [1, self._kernel, self._hidden_size, self._hidden_size]
          for j in range(1, self._mlp_depth - 1):
            self._weights.append(
                network_units.add_var_initialized('ff_weights_%d' % j,
                                                  ff_shape,
                                                  self._initialization))
          ff_shape = [1, 1, self._hidden_size, self._combined_filters]
          self._weights.append(
              network_units.add_var_initialized('ff_weights_%d' %
                                                (self._mlp_depth - 1),
                                                ff_shape,
                                                self._initialization))

        # Layer normalization for residual layers
        if self._layer_norm_res:
          attn_layer_norm = network_units.LayerNorm(component,
                                                    'attn_layer_norm_%d' % i,
                                                    self._combined_filters,
                                                    tf.float32)
          self._layer_norms['attn_layer_norm_%d' % i] = attn_layer_norm

          ff_layer_norm = network_units.LayerNorm(component,
                                                  'ff_layer_norm_%d' % i,
                                                  self._combined_filters,
                                                  tf.float32)
          self._layer_norms['ff_layer_norm_%d' % i] = ff_layer_norm

          # Layer norm parameters are not added to self._weights,
          # which means that they are not l2 regularized
          self._params.extend(attn_layer_norm.params + ff_layer_norm.params)

    self._params.extend(self._weights)
    self._params.extend(self._biases)
    self._regularized_weights.extend(self._weights)
    self._layers.append(
        network_units.Layer(component, name='transformer_output',
                            dim=self._combined_filters))
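
Each transformer layer projects its combined_filters-wide input to queries, keys, and values with a single 1x1 convolution, which is why the attention weight shape ends in 3 * combined_filters, and the MLP expands to hidden_size before projecting back to the residual width. A quick shape check using the defaults above (plain Python, no TensorFlow needed):

num_heads, filter_size, hidden_size, kernel = 8, 64, 256, 1   # defaults from the snippet
combined_filters = num_heads * filter_size                    # 512

attn_shape = [1, 1, combined_filters, 3 * combined_filters]   # fused q, k, v projection
proj_shape = [1, 1, combined_filters, combined_filters]       # post-attention projection
ff_first = [1, 1, combined_filters, hidden_size]              # MLP input projection
ff_middle = [1, kernel, hidden_size, hidden_size]             # repeated mlp_layers - 2 times
ff_last = [1, 1, hidden_size, combined_filters]               # back to the residual width

print(attn_shape)  # [1, 1, 512, 1536]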