Exemple #1
0
def CausalAttention(d_feature, n_heads=1,
                    d_attention_key=None, d_attention_value=None,
                    attention_type=DotProductCausalAttention, mode='train'):
  """Builds the layer list for multi-headed causal self-attention.

  The input is duplicated twice, giving three copies. Each copy goes through
  its own `ComputeAttentionHeads` projection (two of depth `d_attention_key`,
  one of depth `d_attention_value`), then through `attention_type` and
  finally `ComputeAttentionOutput`.

  Args:
    d_feature: int: dimensionality of feature embedding; also passed as
        `d_model` to the output layer.
    n_heads: int: number of attention heads.
    d_attention_key: int: depth of key vector for each attention head
        (when None, d_feature // n_heads is used).
    d_attention_value: int: depth of value vector for each attention head
        (when None, d_feature // n_heads is used).
    attention_type: subclass of tl.BaseCausalAttention: attention class to
        use; it is instantiated with `mode=mode`.
    mode: str: 'train' or 'eval'.

  Returns:
    A list of layers implementing multi-headed causal self-attention.
  """
  if d_attention_key is None or d_attention_value is None:
    # A defaulted depth is an even split of the features across heads, so
    # the split has to be exact.
    assert d_feature % n_heads == 0
    even_split = d_feature // n_heads
    if d_attention_key is None:
      d_attention_key = even_split
    if d_attention_value is None:
      d_attention_value = even_split

  triplicate = [cb.Dup(), cb.Dup()]
  # One projection per copy -- presumably query, key, value in that order.
  head_depths = (d_attention_key, d_attention_key, d_attention_value)
  per_copy_heads = cb.Parallel(*[
      ComputeAttentionHeads(n_heads=n_heads, d_head=depth)
      for depth in head_depths
  ])
  return triplicate + [
      per_copy_heads,
      attention_type(mode=mode),
      ComputeAttentionOutput(n_heads=n_heads, d_model=d_feature),
  ]
Exemple #2
0
def Attention(d_feature, n_heads=1, dropout=0.0, mode='train'):
  """Builds the layer list for multi-headed self-attention over (x, mask).

  The top stack item x is duplicated twice so that `AttentionQKV` receives
  three copies of it to use as (q, k, v).

  Args:
    d_feature: int: dimensionality of feature embedding.
    n_heads: int: number of attention heads.
    dropout: float: dropout rate.
    mode: str: 'train' or 'eval'.

  Returns:
    A list of layers yielding the multi-headed self-attention result and the
    mask.
  """
  triplicate_x = [cb.Dup(), cb.Dup()]
  qkv_attention = AttentionQKV(
      d_feature, n_heads=n_heads, dropout=dropout, mode=mode)
  return triplicate_x + [qkv_attention]
Exemple #3
0
    def test_dup(self):
        """Dup doubles the top shape, alone or above further stack entries."""
        dup = cb.Dup()
        single = ((3, 2), )
        doubled = ((3, 2), (3, 2))

        # Stack holding only the item to duplicate.
        self.assertEqual(base.check_shape_agreement(dup, single), doubled)

        # Same item sitting on top of extra entries, which must come back
        # unchanged after the duplicated pair.
        deeper_input = single + _REST_OF_STACK
        deeper_expected = doubled + _REST_OF_STACK
        self.assertEqual(
            base.check_shape_agreement(dup, deeper_input), deeper_expected)
Exemple #4
0
def MultiHeadedAttention(
    d_feature, n_heads=8, dropout=0.0, mode='train'):
  """Builds the layer list for multi-headed self-attention over (x, mask).

  The top stack item x is duplicated twice so that `MultiHeadedAttentionQKV`
  receives three copies of it to use as (q, k, v).

  Args:
    d_feature: int: dimensionality of feature embedding.
    n_heads: int: number of attention heads.
    dropout: float: dropout rate.
    mode: str: 'train' or 'eval'.

  Returns:
    A list of layers implementing multi-headed self-attention.
  """
  triplicate_x = [combinators.Dup(), combinators.Dup()]
  qkv_attention = MultiHeadedAttentionQKV(  # pylint: disable=no-value-for-parameter
      d_feature, n_heads=n_heads, dropout=dropout, mode=mode)
  return triplicate_x + [qkv_attention]
Exemple #5
0
def CausalAttention(d_feature, n_heads=1, dropout=0.0, mode='train'):
    """Builds the layer list for multi-headed causal self-attention.

    Accepts a single input x; (q, k, v) and the causal mask are all
    constructed from x.

    Args:
        d_feature: int: dimensionality of feature embedding.
        n_heads: int: number of attention heads.
        dropout: float: dropout rate.
        mode: str: 'train' or 'eval'.

    Returns:
        A list of layers implementing multi-headed causal self-attention.
    """
    # TODO(jonni,lukaszkaiser): standardize and improve layer comments.
    copy_input = cb.Dup()
    # Leave the first copy alone and turn the second into a causal mask.
    attach_mask = cb.Parallel([], CausalMask(axis=-2))  # pylint: disable=no-value-for-parameter
    attend = Attention(d_feature, n_heads=n_heads, dropout=dropout, mode=mode)
    # Keep the first stack item and drop the second (presumably the mask,
    # which Attention is documented to pass through).
    discard_second = cb.Parallel([], cb.Drop())
    return [copy_input, attach_mask, attend, discard_second]
def MaskedScalar(metric_layer, mask_id=None, has_weights=False):
    """Wraps a metric layer so it yields a mask-weighted scalar.

    Args:
        metric_layer: layer mapping (inputs, targets) to a metric.
        mask_id: value forwarded to `WeightMask` to build the weight mask
            from the targets.
        has_weights: when true, an extra stage multiplies the weight mask
            with the next stack item (the given weights) before averaging.

    Returns:
        A `cb.Serial` layer ending in `WeightedMean`.
    """
    # (inputs, targets) --> (inputs, targets, targets)
    duplicate_targets = cb.Parallel([], cb.Dup())
    # (inputs, targets, targets) --> (metric, weight-mask)
    score_and_mask = cb.Parallel(
        metric_layer,
        WeightMask(mask_id=mask_id),  # pylint: disable=no-value-for-parameter
    )

    stages = [[duplicate_targets, score_and_mask]]
    if has_weights:
        # Combine the given weights with the mask_id weights.
        stages.append(cb.Parallel([], cb.Multiply()))
    # Reduce (metric, weights) to the weighted mean.
    stages.append(WeightedMean())  # pylint: disable=no-value-for-parameter
    return cb.Serial(stages)
Exemple #7
0
def GeneralGRUCell(candidate_transform,
                   memory_transform_fn=None,
                   gate_nonlinearity=core.Sigmoid,
                   candidate_nonlinearity=core.Tanh,
                   dropout_rate_c=0.1,
                   sigmoid_bias=0.5):
    r"""Constructs a parametrized Gated Recurrent Unit (GRU) cell.

    GRU update equations:
    $$ Update gate: u_t = \sigmoid(U' * s_{t-1} + B') $$
    $$ Reset gate: r_t = \sigmoid(U'' * s_{t-1} + B'') $$
    $$ Candidate memory: c_t = \tanh(U * (r_t \odot s_{t-1}) + B) $$
    $$ New State: s_t = u_t \odot s_{t-1} + (1 - u_t) \odot c_t $$

    See combinators.Gate for details on the gating function.

    Args:
        candidate_transform: Zero-argument factory for the transform layer.
            It is called three times: once per gate and once inside the
            candidate branch, always before the nonlinearity.
        memory_transform_fn: Optional zero-argument factory for a
            transformation applied to the memory before gating. When falsy,
            the memory branch is an empty layer list.
        gate_nonlinearity: Factory for the gate activation. Allows trying
            alternatives to Sigmoid, such as HardSigmoid.
        candidate_nonlinearity: Factory for the nonlinearity applied after
            the candidate transform. Allows trying alternatives to the
            traditional Tanh, such as HardTanh.
        dropout_rate_c: Dropout rate on the candidate (c) branch. Dropout
            works best in a GRU when applied exclusively to this branch.
        sigmoid_bias: Constant added before each gate nonlinearity.
            Generally want to start off with a positive bias.

    Returns:
        A model representing a GRU cell with the specified transforms.
    """

    def _biased_gate():
        # transform -> add sigmoid_bias -> gate nonlinearity
        return [
            candidate_transform(),
            core.AddConstant(constant=sigmoid_bias),
            gate_nonlinearity(),
        ]

    update_gate = _biased_gate()  # u_t
    reset_gate = _biased_gate()  # r_t

    candidate_branch = [
        cb.Dup(),
        reset_gate,
        cb.Multiply(),  # r_t * S{t-1}: the state gated by the reset gate
        candidate_transform(),  # Final projection ...
        candidate_nonlinearity(),  # ... + nonlinearity to get c_t
        # Dropout is applied only on this branch. Paper reports 0.1 as a
        # good default.
        core.Dropout(rate=dropout_rate_c),
    ]

    if memory_transform_fn:
        memory_branch = memory_transform_fn()
    else:
        memory_branch = []

    # Three copies of the state feed the memory, update-gate and candidate
    # branches; Gate then combines the three results.
    return cb.Model(
        cb.Dup(),
        cb.Dup(),
        cb.Parallel(memory_branch, update_gate, candidate_branch),
        cb.Gate(),
    )
 def test_parallel_dup_dup(self):
     """Two Dups in parallel each duplicate their own stack item."""
     side_by_side = cb.Parallel(cb.Dup(), cb.Dup())
     shapes_in = ((3, 2), (4, 7))
     # Each input shape appears twice, in the original order.
     shapes_out = ((3, 2), (3, 2), (4, 7), (4, 7))
     self.assertEqual(
         base.check_shape_agreement(side_by_side, shapes_in), shapes_out)
 def test_serial_dup_dup(self):
     """Two Dups in series turn one item into three copies."""
     back_to_back = cb.Serial(cb.Dup(), cb.Dup())
     shape_in = (3, 2)
     # The single input shape comes back three times.
     shapes_out = ((3, 2), (3, 2), (3, 2))
     self.assertEqual(
         base.check_shape_agreement(back_to_back, shape_in), shapes_out)