Example No. 1
#
# Layers take inputs, compute functions or custom calculations, and return outputs.
#
# You can also inspect layer properties. Let me show you some examples.
#

# %% [markdown]
# ### Relu Layer
# First, let's see how to build a ReLU activation function as a layer. A layer like this is one of the simplest types. Notice there is no object initialization, so it works just like a math function.
#
# **Note: Activation functions are also layers in Trax, which might look odd if you have been using other frameworks for a while.**

# %% tags=[]
import numpy as np

from trax import layers as tl

# Create a ReLU Trax layer.
relu = tl.Relu()

# Inspect properties
print("-- Properties --")
print("name :", relu.name)
print("expected inputs :", relu.n_in)
print("promised outputs :", relu.n_out, "\n")

# Inputs
x = np.array([-2, -1, 0, 1, 2])
print("-- Inputs --")
print("x :", x, "\n")

# Outputs
y = relu(x)
print("-- Outputs --")
print("y :", y)
Example No. 2
def ReformerShortenLM(vocab_size,
                      shorten_factor=1,
                      d_embedding=256,
                      d_model=512,
                      d_ff=2048,
                      d_attention_key=64,
                      d_attention_value=64,
                      n_layers=6,
                      n_heads=8,
                      dropout=0.1,
                      max_len=2048,
                      n_attention_chunks=1,
                      attention_type=tl.DotProductCausalAttention,
                      share_qk=False,
                      axial_pos_shape=(),
                      d_axial_pos_embs=None,
                      ff_activation=tl.FastGelu,
                      ff_use_sru=0,
                      ff_chunk_size=0,
                      mode='train'):
  """Reversible transformer language model with shortening.

  When shorten_factor is F and processing an input of shape [batch, length],
  we embed the (shifted-right) input and then group each F elements (on length)
  into a single vector -- so that in the end we process a tensor of shape
    [batch, length // F, d_model]
  almost until the end -- at the end it's un-shortened and an SRU is applied.
  This reduces the length processed inside the main model body, effectively
  making the model faster but possibly slightly less accurate.

  Args:
    vocab_size: int: vocab size
    shorten_factor: by how much to shorten, see above
    d_embedding: the depth of the embedding layer and final logits
    d_model: int:  depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    n_attention_chunks: int: number of chunks for attention
    attention_type: class: attention class to use, such as DotProductAttention.
    share_qk: bool, whether to share queries and keys.
    axial_pos_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    d_axial_pos_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match axial_pos_shape, values must sum to d_embedding.
    ff_activation: the non-linearity in feed-forward layer
    ff_use_sru: int; if > 0, we use this many SRU layers instead of feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
  """
  assert mode != 'predict'  # TODO(lukaszkaiser,kitaev): fast inference

  if not axial_pos_shape:
    positional_encoding = tl.PositionalEncoding(
        max_len=max_len, dropout=dropout, mode=mode)
  else:
    assert d_axial_pos_embs is not None
    positional_encoding = tl.AxialPositionalEncoding(
        shape=axial_pos_shape, d_embs=d_axial_pos_embs,
        dropout_broadcast_dims=tuple(range(1, len(axial_pos_shape) + 1)),
        dropout=dropout, mode=mode)

  positional_embedder = [
      tl.Embedding(d_embedding, vocab_size),
      BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
      positional_encoding,
  ]

  decoder_blocks = []

  if isinstance(attention_type, (tuple, list)):
    assert n_layers % len(attention_type) == 0
  else:
    attention_type = [attention_type]
  for layer_idx in range(n_layers):
    layer_attention_type = attention_type[layer_idx % len(attention_type)]
    decoder_block = DecoderBlock(
        d_model, d_ff, d_attention_key, d_attention_value, n_heads,
        n_attention_chunks,
        attention_type=layer_attention_type,
        dropout=dropout,
        share_qk=(share_qk or issubclass(layer_attention_type,
                                         tl.LSHCausalAttention)),
        ff_activation=ff_activation,
        ff_use_sru=ff_use_sru,
        ff_chunk_size=ff_chunk_size,
        mode=mode)
    decoder_blocks.append(decoder_block)

  # pylint: disable=g-long-lambda
  return tl.Serial(
      tl.ShiftRight(),
      positional_embedder,
      tl.Dup(),              # Stack has (x, x), the first will be shortened
      # Before shortening, we need to pad by shorten factor so as not to leak
      # information into the future. To understand why, imagine shorten factor
      # of 2 and sequence of length 4, so ABCD. If we shift just by 1, then we
      # would have 0ABC, which gets grouped to [0A][BC] on input, which is
      # predicting ABCD as targets. The problem is that [0A] has access to A
      # and [BC] has access to C -- it will learn to copy it, peek into
      # the future. Shifting twice to [00][AB] solves the problem as the first
      # "big" symbol becomes all-0 and the rest is shifted enough.
      tl.ShiftRight(n_shifts=shorten_factor - 1),
      tl.Fn(lambda x: np.reshape(  # Shorten -- move to depth.
          x, (x.shape[0], x.shape[1] // shorten_factor, -1)), n_out=1),
      tl.Dense(d_model),
      tl.Dup(),  # Stack has (short_x, short_x, x)
      tl.ReversibleSerial(decoder_blocks),
      tl.Select([0], n_in=2),
      tl.LayerNorm(),
      BroadcastedDropout(rate=dropout, mode=mode),  # pylint: disable=no-value-for-parameter
      tl.Dense(shorten_factor * d_embedding),
      tl.Fn(lambda x: np.reshape(  # Prolong back.
          x, (x.shape[0], x.shape[1] * shorten_factor, -1)), n_out=1),
      tl.Concatenate(),  # Concatenate with just the embeddings.
      tl.CausalConv(d_embedding),
      tl.Relu(),
      tl.SRU(d_embedding),  # One RNN layer for conditional dependence.
      tl.Dense(vocab_size),
      tl.LogSoftmax()
  )
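
# The shortening trick above is just a reshape: groups of `shorten_factor`
# consecutive positions are packed into depth on the way in, and unpacked on
# the way out. Below is a minimal numpy sketch with made-up sizes (the model
# itself uses trax.fastmath.numpy); it is an illustration, not the model code.

import numpy as np

batch, length, d_emb, F = 2, 8, 4, 2  # illustrative sizes; F = shorten_factor
x = np.arange(batch * length * d_emb, dtype=np.float32).reshape(
    batch, length, d_emb)

# Shorten: pack each group of F positions into one vector along depth.
short = np.reshape(x, (x.shape[0], x.shape[1] // F, -1))
print(short.shape)              # (2, 4, 8) -- half the length, twice the depth

# Prolong: invert the grouping to recover the original layout.
back = np.reshape(short, (short.shape[0], short.shape[1] * F, -1))
print(np.array_equal(x, back))  # True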
Example No. 3
 def test_policytrainer_save_restore(self):
     """Check save and restore of policy trainer."""
     task = rl_task.RLTask('CartPole-v0',
                           initial_trajectories=10,
                           max_steps=200)
     model = functools.partial(
         models.Policy,
         body=lambda mode: tl.Serial(  # pylint: disable=g-long-lambda
             tl.Dense(64), tl.Relu(), tl.Dense(64), tl.Relu()),
     )
     tmp_dir = self.create_tempdir().full_path
     trainer1 = training.PolicyGradientTrainer(
         task,
         policy_model=model,
         policy_optimizer=opt.Adam,
         policy_batch_size=128,
         policy_train_steps_per_epoch=1,
         n_trajectories_per_epoch=2,
         n_eval_episodes=1,
         output_dir=tmp_dir)
     trainer1.run(1)
     trainer1.run(1)
     self.assertEqual(trainer1.current_epoch, 2)
     self.assertEqual(trainer1._policy_trainer.step, 2)
     # Trainer 2 starts where trainer 1 stopped.
     trainer2 = training.PolicyGradientTrainer(
         task,
         policy_model=model,
         policy_optimizer=opt.Adam,
         policy_batch_size=128,
         policy_train_steps_per_epoch=1,
         n_trajectories_per_epoch=2,
         n_eval_episodes=1,
         output_dir=tmp_dir)
     trainer2.run(1)
     self.assertEqual(trainer2.current_epoch, 3)
     self.assertEqual(trainer2._policy_trainer.step, 3)
     # Trainer 3 has 2x steps-per-epoch but resumes at epoch 3, so it should raise an error.
     trainer3 = training.PolicyGradientTrainer(
         task,
         policy_model=model,
         policy_optimizer=opt.Adam,
         policy_batch_size=128,
         policy_train_steps_per_epoch=2,
         n_trajectories_per_epoch=2,
         n_eval_episodes=1,
         output_dir=tmp_dir)
     self.assertRaises(ValueError, trainer3.run)
     # Manually set saved epoch to 1.
     dictionary = {
         'epoch': 1,
         'avg_returns': [0.0],
         'avg_returns_temperature0': {
             200: [0.0]
         }
     }
     with tf.io.gfile.GFile(os.path.join(tmp_dir, 'rl.pkl'), 'wb') as f:
         pickle.dump(dictionary, f)
     # Trainer 3 should still fail: it takes 2 steps between evals and cannot take just 1.
     self.assertRaises(ValueError, trainer3.run)
     # Trainer 4 does 1 step per eval, should train 1 step in epoch 2.
     trainer4 = training.PolicyGradientTrainer(
         task,
         policy_model=model,
         policy_optimizer=opt.Adam,
         policy_batch_size=128,
         policy_train_steps_per_epoch=2,
         policy_evals_per_epoch=2,
         n_trajectories_per_epoch=2,
         n_eval_episodes=1,
         output_dir=tmp_dir)
     trainer4.run(1)
     self.assertEqual(trainer4.current_epoch, 2)
     self.assertEqual(trainer4._policy_trainer.step, 4)
     trainer1.close()
     trainer2.close()
     trainer3.close()
     trainer4.close()
Example No. 4
 def test_relu(self):
     layer = tl.Relu()
     x = np.array([-2.0, -1.0, 0.0, 2.0, 3.0, 5.0])
     y = layer(x)
     self.assertEqual(tl.to_list(y), [0.0, 0.0, 0.0, 2.0, 3.0, 5.0])
Example No. 5
# # Creating a GRU model using Trax: Ungraded Lecture Notebook

# For this lecture notebook you will be using Trax's layers. These are the building blocks for creating neural networks with Trax.

# In[1]:

import trax
from trax import layers as tl

# Trax lets you define neural network architectures by stacking layers (similarly to other libraries such as Keras). For this, the `Serial()` combinator is often used, as it allows you to stack layers serially using function composition.
#
# Next you can see a simple vanilla NN architecture containing one hidden (dense) layer with 128 units and an output (dense) layer with 10 units, followed by a final LogSoftmax layer.

# In[2]:

mlp = tl.Serial(tl.Dense(128), tl.Relu(), tl.Dense(10), tl.LogSoftmax())

# Each of the layers within the `Serial` combinator is considered a sublayer. Notice that unlike similar libraries, **in Trax the activation functions are considered layers.** To learn more about the `Serial` layer, check the docs [here](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.combinators.Serial).
#
# You can try printing this object:

# In[3]:

print(mlp)

# Printing the model gives you the exact same information as the model's definition itself.
#
# By just looking at the definition you can clearly see what is going on inside the neural network. Trax is very straightforward in the way a network is defined; that is one of the things that makes it awesome!

# ## GRU MODEL
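
# As a sketch of the same `Serial` pattern, a GRU language model can be stacked from a `ShiftRight` (feed the previous token), an `Embedding`, a few `GRU` layers, and a `Dense` + `LogSoftmax` head. The sizes below (`vocab_size`, `model_dimension`, `n_layers`) are illustrative values, not ones prescribed by this notebook.

# In[4]:

mode = 'train'
vocab_size = 256        # illustrative value
model_dimension = 512   # illustrative value
n_layers = 2            # illustrative value

GRU = tl.Serial(
    tl.ShiftRight(mode=mode),  # shift right so the model predicts the next token
    tl.Embedding(vocab_size=vocab_size, d_feature=model_dimension),
    [tl.GRU(n_units=model_dimension) for _ in range(n_layers)],
    tl.Dense(n_units=vocab_size),
    tl.LogSoftmax(),
)

print(GRU)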
Example No. 6
def LatentTransformer(input_vocab_size,
                      output_vocab_size=None,
                      d_model=512,
                      d_ff=2048,
                      n_encoder_layers=6,
                      n_decoder_layers=6,
                      n_heads=8,
                      dropout=0.1,
                      dropout_shared_axes=None,
                      max_len=2048,
                      mode='train',
                      ff_activation=tl.Relu,
                      axial_pos_shape=None,
                      d_axial_pos_embs=None):
    """Returns a Transformer model.

  This model expects an input pair: target, source.

  Args:
    input_vocab_size: int: vocab size of the source.
    output_vocab_size: int (optional): vocab size of the target. If None, the
      source and target are assumed to have the same vocab.
    d_model: int:  depth of embedding
    d_ff: int: depth of feed-forward layer
    n_encoder_layers: int: number of encoder layers
    n_decoder_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    dropout_shared_axes: axes on which to share dropout mask
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'
    ff_activation: the non-linearity in feed-forward layer
    axial_pos_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    d_axial_pos_embs: tuple of ints: depth of position embedding for each axis.
      Tuple length must match axial_pos_shape, and values must sum to d_model.

  Returns:
    A Transformer model as a layer that maps from a target, source pair to
    activations over a vocab set.
  """
    in_encoder, out_encoder, output_vocab_size = (
        ct.EmbeddingAndPositionalEncodings(input_vocab_size,
                                           d_model,
                                           mode,
                                           dropout,
                                           dropout_shared_axes,
                                           max_len,
                                           output_vocab_size=output_vocab_size,
                                           axial_pos_shape=axial_pos_shape,
                                           d_axial_pos_embs=d_axial_pos_embs))

    encoder_blocks = [
        _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for _ in range(n_encoder_layers)
    ]

    encoder = tl.Serial(in_encoder, encoder_blocks, tl.LayerNorm())
    if mode == 'predict':
        encoder = tl.Cache(encoder)

    decoder_blocks = [
        _DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                      mode, ff_activation) for _ in range(n_decoder_layers)
    ]

    compress_seq = tl.Serial(
        # Input:                              tok
        tl.Branch([], tl.PaddingMask()),  #   tok  mask
        encoder,                          #   vec  mask
        PickFirst(),                      # vec_f  mask
        tl.Select([0], n_in=2))           # vec_f

    latent_transition = tl.Serial(
        tl.Parallel([tl.Dense(d_model), tl.Relu()],
                    [tl.Dense(d_model), tl.Relu()]), tl.Add(),
        tl.Residual(
            tl.LayerNorm(),
            tl.Dense(d_model),
            tl.Relu(),
            tl.Dropout(rate=dropout, mode=mode),
            tl.Dense(d_model),
        ))

    pred_valid = tl.Serial(tl.Dense(2), Squeeze(1))

    embed_tgt = tl.Serial(
        # Input:               tok_d
        DropLast(mode=mode),  # stok_d
        out_encoder,          # svec_d
    )

    decode_seq = tl.Serial(
        # Input:                                    vec_e  tok_d
        tl.Select([1, 0, 1]),                     #  tok_d  vec_e  tok_d
        tl.Parallel(embed_tgt, [], DropFirst()),  # svec_d  vec_e  tok_d'
        ConcatDeEntoEnDe(),                       # vec_ed  tok_d'
        # Decoder blocks with causal attention.
        decoder_blocks,                           # vec_ed  tok_d'
        tl.LayerNorm(),                           # vec_ed  tok_d'
        DropFirst(),                              #  vec_d  tok_d'
        # Map to output vocab.
        tl.Dense(output_vocab_size),              # pred_d  tok_d'
    )

    # compress_seq: n_in 1 n_out 1: add mask, encode, pick last hidden
    # latent_transition: n_in 2 n_out 1: s, a -> s_1
    # pred_valid: n_in 1 n_out 1: s_1 -> pred_v
    # decode_seq: n_in 2 n_out 2: copy target, shift right, decode, output

    return tl.Serial(
        # Input:   tok_s  tok_a  tok_s1  r  v
        tl.Select([0, 1, 2, 0, 1, 3, 4]),
        # Stack:   tok_s  tok_a  tok_s1  tok_s  tok_a  r  v

        # Encode.
        tl.Parallel(compress_seq, compress_seq),
        # Stack:   vec_s  vec_a  tok_s1  tok_s  tok_a  r  v
        tl.Branch(latent_transition, [], tl.Select([1], n_in=2)),
        # Stack:   vec_s1  vec_s  vec_a  tok_s1  tok_s  tok_a  r  v
        tl.Branch(pred_valid, []),
        # Stack:   pred_v  vec_s1  vec_s  vec_a  tok_s1  tok_s  tok_a  r  v

        # Decode.
        tl.Select([1, 4, 2, 5, 3, 6, 0, 8, 7]),
        # Stack:   vec_s1  tok_s1  vec_s  tok_s  vec_a  tok_a  pred_v  v  r
        tl.Parallel(decode_seq, decode_seq, decode_seq),
        # Stack:   pred_s1  tok_s1  pred_s  tok_s  pred_a  tok_a  pred_v  v  r
    )
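
# The stack juggling above relies on combinators like tl.Select copying and
# reordering stack elements by index. A tiny self-contained sketch of that
# behavior (an illustration assuming a standard trax install; not part of
# this module):

import numpy as np
from trax import layers as tl

a, b = np.array([1.0]), np.array([2.0])

# Select([1, 0, 1]) consumes the pair (a, b) and emits (b, a, b):
# indices refer to positions on the input stack, and repeats copy.
sel = tl.Select([1, 0, 1])
print(sel((a, b)))  # -> (array([2.]), array([1.]), array([2.]))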
Example No. 7
def ResidualFeedForward(d_model, d_ff, dropout, mode):
    """Residual feed-forward layer with normalization at start."""
    stack = tl.Serial(tl.LayerNorm(), tl.Dense(d_ff), tl.Relu(),
                      tl.Dropout(rate=dropout, mode=mode), tl.Dense(d_model),
                      tl.Dropout(rate=dropout, mode=mode))
    return tl.Residual(PreservePosition(stack))
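
# For intuition, tl.Residual(f) computes x + f(x). A tiny sketch (an
# illustration assuming a standard trax install, not part of this module):
# with f = Relu, negative entries pass through via the shortcut only, while
# positive entries are doubled.

import numpy as np
from trax import layers as tl

x = np.array([-1.0, 0.0, 2.0])
res = tl.Residual(tl.Relu())  # x + Relu(x)
print(res(x))  # -> [-1.  0.  4.]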