# They take inputs, compute functions/custom calculations, and return outputs.
#
# You can also inspect layer properties. Let me show you some examples.

# %% [markdown]
# ### Relu Layer
# First, let's see how to build a ReLU activation function as a layer. A layer like this is one of the simplest types. Notice there is no object initialization, so it works just like a math function.
#
# **Note: Activation functions are also layers in Trax, which might look odd if you have been using other frameworks for a longer time.**

# %% tags=[]
# Layers

# Create a ReLU Trax layer
relu = tl.Relu()

# Inspect properties
print("-- Properties --")
print("name :", relu.name)
print("expected inputs :", relu.n_in)
print("promised outputs :", relu.n_out, "\n")

# Inputs
x = np.array([-2, -1, 0, 1, 2])
print("-- Inputs --")
print("x :", x, "\n")

# Outputs
y = relu(x)
print("-- Outputs --")
print("y :", y)
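# %% [markdown]
# For contrast with the weightless `Relu` above, a layer that holds weights (such as `tl.Dense`) must be initialized before it can be called. The cell below is a minimal sketch of that workflow; the layer size and the `trax.shapes` import are illustrative assumptions, not part of the original notebook.

# %%
from trax import shapes

# A Dense layer has weights, so unlike Relu it needs explicit initialization.
dense = tl.Dense(3)
x = np.array([[1.0, 2.0]])
dense.init(shapes.signature(x))  # build weights for this input signature
y = dense(x)
print("y shape :", y.shape)  # (1, 3)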
def ReformerShortenLM(vocab_size,
                      shorten_factor=1,
                      d_embedding=256,
                      d_model=512,
                      d_ff=2048,
                      d_attention_key=64,
                      d_attention_value=64,
                      n_layers=6,
                      n_heads=8,
                      dropout=0.1,
                      max_len=2048,
                      n_attention_chunks=1,
                      attention_type=tl.DotProductCausalAttention,
                      share_qk=False,
                      axial_pos_shape=(),
                      d_axial_pos_embs=None,
                      ff_activation=tl.FastGelu,
                      ff_use_sru=0,
                      ff_chunk_size=0,
                      mode='train'):
  """Reversible transformer language model with shortening.

  When shorten_factor is F and processing an input of shape [batch, length],
  we embed the (shifted-right) input and then group each F elements (along
  the length axis) into a single vector, so that we process a tensor of shape
  [batch, length // F, d_model] almost until the end; there it is
  un-shortened and an SRU is applied. This reduces the length processed
  inside the main model body, effectively making the model faster but
  possibly slightly less accurate.

  Args:
    vocab_size: int: vocab size
    shorten_factor: by how much to shorten, see above
    d_embedding: the depth of the embedding layer and final logits
    d_model: int: depth of *each half* of the two-part features
    d_ff: int: depth of feed-forward layer
    d_attention_key: int: depth of key vector for each attention head
    d_attention_value: int: depth of value vector for each attention head
    n_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    max_len: int: maximum symbol length for positional encoding
    n_attention_chunks: int: number of chunks for attention
    attention_type: class: attention class to use, such as
      DotProductAttention.
    share_qk: bool, whether to share queries and keys.
    axial_pos_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    d_axial_pos_embs: tuple of ints: depth of position embedding for each
      axis. Tuple length must match axial_pos_shape, values must sum to
      d_embedding.
    ff_activation: the non-linearity in feed-forward layer
    ff_use_sru: int; if > 0, we use this many SRU layers instead of
      feed-forward
    ff_chunk_size: int; if > 0, chunk feed-forward into this-sized chunks
    mode: str: 'train' or 'eval'

  Returns:
    the layer.
""" assert mode != 'predict' # TODO(lukaszkaiser,kitaev): fast inference if not axial_pos_shape: positional_encoding = tl.PositionalEncoding( max_len=max_len, dropout=dropout, mode=mode) else: assert d_axial_pos_embs is not None positional_encoding = tl.AxialPositionalEncoding( shape=axial_pos_shape, d_embs=d_axial_pos_embs, dropout_broadcast_dims=tuple(range(1, len(axial_pos_shape) + 1)), dropout=dropout, mode=mode) positional_embedder = [ tl.Embedding(d_embedding, vocab_size), BroadcastedDropout(rate=dropout, mode=mode), # pylint: disable=no-value-for-parameter positional_encoding, ] decoder_blocks = [] if isinstance(attention_type, (tuple, list)): assert n_layers % len(attention_type) == 0 else: attention_type = [attention_type] for layer_idx in range(n_layers): layer_attention_type = attention_type[layer_idx % len(attention_type)] decoder_block = DecoderBlock( d_model, d_ff, d_attention_key, d_attention_value, n_heads, n_attention_chunks, attention_type=layer_attention_type, dropout=dropout, share_qk=(share_qk or issubclass(layer_attention_type, tl.LSHCausalAttention)), ff_activation=ff_activation, ff_use_sru=ff_use_sru, ff_chunk_size=ff_chunk_size, mode=mode) decoder_blocks.append(decoder_block) # pylint: disable=g-long-lambda return tl.Serial( tl.ShiftRight(), positional_embedder, tl.Dup(), # Stack has (x, x), the first will be shortened # Before shortening, we need to pad by shorten factor so as not to leak # information into the future. To understand why, imagine shorten factor # of 2 and sequence of length 4, so ABCD. If we shift just by 1, then we # would have 0ABC, which gets grouped to [0A][BC] on input, which is # predicting ABCD as targets. The problem is that [0A] has access to A # and [BC] has access to C -- it will learn to copy it, peek into # the future. Shifting twice to [00][AB] solves the problem as the first # "big" symbol becomes all-0 and the rest is shifted enough. tl.ShiftRight(n_shifts=shorten_factor - 1), tl.Fn(lambda x: np.reshape( # Shorten -- move to depth. x, (x.shape[0], x.shape[1] // shorten_factor, -1)), n_out=1), tl.Dense(d_model), tl.Dup(), # Stack has (short_x, short_x, x) tl.ReversibleSerial(decoder_blocks), tl.Select([0], n_in=2), tl.LayerNorm(), BroadcastedDropout(rate=dropout, mode=mode), # pylint: disable=no-value-for-parameter tl.Dense(shorten_factor * d_embedding), tl.Fn(lambda x: np.reshape( # Prolong back. x, (x.shape[0], x.shape[1] * shorten_factor, -1)), n_out=1), tl.Concatenate(), # Concatenate with just the embeddings. tl.CausalConv(d_embedding), tl.Relu(), tl.SRU(d_embedding), # One RNN layer for conditional dependence. tl.Dense(vocab_size), tl.LogSoftmax() )
def test_policytrainer_save_restore(self):
  """Check save and restore of policy trainer."""
  task = rl_task.RLTask('CartPole-v0', initial_trajectories=10,
                        max_steps=200)
  model = functools.partial(
      models.Policy,
      body=lambda mode: tl.Serial(  # pylint: disable=g-long-lambda
          tl.Dense(64), tl.Relu(), tl.Dense(64), tl.Relu()),
  )
  tmp_dir = self.create_tempdir().full_path
  trainer1 = training.PolicyGradientTrainer(
      task,
      policy_model=model,
      policy_optimizer=opt.Adam,
      policy_batch_size=128,
      policy_train_steps_per_epoch=1,
      n_trajectories_per_epoch=2,
      n_eval_episodes=1,
      output_dir=tmp_dir)
  trainer1.run(1)
  trainer1.run(1)
  self.assertEqual(trainer1.current_epoch, 2)
  self.assertEqual(trainer1._policy_trainer.step, 2)
  # Trainer 2 starts where trainer 1 stopped.
  trainer2 = training.PolicyGradientTrainer(
      task,
      policy_model=model,
      policy_optimizer=opt.Adam,
      policy_batch_size=128,
      policy_train_steps_per_epoch=1,
      n_trajectories_per_epoch=2,
      n_eval_episodes=1,
      output_dir=tmp_dir)
  trainer2.run(1)
  self.assertEqual(trainer2.current_epoch, 3)
  self.assertEqual(trainer2._policy_trainer.step, 3)
  # Trainer 3 has 2x steps-per-epoch but restores at epoch 3, so it should
  # raise an error.
  trainer3 = training.PolicyGradientTrainer(
      task,
      policy_model=model,
      policy_optimizer=opt.Adam,
      policy_batch_size=128,
      policy_train_steps_per_epoch=2,
      n_trajectories_per_epoch=2,
      n_eval_episodes=1,
      output_dir=tmp_dir)
  self.assertRaises(ValueError, trainer3.run)
  # Manually set the saved epoch to 1.
  dictionary = {
      'epoch': 1,
      'avg_returns': [0.0],
      'avg_returns_temperature0': {200: [0.0]}
  }
  with tf.io.gfile.GFile(os.path.join(tmp_dir, 'rl.pkl'), 'wb') as f:
    pickle.dump(dictionary, f)
  # Trainer 3 should still fail: there are 2 steps between evals, so it
  # cannot do 1.
  self.assertRaises(ValueError, trainer3.run)
  # Trainer 4 does 1 step per eval, so it should train 1 step in epoch 2.
  trainer4 = training.PolicyGradientTrainer(
      task,
      policy_model=model,
      policy_optimizer=opt.Adam,
      policy_batch_size=128,
      policy_train_steps_per_epoch=2,
      policy_evals_per_epoch=2,
      n_trajectories_per_epoch=2,
      n_eval_episodes=1,
      output_dir=tmp_dir)
  trainer4.run(1)
  self.assertEqual(trainer4.current_epoch, 2)
  self.assertEqual(trainer4._policy_trainer.step, 4)
  trainer1.close()
  trainer2.close()
  trainer3.close()
  trainer4.close()
def test_relu(self):
  layer = tl.Relu()
  x = np.array([-2.0, -1.0, 0.0, 2.0, 3.0, 5.0])
  y = layer(x)
  self.assertEqual(tl.to_list(y), [0.0, 0.0, 0.0, 2.0, 3.0, 5.0])
# # Creating a GRU model using Trax: Ungraded Lecture Notebook

# For this lecture notebook you will be using Trax's layers. These are the building blocks for creating neural networks with Trax.

# In[1]:

import trax
from trax import layers as tl

# Trax allows you to define neural network architectures by stacking layers (similarly to other libraries such as Keras). For this, the `Serial()` combinator is often used, as it stacks layers serially using function composition.
#
# Next you can see a simple vanilla NN architecture containing 1 hidden (dense) layer with 128 cells and an output (dense) layer with 10 cells, on which we apply the final LogSoftmax layer.

# In[2]:

mlp = tl.Serial(tl.Dense(128), tl.Relu(), tl.Dense(10), tl.LogSoftmax())

# Each of the layers within the `Serial` combinator layer is considered a sublayer. Notice that unlike similar libraries, **in Trax the activation functions are considered layers.** To learn more about the `Serial` layer, check the docs [here](https://trax-ml.readthedocs.io/en/latest/trax.layers.html#trax.layers.combinators.Serial).
#
# You can try printing this object:

# In[3]:

print(mlp)

# Printing the model gives you the exact same information as the model's definition itself.
#
# By just looking at the definition you can clearly see what is going on inside the neural network. Trax is very straightforward in the way a network is defined; that is one of the things that makes it awesome!

# ## GRU MODEL
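# To make this section concrete, here is a minimal sketch of a GRU language model built with `Serial`, using the existing Trax layers `tl.ShiftRight`, `tl.Embedding`, and `tl.GRU`. The layer count and sizes below are illustrative assumptions, not values prescribed by this notebook.

# In[ ]:

mode = 'train'
vocab_size = 256
model_dimension = 512
n_layers = 2

GRU = tl.Serial(
    tl.ShiftRight(mode=mode),  # shift right so the model predicts the next token
    tl.Embedding(vocab_size=vocab_size, d_feature=model_dimension),
    [tl.GRU(n_units=model_dimension) for _ in range(n_layers)],
    tl.Dense(n_units=vocab_size),
    tl.LogSoftmax()
)

print(GRU)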
def LatentTransformer(input_vocab_size,
                      output_vocab_size=None,
                      d_model=512,
                      d_ff=2048,
                      n_encoder_layers=6,
                      n_decoder_layers=6,
                      n_heads=8,
                      dropout=0.1,
                      dropout_shared_axes=None,
                      max_len=2048,
                      mode='train',
                      ff_activation=tl.Relu,
                      axial_pos_shape=None,
                      d_axial_pos_embs=None):
  """Returns a Transformer model.

  This model expects an input pair: target, source.

  Args:
    input_vocab_size: int: vocab size of the source.
    output_vocab_size: int (optional): vocab size of the target. If None,
      the source and target are assumed to have the same vocab.
    d_model: int: depth of embedding
    d_ff: int: depth of feed-forward layer
    n_encoder_layers: int: number of encoder layers
    n_decoder_layers: int: number of decoder layers
    n_heads: int: number of attention heads
    dropout: float: dropout rate (how much to drop out)
    dropout_shared_axes: axes on which to share the dropout mask
    max_len: int: maximum symbol length for positional encoding
    mode: str: 'train' or 'eval'
    ff_activation: the non-linearity in feed-forward layer
    axial_pos_shape: tuple of ints: input shape to use for the axial position
      encoding. If unset, axial position encoding is disabled.
    d_axial_pos_embs: tuple of ints: depth of position embedding for each
      axis. Tuple length must match axial_pos_shape, and values must sum to
      d_model.

  Returns:
    A Transformer model as a layer that maps from a target, source pair to
    activations over a vocab set.
  """
  in_encoder, out_encoder, output_vocab_size = (
      ct.EmbeddingAndPositionalEncodings(input_vocab_size,
                                         d_model,
                                         mode,
                                         dropout,
                                         dropout_shared_axes,
                                         max_len,
                                         output_vocab_size=output_vocab_size,
                                         axial_pos_shape=axial_pos_shape,
                                         d_axial_pos_embs=d_axial_pos_embs))

  encoder_blocks = [
      _EncoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                    mode, ff_activation)
      for i in range(n_encoder_layers)
  ]

  encoder = tl.Serial(in_encoder, encoder_blocks, tl.LayerNorm())
  if mode == 'predict':
    encoder = tl.Cache(encoder)

  decoder_blocks = [
      _DecoderBlock(d_model, d_ff, n_heads, dropout, dropout_shared_axes,
                    mode, ff_activation)
      for i in range(n_decoder_layers)
  ]

  compress_seq = tl.Serial(             # input: tok
      tl.Branch([], tl.PaddingMask()),  # tok mask
      encoder,                          # vec mask
      PickFirst(),                      # vec_f mask
      tl.Select([0], n_in=2))           # vec_f

  latent_transition = tl.Serial(
      tl.Parallel([tl.Dense(d_model), tl.Relu()],
                  [tl.Dense(d_model), tl.Relu()]),
      tl.Add(),
      tl.Residual(
          tl.LayerNorm(),
          tl.Dense(d_model),
          tl.Relu(),
          tl.Dropout(rate=dropout, mode=mode),
          tl.Dense(d_model),
      ))

  pred_valid = tl.Serial(tl.Dense(2), Squeeze(1))

  embed_tgt = tl.Serial(     # input: tok_d
      DropLast(mode=mode),   # stok_d
      out_encoder,           # svec_d
  )

  decode_seq = tl.Serial(                     # input: vec_e tok_d
      tl.Select([1, 0, 1]),                   # tok_d vec_e tok_d
      tl.Parallel(embed_tgt, [], DropFirst()),  # svec_d vec_e tok_d'
      ConcatDeEntoEnDe(),                     # vec_ed tok_d'
      # Decoder blocks with causal attention.
      decoder_blocks,                         # vec_ed tok_d'
      tl.LayerNorm(),                         # vec_ed tok_d'
      DropFirst(),                            # vec_d tok_d'
      # Map to output vocab.
      tl.Dense(output_vocab_size),            # pred_d tok_d'
  )

  # compress_seq:      n_in 1, n_out 1: add mask, encode, pick last hidden
  # latent_transition: n_in 2, n_out 1: (s, a) -> s_1
  # pred_valid:        n_in 1, n_out 1: s_1 -> pred_v
  # decode_seq:        n_in 2, n_out 2: copy target, shift right, decode, output
  return tl.Serial(
      # Stack positions:  0      1      2       3  4  5  6  7  8
      # Input:            tok_s  tok_a  tok_s1  r  v
      tl.Select([0, 1, 2, 0, 1, 3, 4]),
      # tok_s tok_a tok_s1 tok_s tok_a r v

      # Encode.
      tl.Parallel(compress_seq, compress_seq),
      # vec_s vec_a tok_s1 tok_s tok_a r v
      tl.Branch(latent_transition, [], tl.Select([1], n_in=2)),
      # vec_s1 vec_s vec_a tok_s1 tok_s tok_a r v
      tl.Branch(pred_valid, []),
      # pred_v vec_s1 vec_s vec_a tok_s1 tok_s tok_a r v

      # Decode.
      tl.Select([1, 4, 2, 5, 3, 6, 0, 8, 7]),
      # vec_s1 tok_s1 vec_s tok_s vec_a tok_a pred_v v r
      tl.Parallel(decode_seq, decode_seq, decode_seq),
      # pred_s1 tok_s1 pred_s tok_s pred_a tok_a pred_v v r
  )
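# The stack juggling above is driven by tl.Select, which reads `n_in` items
# and re-emits them in the order given by its indices: repeating an index
# copies an item, omitting one drops it. A minimal sketch with toy arrays
# (not model data):

import numpy as np
from trax import layers as tl

a, b = np.array([1.0]), np.array([2.0])

sel = tl.Select([0, 1, 0])     # n_in=2 (inferred), n_out=3
print(sel((a, b)))             # (array([1.]), array([2.]), array([1.]))

drop = tl.Select([0], n_in=2)  # keep the top stack item, drop the second
print(drop((a, b)))            # array([1.])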
def ResidualFeedForward(d_model, d_ff, dropout, mode):
  """Residual feed-forward layer with normalization at start."""
  stack = tl.Serial(
      tl.LayerNorm(),                    # normalize before the block
      tl.Dense(d_ff),                    # expand to the feed-forward depth
      tl.Relu(),
      tl.Dropout(rate=dropout, mode=mode),
      tl.Dense(d_model),                 # project back to the model depth
      tl.Dropout(rate=dropout, mode=mode)
  )
  return tl.Residual(PreservePosition(stack))
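# For reference, tl.Residual(f) computes x + f(x). A minimal sketch of that
# semantics with a weightless branch (toy input; only trax and numpy are
# assumed, not the PreservePosition wrapper used above):

import numpy as np
from trax import layers as tl
from trax import shapes

res = tl.Residual(tl.Relu())
x = np.array([-1.0, 2.0])
res.init(shapes.signature(x))  # no weights here, but matches the usual workflow
print(res(x))                  # [-1.  4.]  ==  x + relu(x)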