def test_warp():
    B, T, D = get_dim_vars('b t d')
    x: 'btd' = np.ones((B, T, D))

    # two view transformations (reshapes) in sequence
    x1 = warp(x, 'btd -> b,t,4,d//4 -> b*t,4,d//4', 'vv', debug=False)
    assert x1.shape == (B * T, 4, D // 4)

    # four reshapes in sequence
    x2 = warp(x, 'btd -> b,t,4,d//4 -> b*t,4,d//4 -> b,t,4,d//4 -> btd', 'vvvv', debug=False)
    assert x2.shape == (B, T, D)

    # the same reshape sequence in shorthand, specified as a list of transformations
    x2 = warp(x, ['__d -> ,,4,d//4', 'b,t,, -> b*t,,', 'b*t,, -> b,t,,', ',,4,d//4 -> ,,d'],
              'vvvv', debug=True)
    assert x2.shape == (B, T, D)

    print('test_warp: all assertions hold')
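# The tests here call get_dim_vars('b t d'), which presumes the dimension
# variables were declared once up front. A minimal sketch of that setup,
# assuming tsalib's dim_vars declaration syntax (names and sizes below are
# illustrative, not this repo's actual configuration):
def sketch_declare_dim_vars():
    from tsalib import dim_vars
    B, T, D = dim_vars('Batch(b):8 SeqLength(t):128 EmbedDim(d):64')
    return B, T, D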
def warp_long1():
    B, T, D, C = get_dim_vars('b t d c')
    x1: 'btd' = np.ones((B, T, D))
    x2: 'btd' = np.ones((B, T, D))
    x3: 'btd' = np.ones((B, T, D))

    y = warp([x1, x2, x3], '(btd)* -> btdc -> bdtc -> b,d//2,t*2,c', 'jpv')
    assert y.shape == (B, D // 2, T * 2, C)
    print('warp_long1: all assertions hold')
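# For reference, the 'jpv' chain above can be reproduced with plain numpy ops.
# A minimal sketch with assumed toy sizes, taking 'j' to stack the tensor list
# along a new trailing axis (as the '(btd)* -> btdc' spec indicates):
def sketch_jpv_numpy():
    import numpy as np
    B, T, D, C = 2, 4, 8, 3
    xs = [np.ones((B, T, D)) for _ in range(C)]
    y = np.stack(xs, axis=-1)            # '(btd)* -> btdc' (join)
    y = y.transpose(0, 2, 1, 3)          # 'btdc -> bdtc' (permute)
    y = y.reshape(B, D // 2, T * 2, C)   # 'bdtc -> b,d//2,t*2,c' (view)
    assert y.shape == (B, D // 2, T * 2, C)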
def test_warp_pytorch():
    B, T, D = get_dim_vars('b t d')
    import torch
    y: 'btd' = torch.randn(B, T, D)

    # a reshape followed by a permute
    y = warp(y, 'btd -> b,t,4,d//4 -> b,4,t,d//4', 'vp', debug=False)
    assert y.shape == (B, 4, T, D // 4)
    print('test_warp_pytorch: all assertions hold')
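# For reference, the 'vp' warp above corresponds to a view followed by a
# permute in plain torch. A minimal sketch with assumed toy sizes:
def sketch_vp_torch():
    import torch
    B, T, D = 4, 8, 64
    y = torch.randn(B, T, D)
    y = y.view(B, T, 4, D // 4)    # 'btd -> b,t,4,d//4' (view)
    y = y.permute(0, 2, 1, 3)      # 'b,t,4,d//4 -> b,4,t,d//4' (permute)
    assert y.shape == (B, 4, T, D // 4)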
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size=128,
                     initializer_range=0.02,
                     word_embedding_name="word_embeddings",
                     use_one_hot_embeddings=False):
    """Looks up word embeddings for an id tensor.

    Args:
      input_ids: int32 Tensor of shape [batch_size, seq_length] containing word ids.
      vocab_size: int. Size of the embedding vocabulary.
      embedding_size: int. Width of the word embeddings.
      initializer_range: float. Embedding initialization range.
      word_embedding_name: string. Name of the embedding table.
      use_one_hot_embeddings: bool. If True, use the one-hot method for word
        embeddings. If False, use `tf.nn.embedding_lookup()`. One hot is better
        for TPUs.

    Returns:
      float Tensor of shape [batch_size, seq_length, embedding_size].
    """
    # This function assumes that the input is of shape
    # [batch_size, seq_length, num_inputs].
    #
    # If the input is a 2D tensor of shape [batch_size, seq_length], we
    # reshape it to [batch_size, seq_length, 1].
    if input_ids.shape.ndims == 2:
        input_ids = tf.expand_dims(input_ids, axis=[-1])

    B, T, D = get_dim_vars('b t d')
    input_ids: 'bti'  # i: num of inputs
    # TODO: define/pick up i from input_ids
    i = get_shape_list(input_ids)[-1]

    embedding_table: 'vd' = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range))

    if use_one_hot_embeddings:
        flat_input_ids: 'b*t*i' = tf.reshape(input_ids, [-1])
        one_hot_input_ids: 'b*t*i,v' = tf.one_hot(flat_input_ids, depth=vocab_size)
        output: 'b*t*i,d' = tf.matmul(one_hot_input_ids, embedding_table)
    else:
        output = tf.nn.embedding_lookup(embedding_table, input_ids)

    # input_shape: 'bti' = get_shape_list(input_ids)
    output: 'btd' = warp(output, tfms=f'b*t*{i},d -> b,t,d*{i}', tfm_names='r')
    return (output, embedding_table)
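# Why the one-hot path above matches a direct lookup: multiplying a one-hot
# matrix by the embedding table selects rows of the table. A minimal numpy
# sketch with assumed toy sizes:
def sketch_one_hot_lookup():
    import numpy as np
    V, D = 10, 4
    table = np.random.randn(V, D)
    ids = np.array([3, 7, 1])
    one_hot = np.eye(V)[ids]                         # [num_ids, V]
    assert np.allclose(one_hot @ table, table[ids])  # one-hot matmul == row lookup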
def warp_long2():
    B, T, D, C = get_dim_vars('b t d c')
    x1: 'btd' = np.ones((B, T, D))

    y = warp(x1, 'btd -> btd1 -> bdt1 -> b,d//2,t*2,1', 'apv')
    assert y.shape == (B, D // 2, T * 2, 1)
    print('warp_long2: all assertions hold')
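# The first step above ('btd -> btd1') inserts a trailing unit axis; the rest
# is a permute and a view. A minimal numpy equivalent with assumed toy sizes:
def sketch_apv_numpy():
    import numpy as np
    B, T, D = 2, 4, 8
    y = np.ones((B, T, D))[..., None]    # 'btd -> btd1' (add unit axis)
    y = y.transpose(0, 2, 1, 3)          # 'btd1 -> bdt1' (permute)
    y = y.reshape(B, D // 2, T * 2, 1)   # 'bdt1 -> b,d//2,t*2,1' (view)
    assert y.shape == (B, D // 2, T * 2, 1)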
def merge_heads(self, x: (B, H, T, D)):  # pylint: disable=no-self-use
    # permute, then contiguous, then view transforms
    res = warp(x, 'bhtd -> bthd -> b,t,h*d', 'pcv')
    return res
def merge_heads2(x: (B, H, T, D)):
    res: (B, T, H * D) = warp(x, 'bhtd -> bthd -> b,t,h*d', 'pcv', debug=False)
    return res
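# merge_heads/merge_heads2 expand to a permute, a contiguous, and a view in
# plain torch. A minimal sketch with assumed toy sizes:
def sketch_merge_heads_torch():
    import torch
    B, H, T, D = 2, 4, 8, 16
    x = torch.randn(B, H, T, D)
    res = x.permute(0, 2, 1, 3).contiguous().view(B, T, H * D)  # 'bhtd -> bthd -> b,t,h*d'
    assert res.shape == (B, T, H * D)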
def transformer_model(input_tensor: 'btd',
                      attention_mask: 'btt' = None,
                      hidden_size: 'd' = 768,
                      num_hidden_layers: 'l' = 12,
                      num_attention_heads: 'h' = 4,
                      intermediate_size: 's' = 3072,
                      intermediate_act_fn=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

    This is almost an exact implementation of the original Transformer encoder.

    See the original paper:
    https://arxiv.org/abs/1706.03762

    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the attention
        probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to also return all layers or just the final
        layer.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size], the final
      hidden layer of the Transformer.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    B, T, D = input_shape
    batch_size, seq_length, input_width = B, T, D

    # The Transformer adds residuals across all layers, so the input width
    # needs to match the hidden size.
    if D != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)"
                         % (D, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    # prev_output: 'b*t,d' = reshape_to_matrix(input_tensor)
    prev_output: 'b*t,d' = warp(input_tensor, 'btd -> b*t,d', 'v')
    size_assert(get_shape_list(prev_output), (B * T, D))

    all_layer_outputs: '(btd)*' = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input: 'b*t,d' = prev_output

            with tf.variable_scope("attention"):
                attention_heads: '(b*t,d)*' = []
                with tf.variable_scope("self"):
                    attention_head: 'b*t,d' = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just
                    # concatenate them to the self-attention head before the
                    # projection.
                    attention_output: 'b*t,d' = tf.concat(attention_heads, axis=-1)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.variable_scope("output"):
                    attention_output: 'b*t,d' = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range))
                    attention_output = dropout(attention_output, hidden_dropout_prob)
                    attention_output = layer_norm(attention_output + layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope("intermediate"):
                intermediate_output: 'b*t,s' = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output: 'b*t,d' = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output: 'b*t,d' = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs: '(btd)*' = []
        for layer_output in all_layer_outputs:
            # final_output: 'btd' = reshape_from_matrix(layer_output, input_shape)
            final_output: 'btd' = warp(layer_output, 'b*t,d -> btd', 'r')
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output: 'btd' = warp(layer_output, 'b*t,d -> btd', 'r')
        return final_output
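# The warp calls at the loop boundaries above are ordinary reshapes between the
# 2D [b*t, d] working layout and the 3D [b, t, d] layout. A minimal numpy
# sketch with assumed toy sizes, showing the round trip is lossless:
def sketch_matrix_round_trip():
    import numpy as np
    B, T, D = 2, 3, 4
    x = np.arange(B * T * D).reshape(B, T, D)
    x2d = x.reshape(B * T, D)    # 'btd -> b*t,d' (reshape_to_matrix)
    x3d = x2d.reshape(B, T, D)   # 'b*t,d -> btd' (reshape_from_matrix)
    assert np.array_equal(x, x3d)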
def attention_layer(from_tensor: 'b*t,d',
                    to_tensor: 'b*t,d',
                    attention_mask: 'b,t,t' = None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

    This is an implementation of multi-headed attention based on "Attention is
    All You Need". If `from_tensor` and `to_tensor` are the same, then this is
    self-attention. Each timestep in `from_tensor` attends to the corresponding
    sequence in `to_tensor`, and returns a fixed-width vector.

    This function first projects `from_tensor` into a "query" tensor and
    `to_tensor` into "key" and "value" tensors. These are (effectively) a list
    of tensors of length `num_attention_heads`, where each tensor is of shape
    [batch_size, seq_length, size_per_head].

    Then, the query and key tensors are dot-producted and scaled. These are
    softmaxed to obtain attention probabilities. The value tensors are then
    interpolated by these probabilities, then concatenated back to a single
    tensor and returned.

    In practice, the multi-headed attention is done with transposes and
    reshapes rather than actual separate tensors.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length]. The values should be 1 or 0. The
        attention scores will effectively be set to -infinity for any positions
        in the mask that are 0, and will be unchanged for positions that are 1.
      num_attention_heads: int. Number of attention heads.
      size_per_head: int. Size of each attention head.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      do_return_2d_tensor: bool. If True, the output will be of shape
        [batch_size * from_seq_length, num_attention_heads * size_per_head].
        If False, the output will be of shape
        [batch_size, from_seq_length, num_attention_heads * size_per_head].
      batch_size: (Optional) int. If the input is 2D, this might be the batch
        size of the 3D version of the `from_tensor` and `to_tensor`.
      from_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `from_tensor`.
      to_seq_length: (Optional) If the input is 2D, this might be the seq
        length of the 3D version of the `to_tensor`.

    Returns:
      float Tensor of shape [batch_size, from_seq_length,
        num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is
        true, this will be of shape
        [batch_size * from_seq_length, num_attention_heads * size_per_head]).

    Raises:
      ValueError: Any of the arguments or tensor shapes are invalid.
""" from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) if len(from_shape) != len(to_shape): raise ValueError( "The rank of `from_tensor` must match the rank of `to_tensor`.") if len(from_shape) == 3: batch_size = from_shape[0] from_seq_length = from_shape[1] to_seq_length = to_shape[1] elif len(from_shape) == 2: if (batch_size is None or from_seq_length is None or to_seq_length is None): raise ValueError( "When passing in rank 2 tensors to attention_layer, the values " "for `batch_size`, `from_seq_length`, and `to_seq_length` " "must all be specified.") # Scalar dimensions referenced here: # B(b) = batch size (number of sequences) # F(f) = `from_tensor` sequence length # T(t) = `to_tensor` sequence length # N(n) = `num_attention_heads` # H(h) = `size_per_head` #from_tensor_2d: 'b*t,d' = reshape_to_matrix(from_tensor) #to_tensor_2d: 'b*t,d' = reshape_to_matrix(to_tensor) from_tensor_2d: 'b*t,d' = from_tensor to_tensor_2d: 'b*t,d' = to_tensor query_layer: 'b*t,d' = tf.layers.dense( from_tensor_2d, num_attention_heads * size_per_head, activation=query_act, name="query", kernel_initializer=create_initializer(initializer_range)) key_layer: 'b*t,d' = tf.layers.dense( to_tensor_2d, num_attention_heads * size_per_head, activation=key_act, name="key", kernel_initializer=create_initializer(initializer_range)) value_layer: 'b*t,d' = tf.layers.dense( to_tensor_2d, num_attention_heads * size_per_head, activation=value_act, name="value", kernel_initializer=create_initializer(initializer_range)) query_layer: 'bnth' = warp(query_layer, 'b*t,d -> btnh -> bnth', 'vp') key_layer: 'bnth' = warp(key_layer, 'b*t,d -> btnh -> bnth', 'vp') # Take the dot product between "query" and "key" to get the raw # attention scores. attention_scores: 'bntt' = tf.matmul(query_layer, key_layer, transpose_b=True) attention_scores: 'bntt' = tf.multiply( attention_scores, 1.0 / math.sqrt(float(size_per_head))) if attention_mask is not None: # `attention_mask` = [B, 1, F, T] #attention_mask = tf.expand_dims(attention_mask, axis=[1]) attention_mask = alignto((attention_mask, 'btt'), 'bntt') # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. adder: 'bntt' = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. attention_scores += adder # Normalize the attention scores to probabilities. # `attention_probs` = [B, N, F, T] attention_probs: 'bntt' = tf.nn.softmax(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs: 'bntt' = dropout(attention_probs, attention_probs_dropout_prob) value_layer: 'bnth' = warp(value_layer, 'b*t,n*h -> btnh -> bnth', 'vp') context_layer: 'bnth' = tf.matmul( attention_probs, value_layer) #bntt,bnth->bnth OR ___t,__t_ if do_return_2d_tensor: context_layer = warp(context_layer, 'bnth->btnh->b*t,n*h', 'pv') else: context_layer = warp(context_layer, 'bnth->btnh->b,t,n*h', 'pv') return context_layer