def __init__(self,
             config,
             is_training,
             input_ids,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=True,
             scope=None,
             embedded_input=None):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. True for training model, False for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.nn.embedding_lookup() for the word embeddings. On the
        TPU, it is much faster if this is True; on the CPU or GPU, it is faster
        if this is False.
      scope: (optional) variable scope. Defaults to "bert".
      embedded_input: (optional) If provided, the embedding lookup here is
        skipped and the given embeddings are fed directly into the
        self-attention layers.

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.compat.v1.variable_scope(scope, default_name="bert"):
        with tf.compat.v1.variable_scope("embeddings"):
            if embedded_input is None:
                # Perform embedding lookup on the word ids.
                (self.embedding_output, self.embedding_table) = embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name="word_embeddings",
                    use_one_hot_embeddings=use_one_hot_embeddings)
            else:
                self.embedding_output = embedded_input

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.compat.v1.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = create_attention_mask_from_input_mask(
                input_ids, input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.compat.v1.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state
            # corresponding to the first token. We assume that this has been
            # pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.compat.v1.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=create_initializer(config.initializer_range))
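# --- Hedged usage sketch (not part of the original module) ---
# A minimal example of how this constructor is typically invoked and how the
# pooled/sequence outputs are read back, assuming TF 1.x graph mode and the
# surrounding module's BertConfig, get_pooled_output(), and
# get_sequence_output(). The placeholder names and the sequence length of 128
# are illustrative, not taken from the source.
def _example_build_bert_encoder():
    config = BertConfig(vocab_size=30522,
                        hidden_size=768,
                        num_hidden_layers=12,
                        num_attention_heads=12,
                        intermediate_size=3072)
    input_ids = tf.compat.v1.placeholder(tf.int32, shape=[None, 128], name='input_ids')
    input_mask = tf.compat.v1.placeholder(tf.int32, shape=[None, 128], name='input_mask')
    model = BertModel(config=config,
                      is_training=False,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      use_one_hot_embeddings=False)
    # [batch_size, hidden_size] summary vector and
    # [batch_size, seq_length, hidden_size] per-token states.
    return model.get_pooled_output(), model.get_sequence_output()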
def dense(inputs: tf.Tensor,
          units: int,
          activation: Optional[str],
          activation_noise: tf.Tensor,
          name: str,
          use_bias: bool,
          dropout_keep_rate: Optional[Union[float, tf.Tensor]] = None) -> Tuple[tf.Tensor, tf.Tensor]:
    """
    Creates a dense, feed-forward layer with the given parameters.

    Args:
        inputs: The input tensor. Has the shape [B, ..., D].
        units: The number of output units. Denoted by K.
        activation: Optional activation function. If None, the activation is linear.
        activation_noise: Noise scale to apply to the final activations.
        name: Name prefix for the created trainable variables.
        use_bias: Whether to add a bias to the output.
        dropout_keep_rate: Optional dropout keep rate to apply to the activations.
    Returns:
        A tuple of 2 elements: (1) the transformed inputs in a [B, ..., K] tensor
        and (2) the transformed inputs without the activation function applied.
        The second entry is included for debugging purposes.
    """
    # Get the size of the input features, denoted by D
    input_units = inputs.get_shape()[-1]

    # Create the weight matrix, [D, K]
    W = tf.compat.v1.get_variable(name='{0}-kernel'.format(name),
                                  shape=[input_units, units],
                                  initializer=tf.compat.v1.initializers.glorot_uniform(),
                                  trainable=True)

    # Apply the given weights
    transformed = tf.matmul(inputs, W)  # [B, ..., K]

    # Add the bias if specified
    if use_bias:
        # Bias vector of shape [1, K], broadcast over the leading dimensions
        b = tf.compat.v1.get_variable(name='{0}-bias'.format(name),
                                      shape=[1, units],
                                      initializer=tf.compat.v1.initializers.random_uniform(minval=-0.7, maxval=0.7),
                                      trainable=True)
        transformed = transformed + b

    pre_activation = transformed

    # Apply the activation function if specified
    activation_fn = get_activation(activation)
    if activation_fn is not None:
        transformed = activation_fn(transformed)

    # Apply noise regularization
    transformed = apply_noise(transformed, scale=activation_noise)

    # Apply dropout if specified
    if dropout_keep_rate is not None:
        transformed = tf.nn.dropout(transformed, rate=1.0 - dropout_keep_rate)

    return transformed, pre_activation
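# --- Hedged usage sketch (not part of the original module) ---
# Minimal example of calling `dense` above; the unit count, layer name, and
# keep rate are illustrative. It assumes the module's get_activation() accepts
# 'relu' and that apply_noise() with scale 0.0 is a no-op.
def _example_dense_layer(inputs: tf.Tensor) -> tf.Tensor:
    # `inputs` is assumed to be a [B, D] float tensor.
    hidden, _ = dense(inputs=inputs,
                      units=64,
                      activation='relu',
                      activation_noise=tf.constant(0.0),  # no activation noise
                      name='hidden-layer',
                      use_bias=True,
                      dropout_keep_rate=0.9)
    return hidden  # [B, 64]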
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=get_activation('gelu'),
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from "Attention is All You Need".

    This is almost an exact implementation of the original Transformer encoder.

    See the original paper:
    https://arxiv.org/abs/1706.03762

    Also see:
    https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads in the Transformer.
      intermediate_size: int. The size of the "intermediate" (a.k.a., feed
        forward) layer.
      intermediate_act_fn: function. The non-linear activation function to apply
        to the output of the intermediate/feed-forward layer.
      hidden_dropout_prob: float. Dropout probability for the hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the attention
        probabilities.
      initializer_range: float. Range of the initializer (stddev of truncated
        normal).
      do_return_all_layers: Whether to also return all layers or just the final
        layer.

    Returns:
      float Tensor of shape [batch_size, seq_length, hidden_size], the final
      hidden layer of the Transformer.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    # The Transformer performs sum residuals on all layers so the input needs
    # to be the same as the hidden size.
    if input_width != hidden_size:
        raise ValueError("The width of the input tensor (%d) != hidden size (%d)" %
                         (input_width, hidden_size))

    # We keep the representation as a 2D tensor to avoid re-shaping it back and
    # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on
    # the GPU/CPU but may not be free on the TPU, so we want to minimize them to
    # help the optimizer.
    prev_output = reshape_to_matrix(input_tensor)

    all_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output

            with tf.compat.v1.variable_scope("attention"):
                attention_heads = []
                with tf.compat.v1.variable_scope("self"):
                    attention_head = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    # In the case where we have other sequences, we just concatenate
                    # them to the self-attention head before the projection.
                    attention_output = tf.concat(attention_heads, axis=-1)

                # Run a linear projection of `hidden_size` then add a residual
                # with `layer_input`.
                with tf.compat.v1.variable_scope("output"):
                    attention_output = tf.compat.v1.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=create_initializer(initializer_range))
                    attention_output = dropout(attention_output, hidden_dropout_prob)
                    attention_output = layer_norm(attention_output + layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.compat.v1.variable_scope("intermediate"):
                intermediate_output = tf.compat.v1.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=create_initializer(initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.compat.v1.variable_scope("output"):
                layer_output = tf.compat.v1.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=create_initializer(initializer_range))
                layer_output = dropout(layer_output, hidden_dropout_prob)
                layer_output = layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = reshape_from_matrix(prev_output, input_shape)
        return final_output
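# --- Hedged usage sketch (not part of the original module) ---
# Example of running the encoder stack directly on pre-computed embeddings.
# It assumes `embedding_output` is a [batch_size, seq_length, 256] float tensor
# (the last dimension must match hidden_size) and that `attention_mask` was
# built with create_attention_mask_from_input_mask. All sizes are illustrative.
def _example_transformer_encoder(embedding_output, attention_mask):
    all_layers = transformer_model(input_tensor=embedding_output,
                                   attention_mask=attention_mask,
                                   hidden_size=256,
                                   num_hidden_layers=4,
                                   num_attention_heads=8,
                                   intermediate_size=1024,
                                   do_return_all_layers=True)
    return all_layers[-1]  # [batch_size, seq_length, 256]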
def conv_1d(inputs: tf.Tensor,
            filter_width: int,
            stride: int,
            activation: Optional[str],
            activation_noise: float,
            dropout_keep_rate: tf.Tensor,
            use_dropout: bool,
            name: str) -> tf.Tensor:
    """
    Performs a 1d convolution over the given inputs.

    Args:
        inputs: A [B, T, D] tensor of features (D) for each seq element (T)
            and batch sample (B).
        filter_width: The width of the convolution filter. Must be at least one.
        stride: The convolution stride. Must be at least one.
        activation: The name of the activation function. If None, then we apply
            a linear activation.
        activation_noise: The noise to apply to the final activations.
        dropout_keep_rate: The dropout keep rate to apply to the transformed
            representation.
        use_dropout: Whether to apply dropout.
        name: The name of this layer.
    Returns:
        A [B, T, D] tensor that is the result of applying the 1d convolution
        filter to the inputs.
    """
    assert filter_width >= 1, 'Must have a filter width of at least one. Got: {0}'.format(filter_width)
    assert stride >= 1, 'Must have a stride length of at least one. Got: {0}'.format(stride)

    with tf.variable_scope(name):
        # Create the (trainable) convolution filter, [W, D, D]
        num_features = inputs.get_shape()[-1]  # D
        conv_filter = tf.get_variable(shape=[filter_width, num_features, num_features],
                                      initializer=tf.glorot_uniform_initializer(),
                                      name='filter',
                                      dtype=tf.float32)

        # Create the (trainable) bias, [1, 1, D]
        bias = tf.get_variable(shape=[1, 1, num_features],
                               initializer=tf.random_uniform_initializer(minval=-0.7, maxval=0.7),
                               name='bias',
                               dtype=tf.float32)

        # Apply the convolution filter, [B, T, D]
        transformed = tf.nn.conv1d(value=inputs,
                                   filters=conv_filter,
                                   stride=stride,
                                   padding='SAME',
                                   data_format='NWC')
        transformed = transformed + bias  # [B, T, D]

        # Apply the activation function, [B, T, D]
        activation_fn = get_activation(activation)
        if activation_fn is not None:
            transformed = activation_fn(transformed)

        # Apply the activation noise
        transformed = apply_noise(transformed, scale=activation_noise)

        # Apply dropout if specified, [B, T, D]
        if use_dropout:
            transformed = tf.nn.dropout(transformed, keep_prob=dropout_keep_rate)

        return transformed
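# --- Hedged usage sketch (not part of the original module) ---
# Example of stacking two of the 1d convolutions above on a [B, T, D] input.
# The keep-rate placeholder, layer names, and filter width are illustrative;
# get_activation() is assumed to accept 'relu'.
def _example_conv_stack(inputs: tf.Tensor) -> tf.Tensor:
    keep_rate = tf.placeholder_with_default(1.0, shape=[], name='conv_keep_rate')
    hidden = conv_1d(inputs=inputs,
                     filter_width=3,
                     stride=1,
                     activation='relu',
                     activation_noise=0.0,
                     dropout_keep_rate=keep_rate,
                     use_dropout=True,
                     name='conv-layer-0')
    return conv_1d(inputs=hidden,
                   filter_width=3,
                   stride=1,
                   activation='relu',
                   activation_noise=0.0,
                   dropout_keep_rate=keep_rate,
                   use_dropout=False,
                   name='conv-layer-1')  # [B, T, D]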
def _complex_model(self, is_train: bool = False) -> tf.Tensor:
    models = ['nbow', 'rnn']  # nbow, cnn, rnn, bert
    attention = False
    embeddings = list()

    with tf.variable_scope("tree_encoder"):
        self._make_placeholders()

        self.placeholders['tokens_lengths'] = \
            tf.placeholder(tf.int32, shape=[None], name='tokens_lengths')
        self.placeholders['rnn_dropout_keep_rate'] = \
            tf.placeholder(tf.float32, shape=[], name='rnn_dropout_keep_rate')
        self.placeholders['rnn_recurrent_dropout_keep_rate'] = \
            tf.placeholder(tf.float32, shape=[], name='rnn_recurrent_dropout_keep_rate')

        common_flag = True
        if 'nbow' in models and 'rnn' in models:
            seq_tokens = self.placeholders['tokens']
            seq_tokens_embeddings = self.embedding_layer(seq_tokens)
            common_flag = False

        if 'nbow' in models:
            if common_flag:
                seq_tokens_embeddings = self.embedding_layer(self.placeholders['tokens'])
            seq_token_mask = self.placeholders['tokens_mask']
            seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1)  # B
            embedding = pool_sequence_embedding(
                self.get_hyper('nbow_pool_mode').lower(),
                sequence_token_embeddings=seq_tokens_embeddings,
                sequence_lengths=seq_token_lengths,
                sequence_token_masks=seq_token_mask)
            embeddings.append(embedding)

        if 'cnn' in models:
            if common_flag:
                seq_tokens_embeddings = self.embedding_layer(self.placeholders['tokens'])
            seq_tokens_embeddings = self.__add_position_encoding(seq_tokens_embeddings)
            activation_fun = get_activation(self.get_hyper('1dcnn_activation'))
            current_embeddings = seq_tokens_embeddings
            num_filters_and_width = zip(self.get_hyper('1dcnn_layer_list'),
                                        self.get_hyper('1dcnn_kernel_width'))
            for (layer_idx, (num_filters, kernel_width)) in enumerate(num_filters_and_width):
                next_embeddings = tf.layers.conv1d(inputs=current_embeddings,
                                                   filters=num_filters,
                                                   kernel_size=kernel_width,
                                                   padding="same")
                # Add residual connections past the first layer.
                if self.get_hyper('1dcnn_add_residual_connections') and layer_idx > 0:
                    next_embeddings += current_embeddings
                current_embeddings = activation_fun(next_embeddings)
                current_embeddings = tf.nn.dropout(
                    current_embeddings,
                    keep_prob=self.placeholders['dropout_keep_rate'])
            seq_token_mask = self.placeholders['tokens_mask']
            seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1)  # B
            embedding = pool_sequence_embedding(
                self.get_hyper('1dcnn_pool_mode').lower(),
                sequence_token_embeddings=current_embeddings,
                sequence_lengths=seq_token_lengths,
                sequence_token_masks=seq_token_mask)
            embeddings.append(embedding)

        if 'rnn' in models:
            if common_flag:
                seq_tokens = self.placeholders['tokens']
                seq_tokens_embeddings = self.embedding_layer(seq_tokens)
            seq_tokens_lengths = self.placeholders['tokens_lengths']
            rnn_final_state, token_embeddings = self._encode_with_rnn(
                seq_tokens_embeddings, seq_tokens_lengths)
            output_pool_mode = self.get_hyper('rnn_pool_mode').lower()
            if output_pool_mode == 'rnn_final':
                embedding = rnn_final_state
            else:
                token_mask = tf.expand_dims(tf.range(tf.shape(seq_tokens)[1]), axis=0)  # 1 x T
                token_mask = tf.tile(token_mask,
                                     multiples=(tf.shape(seq_tokens_lengths)[0], 1))  # B x T
                token_mask = tf.cast(token_mask < tf.expand_dims(seq_tokens_lengths, axis=-1),
                                     dtype=tf.float32)  # B x T
                embedding = pool_sequence_embedding(
                    output_pool_mode,
                    sequence_token_embeddings=token_embeddings,
                    sequence_lengths=seq_tokens_lengths,
                    sequence_token_masks=token_mask)
            embeddings.append(embedding)

        if 'bert' in models:
            config = BertConfig(
                vocab_size=self.get_hyper('token_vocab_size'),
                hidden_size=self.get_hyper('self_attention_hidden_size'),
                num_hidden_layers=self.get_hyper('self_attention_num_layers'),
                num_attention_heads=self.get_hyper('self_attention_num_heads'),
                intermediate_size=self.get_hyper('self_attention_intermediate_size'))
            model = BertModel(config=config,
                              is_training=is_train,
                              input_ids=self.placeholders['tokens'],
                              input_mask=self.placeholders['tokens_mask'],
                              use_one_hot_embeddings=False)
            output_pool_mode = self.get_hyper('self_attention_pool_mode').lower()
            if output_pool_mode == 'bert':
                embedding = model.get_pooled_output()
            else:
                seq_token_embeddings = model.get_sequence_output()
                seq_token_masks = self.placeholders['tokens_mask']
                seq_token_lengths = tf.reduce_sum(seq_token_masks, axis=1)  # B
                embedding = pool_sequence_embedding(
                    output_pool_mode,
                    sequence_token_embeddings=seq_token_embeddings,
                    sequence_lengths=seq_token_lengths,
                    sequence_token_masks=seq_token_masks)
            embeddings.append(embedding)

        embeddings = tf.concat(embeddings, axis=-1)
        if attention:
            embeddings = Common.self_attention_layer(embeddings)

        # "concat one-hot" is equal to "accumulate embedding":
        # [v1^T, v2^T, v3^T] * W = [v1^T, v2^T, v3^T] * [w1, w2, w3]^T
        #                        = v1^T*w1 + v2^T*w2 + v3^T*w3
        print('*@' * 16)
        print(embeddings)
        print(tf.shape(embeddings))
        return tf.reduce_sum(embeddings, axis=0)
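# --- Hedged illustration (not part of the original module) ---
# The comment at the end of _complex_model claims that concatenating the
# per-model embeddings and applying one weight matrix is the same as projecting
# each embedding separately and summing the results. A tiny numpy check of that
# identity; all shapes are illustrative.
def _check_concat_equals_sum():
    import numpy as np
    rng = np.random.default_rng(0)
    v1, v2, v3 = rng.normal(size=(3, 4))      # three 4-dim embeddings
    w1, w2, w3 = rng.normal(size=(3, 4, 2))   # per-model projections to 2 dims
    concat = np.concatenate([v1, v2, v3]) @ np.concatenate([w1, w2, w3], axis=0)
    summed = v1 @ w1 + v2 @ w2 + v3 @ w3
    assert np.allclose(concat, summed)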
def _single_model(self, is_train: bool = False) -> tf.Tensor:
    model = 'nbow'  # nbow, cnn, rnn, bert
    attention = False
    embedding = None

    with tf.variable_scope("tree_encoder"):
        self._make_placeholders()

        self.placeholders['tokens_lengths'] = \
            tf.placeholder(tf.int32, shape=[None], name='tokens_lengths')
        self.placeholders['rnn_dropout_keep_rate'] = \
            tf.placeholder(tf.float32, shape=[], name='rnn_dropout_keep_rate')
        self.placeholders['rnn_recurrent_dropout_keep_rate'] = \
            tf.placeholder(tf.float32, shape=[], name='rnn_recurrent_dropout_keep_rate')

        if model == 'nbow':
            seq_tokens_embeddings = self.embedding_layer(self.placeholders['tokens'])
            seq_token_mask = self.placeholders['tokens_mask']
            seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1)  # B
            if attention:
                embedding = Common.yet_attention_layer(seq_tokens_embeddings)
            else:
                embedding = pool_sequence_embedding(
                    self.get_hyper('nbow_pool_mode').lower(),
                    sequence_token_embeddings=seq_tokens_embeddings,
                    sequence_lengths=seq_token_lengths,
                    sequence_token_masks=seq_token_mask)
        elif model == 'cnn':
            seq_tokens_embeddings = self.embedding_layer(self.placeholders['tokens'])
            seq_tokens_embeddings = self.__add_position_encoding(seq_tokens_embeddings)
            activation_fun = get_activation(self.get_hyper('1dcnn_activation'))
            current_embeddings = seq_tokens_embeddings
            num_filters_and_width = zip(self.get_hyper('1dcnn_layer_list'),
                                        self.get_hyper('1dcnn_kernel_width'))
            for (layer_idx, (num_filters, kernel_width)) in enumerate(num_filters_and_width):
                next_embeddings = tf.layers.conv1d(inputs=current_embeddings,
                                                   filters=num_filters,
                                                   kernel_size=kernel_width,
                                                   padding="same")
                # Add residual connections past the first layer.
                if self.get_hyper('1dcnn_add_residual_connections') and layer_idx > 0:
                    next_embeddings += current_embeddings
                current_embeddings = activation_fun(next_embeddings)
                current_embeddings = tf.nn.dropout(
                    current_embeddings,
                    keep_prob=self.placeholders['dropout_keep_rate'])
            if attention:
                embedding = Common.yet_attention_layer(current_embeddings)
            else:
                seq_token_mask = self.placeholders['tokens_mask']
                seq_token_lengths = tf.reduce_sum(seq_token_mask, axis=1)  # B
                embedding = pool_sequence_embedding(
                    self.get_hyper('1dcnn_pool_mode').lower(),
                    sequence_token_embeddings=current_embeddings,
                    sequence_lengths=seq_token_lengths,
                    sequence_token_masks=seq_token_mask)
        elif model == 'rnn':
            seq_tokens = self.placeholders['tokens']
            seq_tokens_embeddings = self.embedding_layer(seq_tokens)
            seq_tokens_lengths = self.placeholders['tokens_lengths']
            rnn_final_state, token_embeddings = self._encode_with_rnn(
                seq_tokens_embeddings, seq_tokens_lengths)
            output_pool_mode = self.get_hyper('rnn_pool_mode').lower()
            if output_pool_mode == 'rnn_final':
                embedding = rnn_final_state
            else:
                if attention:
                    embedding = Common.yet_attention_layer(token_embeddings)
                else:
                    token_mask = tf.expand_dims(tf.range(tf.shape(seq_tokens)[1]), axis=0)  # 1 x T
                    token_mask = tf.tile(token_mask,
                                         multiples=(tf.shape(seq_tokens_lengths)[0], 1))  # B x T
                    token_mask = tf.cast(token_mask < tf.expand_dims(seq_tokens_lengths, axis=-1),
                                         dtype=tf.float32)  # B x T
                    embedding = pool_sequence_embedding(
                        output_pool_mode,
                        sequence_token_embeddings=token_embeddings,
                        sequence_lengths=seq_tokens_lengths,
                        sequence_token_masks=token_mask)
        elif model == 'bert':
            config = BertConfig(
                vocab_size=self.get_hyper('token_vocab_size'),
                hidden_size=self.get_hyper('self_attention_hidden_size'),
                num_hidden_layers=self.get_hyper('self_attention_num_layers'),
                num_attention_heads=self.get_hyper('self_attention_num_heads'),
                intermediate_size=self.get_hyper('self_attention_intermediate_size'))
            model = BertModel(config=config,
                              is_training=is_train,
                              input_ids=self.placeholders['tokens'],
                              input_mask=self.placeholders['tokens_mask'],
                              use_one_hot_embeddings=False)
            output_pool_mode = self.get_hyper('self_attention_pool_mode').lower()
            if output_pool_mode == 'bert':
                embedding = model.get_pooled_output()
            else:
                seq_token_embeddings = model.get_sequence_output()
                # Only when the pooled output is not used do we consider attention.
                if attention:
                    embedding = Common.yet_attention_layer(seq_token_embeddings)
                else:
                    seq_token_masks = self.placeholders['tokens_mask']
                    seq_token_lengths = tf.reduce_sum(seq_token_masks, axis=1)  # B
                    embedding = pool_sequence_embedding(
                        output_pool_mode,
                        sequence_token_embeddings=seq_token_embeddings,
                        sequence_lengths=seq_token_lengths,
                        sequence_token_masks=seq_token_masks)
        else:
            raise ValueError('Undefined Config')

        return embedding