def __init__(self,
             xlnet_config,
             is_training,
             input_ids,
             seg_ids,
             input_mask,
             mems,
             perm_mask,
             target,
             target_mask,
             target_mapping,
             inp_q,
             sample_weight=None,
             **kwargs):
    '''Builds an XLNet encoder and the permutation-LM (`PLM`) loss on top.

    The encoder output is passed to `lm_loss`, which yields per-position
    losses and predictions. Results are published through
    `self.total_loss`, `self.losses['PLM']`, `self.preds['PLM']` and
    `self.preds['PLM_mask']`.

    Args:
      xlnet_config: config object; `n_token` and `d_model` are read here.
      is_training: bool. Forwarded to the encoder; also selects a dropout
        rate of 0.1 (training) or 0.0 for the run config.
      input_ids, seg_ids, input_mask, mems, perm_mask, target_mapping,
        inp_q: forwarded unchanged to `XLNetEncoder`.
      target: prediction targets handed to `lm_loss`.
      target_mask: multiplied into the loss; the total loss is the masked
        sum divided by the sum of this mask.
      sample_weight: optional per-example weight. Cast to float32 and
        expanded on the last axis before scaling the loss.
      **kwargs: forwarded to `XLNetEncoder`.
    '''
    super().__init__()

    drop_rate = 0.1 if is_training else 0.0
    # Only `bi_data` and `use_tpu` of this run config are read below.
    run_config = XLNetRunConfig(
        is_training=is_training,
        bi_data=True,
        use_tpu=False,
        use_bfloat16=False,
        dropout=drop_rate,
        dropatt=drop_rate,
        init='normal',
        init_range=0.1,
        init_std=0.02,
        clamp_len=-1)

    encoder = XLNetEncoder(
        xlnet_config=xlnet_config,
        is_training=is_training,
        input_ids=input_ids,
        seg_ids=seg_ids,
        input_mask=input_mask,
        mems=mems,
        perm_mask=perm_mask,
        target_mapping=target_mapping,
        inp_q=inp_q,
        **kwargs)

    with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
        # The output projection is tied to the embedding table.
        loss_per_position, predictions = lm_loss(
            hidden=encoder.get_sequence_output(),
            target=target,
            n_token=xlnet_config.n_token,
            d_model=xlnet_config.d_model,
            initializer=encoder.get_initializer(),
            lookup_table=encoder.get_embedding_table(),
            tie_weight=True,
            bi_data=run_config.bi_data,
            use_tpu=run_config.use_tpu)

        if sample_weight is not None:
            weight = tf.cast(sample_weight, dtype=tf.float32)
            sample_weight = tf.expand_dims(weight, axis=-1)
            loss_per_position *= sample_weight

    # Average only over the positions selected by `target_mask`.
    masked_total = tf.reduce_sum(loss_per_position * target_mask)
    self.total_loss = masked_total / tf.reduce_sum(target_mask)
    self.losses['PLM'] = loss_per_position * target_mask
    self.preds['PLM'] = predictions
    self.preds['PLM_mask'] = target_mask
def __init__(self,
             bert_config,
             is_training,
             sketchy_encoder,
             intensive_encoder,
             query_mask,
             label_ids,
             has_answer,
             sample_weight=None,
             scope='retro_reader',
             matching_mechanism='cross-attention',
             beta_1=0.5,
             beta_2=0.5,
             threshold=1.0,
             trainable=True,
             **kwargs):
    '''Builds the sketchy + intensive reading heads and the rear verifier.

    The sketchy module classifies answerability from the pooled output of
    `sketchy_encoder`. The intensive module matches the sequence output of
    `intensive_encoder` against its query-masked copy and predicts start
    and end positions. Both scores are blended into a verifier score.

    Outputs are stored in `self.losses`, `self.probs`, `self.preds` and
    `self.total_loss` (sketchy loss + intensive loss).

    NOTE(review): the variable-scope nesting below was reconstructed from
    flattened source; confirm variable names against existing checkpoints.

    Args:
      bert_config: config object; `initializer_range`,
        `hidden_dropout_prob` and `num_attention_heads` are read.
      is_training: bool. Dropout is only applied when True.
      sketchy_encoder: encoder exposing `get_pooled_output()`.
      intensive_encoder: encoder exposing `get_sequence_output()`.
      query_mask: mask multiplied into the sequence output to obtain the
        query-only representation.
      label_ids: column 0 holds start positions, column 1 end positions.
      has_answer: binary answerability labels for the sketchy module.
      sample_weight: optional per-example weight; cast to float32 before
        scaling either loss.
      scope: outer variable scope name.
      matching_mechanism: 'cross-attention' or 'matching-attention'.
      beta_1: weight of the intensive score difference in the verifier.
      beta_2: weight of the sketchy external score in the verifier.
      threshold: verifier scores above this value predict 1.
      trainable: forwarded to every variable-creating call.
      **kwargs: forwarded to the parent constructor.

    Raises:
      ValueError: if `matching_mechanism` is not a supported value.
    '''
    super().__init__(**kwargs)

    # Fail early with a clear message instead of an unbound `H_prime`.
    if matching_mechanism not in ('cross-attention', 'matching-attention'):
        raise ValueError(
            'Invalid value of `matching_mechanism`: %s. Pick one from '
            '`cross-attention` and `matching-attention`.'
            % matching_mechanism)

    # Dropout must be disabled at inference for every layer built here.
    dropout_prob = bert_config.hidden_dropout_prob if is_training else 0.0

    # verifier
    with tf.variable_scope(scope):

        # sketchy reading module
        with tf.variable_scope('sketchy/prediction'):
            sketchy_output = sketchy_encoder.get_pooled_output()
            hidden_size = sketchy_output.shape.as_list()[-1]

            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, hidden_size],
                initializer=util.create_initializer(
                    bert_config.initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[2],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

            output_layer = util.dropout(sketchy_output, dropout_prob)
            logits = tf.matmul(
                output_layer, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)

            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(
                has_answer, depth=2, dtype=tf.float32)
            per_example_loss = - tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)
            if sample_weight is not None:
                per_example_loss = tf.cast(
                    sample_weight, dtype=tf.float32) * per_example_loss

            self.losses['sketchy_losses'] = per_example_loss
            sketchy_loss = tf.reduce_mean(per_example_loss)

            # External score: logit margin of class 1 over class 0.
            score_ext = logits[:, 1] - logits[:, 0]

        # intensive reading module
        with tf.variable_scope('intensive'):
            H = intensive_encoder.get_sequence_output()
            H_Q = H * tf.cast(
                tf.expand_dims(query_mask, axis=-1), tf.float32)
            (batch_size, max_seq_length, hidden_size) = \
                util.get_shape_list(H)

            # cross-attention
            if matching_mechanism == 'cross-attention':
                with tf.variable_scope('cross_attention'):
                    attention_mask = \
                        self.create_attention_mask_from_input_mask(
                            query_mask, batch_size, max_seq_length)
                    (H_prime, _) = self.attention_layer(
                        from_tensor=H,
                        to_tensor=H_Q,
                        attention_mask=attention_mask,
                        num_attention_heads=\
                            bert_config.num_attention_heads,
                        size_per_head=\
                            hidden_size // bert_config.num_attention_heads,
                        # Fixed: previously applied at inference as well.
                        attention_probs_dropout_prob=dropout_prob,
                        initializer_range=bert_config.initializer_range,
                        do_return_2d_tensor=False,
                        batch_size=batch_size,
                        from_max_seq_length=max_seq_length,
                        to_max_seq_length=max_seq_length,
                        trainable=trainable)

            # matching-attention
            else:
                with tf.variable_scope('matching_attention'):
                    output_weights = tf.get_variable(
                        'output_weights',
                        shape=[hidden_size, hidden_size],
                        initializer=util.create_initializer(
                            bert_config.initializer_range),
                        trainable=trainable)
                    output_bias = tf.get_variable(
                        'output_bias',
                        shape=[hidden_size],
                        initializer=tf.zeros_initializer(),
                        trainable=trainable)
                    # Project the query representation, then attend from
                    # the full sequence onto it.
                    trans = tf.matmul(
                        H_Q,
                        tf.tile(
                            tf.expand_dims(output_weights, axis=0),
                            [batch_size, 1, 1]),
                        transpose_b=True)
                    trans = tf.nn.bias_add(trans, output_bias)
                    M = tf.nn.softmax(
                        tf.matmul(H, trans, transpose_b=True), axis=-1)
                    H_prime = tf.matmul(M, H_Q)

            with tf.variable_scope('prediction'):
                output_weights = tf.get_variable(
                    'output_weights',
                    shape=[2, hidden_size],
                    initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=trainable)
                output_bias = tf.get_variable(
                    'output_bias',
                    shape=[2],
                    initializer=tf.zeros_initializer(),
                    trainable=trainable)

                output_layer = util.dropout(H_prime, dropout_prob)
                output_layer = tf.reshape(
                    output_layer,
                    [batch_size * max_seq_length, hidden_size])
                logits = tf.matmul(
                    output_layer, output_weights, transpose_b=True)
                logits = tf.nn.bias_add(logits, output_bias)
                logits = tf.reshape(
                    logits, [batch_size, max_seq_length, 2])
                # -> [batch_size, 2, max_seq_length]: row 0 is the start
                # distribution, row 1 the end distribution.
                logits = tf.transpose(logits, [0, 2, 1])
                probs = tf.nn.softmax(logits, axis=-1, name='probs')

                self.probs['mrc_probs'] = probs
                self.preds['mrc_preds'] = tf.argmax(logits, axis=-1)

                start_one_hot_labels = tf.one_hot(
                    label_ids[:, 0], depth=max_seq_length,
                    dtype=tf.float32)
                end_one_hot_labels = tf.one_hot(
                    label_ids[:, 1], depth=max_seq_length,
                    dtype=tf.float32)
                start_log_probs = tf.nn.log_softmax(
                    logits[:, 0, :], axis=-1)
                end_log_probs = tf.nn.log_softmax(
                    logits[:, 1, :], axis=-1)
                per_example_loss = (
                    - 0.5 * tf.reduce_sum(
                        start_one_hot_labels * start_log_probs, axis=-1)
                    - 0.5 * tf.reduce_sum(
                        end_one_hot_labels * end_log_probs, axis=-1))
                if sample_weight is not None:
                    # Fixed: cast like the sketchy loss so that weights
                    # of another dtype do not break the multiplication.
                    per_example_loss *= tf.cast(
                        sample_weight, dtype=tf.float32)

                intensive_loss = tf.reduce_mean(per_example_loss)
                self.losses['intensive_losses'] = per_example_loss

                # Best start+end probability over positions 1.. versus
                # the start+end probability at position 0.
                score_has = tf.norm(
                    probs[:, 0, 1:] + probs[:, 1, 1:], np.inf, axis=-1)
                score_null = probs[:, 0, 0] + probs[:, 1, 0]
                score_diff = score_has - score_null

        # rear verification
        v = beta_1 * score_diff + beta_2 * score_ext
        self.preds['verifier_preds'] = \
            tf.cast(tf.greater(v, threshold), tf.int32)
        self.probs['verifier_probs'] = v

        self.total_loss = sketchy_loss + intensive_loss
def __init__(self,
             vocab_size,
             is_training,
             input_ids,
             input_mask,
             segment_ids,
             sample_weight=None,
             reduced_size=64,
             topic_size=1024,
             hidden_size=768,
             num_hidden_layers=12,
             num_attention_heads=12,
             bias=0,
             scope='vae',
             trainable=True,
             **kwargs):
    '''Builds a transformer-based variational auto-encoder over tokens.

    Pipeline: embeddings -> transformer encoder -> per-token projection to
    `reduced_size` -> latent `miu` / `sigma` of size `topic_size` ->
    reparameterised sample -> decoder MLP -> vocabulary logits through the
    (tied) embedding table. The loss is the token reconstruction
    cross-entropy plus `mean(miu^2)` and `mean(exp(sigma) - sigma - 1)`.

    Outputs are stored in `self.probs['miu']`, `self.probs['sigma']`,
    `self.preds['preds']`, `self.losses['losses']` and `self.total_loss`.

    NOTE(review): the variable-scope nesting below was reconstructed from
    flattened source; confirm variable names against existing checkpoints.

    Args:
      vocab_size: vocabulary size handed to `Config`.
      is_training: bool. When False both dropout rates of the config are
        zeroed and the latent noise is drawn uniformly from
        `[-bias, bias]` instead of a standard normal.
      input_ids: token ids of rank 2; also used as reconstruction labels.
      input_mask: mask summed over the last axis to get input lengths.
      segment_ids: token type ids for the embedding post-processor.
      sample_weight: optional per-example weight; cast to float32 and
        broadcast over the sequence axis.
      reduced_size: per-token width of the bottleneck projection.
      topic_size: width of the latent vectors.
      hidden_size, num_hidden_layers, num_attention_heads: transformer
        hyper-parameters handed to `Config`.
      bias: half-width of the uniform noise used outside training.
      scope: outer variable scope name.
      trainable: forwarded to every variable-creating call.
      **kwargs: only `use_tilda_embedding` is read.
    '''
    super().__init__()

    config = Config(vocab_size,
                    hidden_size=hidden_size,
                    num_hidden_layers=num_hidden_layers,
                    num_attention_heads=num_attention_heads)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    use_tilda_embedding = kwargs.get('use_tilda_embedding')
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):

            (self.embedding_output, self.embedding_table) = \
                self.embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=config.vocab_size,
                    batch_size=batch_size,
                    max_seq_length=seq_length,
                    embedding_size=config.hidden_size,
                    initializer_range=config.initializer_range,
                    word_embedding_name='word_embeddings',
                    tilda_embeddings=tilda_embeddings,
                    trainable=trainable)
            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                hidden_size=config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                trainable=trainable)

        with tf.variable_scope('encoder'):

            # stacked transformer
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, seq_length)
            self.all_encoder_layers = self.transformer_model(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=seq_length,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=util.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=\
                    config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                trainable=trainable)

            # projection
            with tf.variable_scope('projection'):
                transformer_output = tf.layers.dense(
                    self.all_encoder_layers[-1], reduced_size,
                    activation=util.gelu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    trainable=trainable)
                # Flatten to [batch_size, seq_length * reduced_size].
                transformer_output = tf.reshape(transformer_output,
                                                [batch_size, -1])

                # Zero the flattened slots that belong to padded tokens
                # and rescale the rest by seq_length / input_length.
                input_length = tf.reduce_sum(input_mask, axis=-1)
                input_length = tf.cast(input_length, tf.float32)
                input_length_1d = tf.reshape(input_length, [batch_size])
                input_length_2d = tf.reshape(input_length, [batch_size, 1])

                broadcast_mask = tf.sequence_mask(
                    tf.multiply(input_length_1d, reduced_size),
                    seq_length * reduced_size,
                    dtype=tf.float32)
                broadcast_mask = tf.multiply(
                    broadcast_mask, seq_length / input_length_2d)
                transformer_output *= broadcast_mask

                # latent space
                miu = tf.layers.dense(
                    transformer_output, topic_size,
                    activation='tanh',
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    name='miu',
                    trainable=trainable)
                sigma = tf.layers.dense(
                    transformer_output, topic_size,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    name='sigma',
                    trainable=trainable)
                self.probs['miu'] = miu
                self.probs['sigma'] = sigma

        with tf.variable_scope('decoder'):
            with tf.variable_scope('projection'):

                # reparametarization
                if is_training:
                    noise = tf.random_normal([batch_size, topic_size])
                else:
                    noise = tf.random_uniform(
                        [batch_size, topic_size],
                        minval=-bias, maxval=bias)
                decoder_input = miu + tf.exp(sigma) * noise

                # projection
                decoder_input = tf.layers.dense(
                    decoder_input, seq_length * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=tf.truncated_normal_initializer(
                        stddev=config.initializer_range),
                    trainable=trainable)
                intermediate_input = tf.reshape(
                    decoder_input, [-1, seq_length, reduced_size])
                intermediate_input = util.layer_norm(
                    intermediate_input, trainable=trainable)
                intermediate_input = util.dropout(
                    intermediate_input, config.hidden_dropout_prob)

            # MLP
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    intermediate_input, 4 * reduced_size,
                    activation=util.gelu,
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
            with tf.variable_scope('output'):
                decoder_output = tf.layers.dense(
                    intermediate_output, config.hidden_size,
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
                decoder_output = util.layer_norm(
                    decoder_output, trainable=trainable)
                decoder_output = util.dropout(
                    decoder_output, config.hidden_dropout_prob)
            # The original assigned [intermediate_output, decoder_output]
            # first and overwrote it on the next statement; only the
            # effective value is kept.
            self.all_decoder_layers = [decoder_output]

    # reconstruction
    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                decoder_output,
                units=config.hidden_size,
                activation=util.get_activation(config.hidden_act),
                kernel_initializer=util.create_initializer(
                    config.initializer_range),
                trainable=trainable)
            input_tensor = util.layer_norm(input_tensor,
                                           trainable=trainable)
        # Output projection reuses the input embedding table.
        output_weights = self.embedding_table
        output_bias = tf.get_variable(
            'output_bias',
            shape=[config.vocab_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)
        flatten_input_tensor = tf.reshape(
            input_tensor, [-1, config.hidden_size])

        logits = tf.matmul(
            flatten_input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        logits = tf.reshape(
            logits, [batch_size, seq_length, config.vocab_size])
        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        lm_log_probs = tf.nn.log_softmax(logits, axis=-1)

        self.preds['preds'] = tf.argmax(probs, axis=-1)
        one_hot_labels = tf.one_hot(
            input_ids, depth=config.vocab_size, dtype=tf.float32)
        # [batch_size, seq_length] token-level cross-entropy.
        per_example_loss = -tf.reduce_sum(
            lm_log_probs * one_hot_labels, axis=[-1])
        if sample_weight is not None:
            # Fixed: cast to float32 (as the other decoders do) so that
            # weights of another dtype do not break the multiplication.
            per_example_loss *= tf.expand_dims(
                tf.cast(sample_weight, dtype=tf.float32), axis=-1)

        self.total_loss = (
            tf.reduce_mean(per_example_loss) +
            tf.reduce_mean(tf.square(miu)) +
            tf.reduce_mean(tf.exp(sigma) - sigma - 1))
        self.losses['losses'] = per_example_loss
def __init__(self,
             bert_config,
             is_training,
             input_ids,
             input_mask,
             segment_ids,
             scope='bert',
             drop_pooler=False,
             trainable=True,
             **kwargs):
    '''Builds the BERT encoder graph: embeddings, transformer and pooler.

    Results are exposed as `self.embedding_output`,
    `self.embedding_table`, `self.all_encoder_layers`,
    `self.sequence_output` (last encoder layer) and `self.pooled_output`.

    Args:
      bert_config: config object. A deep copy is used, so the caller's
        instance is never modified.
      is_training: bool. When False both dropout rates of the copied
        config are set to 0.0.
      input_ids: token ids of rank 2.
      input_mask: mask used to derive the attention mask.
      segment_ids: token type ids for the embedding post-processor.
      scope: outer variable scope name.
      drop_pooler: when True the pooled output is the raw first-token
        vector; otherwise it goes through a tanh dense layer.
      trainable: forwarded to every variable-creating call.
      **kwargs: only `use_tilda_embedding` is read.
    '''
    config = copy.deepcopy(bert_config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    batch_size, max_seq_length = util.get_shape_list(
        input_ids, expected_rank=2)[:2]

    # Tilda embeddings for SMART algorithm: fetch the pre-existing
    # root-scope variable only when explicitly requested.
    tilda_embeddings = None
    if kwargs.get('use_tilda_embedding'):
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):

        with tf.variable_scope('embeddings'):
            looked_up = self.embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name='word_embeddings',
                tilda_embeddings=tilda_embeddings,
                trainable=trainable)
            (self.embedding_output, self.embedding_table) = looked_up

            # Add position and token type embeddings, then layer
            # normalize and apply dropout.
            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                hidden_size=config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                trainable=trainable)

        with tf.variable_scope('encoder'):
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, max_seq_length)

            # stacked transformers
            activation_fn = util.get_activation(config.hidden_act)
            self.all_encoder_layers = self.transformer_model(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=activation_fn,
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                trainable=trainable)

        self.sequence_output = self.all_encoder_layers[-1]

        with tf.variable_scope('pooler'):
            first_token_tensor = self.sequence_output[:, 0, :]

            # trick: ignore the fully connected layer
            if drop_pooler:
                self.pooled_output = first_token_tensor
                return

            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=util.create_initializer(
                    config.initializer_range),
                trainable=trainable)
def _build_forward(layer_input):
    '''Builds one transformer layer on top of `layer_input`.

    Runs self-attention, an output projection with residual + layer norm,
    then the feed-forward sub-layer with a second residual + layer norm.
    The attention scores of this layer are appended to
    `self.attention_scores` as a side effect. All hyper-parameters are
    taken from the enclosing scope.

    Args:
      layer_input: tensor fed to the layer; it is also the first residual.

    Returns:
      The output tensor of the layer.
    '''
    with tf.variable_scope('attention'):
        with tf.variable_scope('self'):
            head, scores = self.attention_layer(
                from_tensor=layer_input,
                to_tensor=layer_input,
                attention_mask=attention_mask,
                num_attention_heads=num_attention_heads,
                size_per_head=attention_head_size,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                do_return_2d_tensor=True,
                batch_size=batch_size,
                from_max_seq_length=max_seq_length,
                to_max_seq_length=max_seq_length,
                dtype=dtype,
                trainable=trainable)
            heads = [head]
            self.attention_scores.append(scores)

        # A single head list is passed through; several would be joined
        # on the last axis.
        attended = (heads[0] if len(heads) == 1
                    else tf.concat(heads, axis=-1))

        with tf.variable_scope('output'):
            projected = tf.layers.dense(
                attended,
                hidden_size,
                kernel_initializer=util.create_initializer(
                    initializer_range),
                trainable=trainable)
            projected = util.dropout(projected, hidden_dropout_prob)
            attention_output = util.layer_norm(
                projected + layer_input, trainable=trainable)

    # The activation is only applied to the `intermediate`
    # hidden layer.
    with tf.variable_scope('intermediate'):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=util.create_initializer(initializer_range),
            trainable=trainable)

    # Down-project back to hidden_size then add the residual.
    with tf.variable_scope('output'):
        down_projected = tf.layers.dense(
            intermediate_output,
            hidden_size,
            kernel_initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        down_projected = util.dropout(down_projected, hidden_dropout_prob)
        return util.layer_norm(down_projected + attention_output,
                               trainable=trainable)
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=util.gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    '''Multi-headed, multi-layer Transformer encoder that also returns
    the attention maps of every layer.

    Follows the encoder of 'Attention is All You Need'
    (https://arxiv.org/abs/1706.03762).

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        hidden_size].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length, seq_length], with 1 for positions that can be attended
        to and 0 in positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_attention_heads: int. Number of attention heads.
      intermediate_size: int. Size of the 'intermediate' (feed-forward)
        layer.
      intermediate_act_fn: function. Activation applied to the output of
        the intermediate layer.
      hidden_dropout_prob: float. Dropout probability for hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the
        attention probabilities.
      initializer_range: float. Range of the initializer (stddev of
        truncated normal).
      do_return_all_layers: whether to return the stacked outputs of all
        layers instead of only the final one.

    Returns:
      A tuple `(output, attn_maps)`. `output` is the final layer reshaped
      to `input_tensor`'s shape, or all layer outputs stacked on a new
      leading axis when `do_return_all_layers` is True. `attn_maps` is
      the second value returned by `attention_layer` for each layer,
      stacked on a new leading axis.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    '''
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            'The hidden size (%d) is not a multiple of the number of attention '
            'heads (%d)' % (hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = util.get_shape_list(input_tensor, expected_rank=3)
    batch_size, seq_length, input_width = (
        input_shape[0], input_shape[1], input_shape[2])

    # Residual sums require the input width to equal the hidden size.
    if input_width != hidden_size:
        raise ValueError(
            'The width of the input tensor (%d) != hidden size (%d)' %
            (input_width, hidden_size))

    # The representation is kept 2D throughout the stack so it is not
    # reshaped back and forth between 3D and 2D in every layer.
    hidden = util.reshape_to_matrix(input_tensor)

    per_layer_maps = []
    per_layer_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope('layer_%d' % layer_idx):
            with tf.variable_scope('attention'):
                with tf.variable_scope('self'):
                    head, probs = attention_layer(
                        from_tensor=hidden,
                        to_tensor=hidden,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=(
                            attention_probs_dropout_prob),
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_seq_length=seq_length,
                        to_seq_length=seq_length)
                    heads = [head]
                    per_layer_maps.append(probs)

                # With several sequences the heads would be concatenated
                # before the projection; a single one passes through.
                if len(heads) == 1:
                    attended = heads[0]
                else:
                    attended = tf.concat(heads, axis=-1)

                # Linear projection to `hidden_size`, then residual with
                # the layer input.
                with tf.variable_scope('output'):
                    attended = tf.layers.dense(
                        attended,
                        hidden_size,
                        kernel_initializer=util.create_initializer(
                            initializer_range))
                    attended = util.dropout(attended, hidden_dropout_prob)
                    attended = util.layer_norm(attended + hidden)

            # The activation is only applied to the 'intermediate' layer.
            with tf.variable_scope('intermediate'):
                expanded = tf.layers.dense(
                    attended,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=util.create_initializer(
                        initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope('output'):
                hidden = tf.layers.dense(
                    expanded,
                    hidden_size,
                    kernel_initializer=util.create_initializer(
                        initializer_range))
                hidden = util.dropout(hidden, hidden_dropout_prob)
                hidden = util.layer_norm(hidden + attended)
                per_layer_outputs.append(hidden)

    attn_maps = tf.stack(per_layer_maps, 0)

    if not do_return_all_layers:
        return util.reshape_from_matrix(hidden, input_shape), attn_maps

    reshaped = [
        util.reshape_from_matrix(layer, input_shape)
        for layer in per_layer_outputs
    ]
    return tf.stack(reshaped, 0), attn_maps
def __init__(self,
             is_training,
             input_tensor,
             n_wide_features,
             wide_features,
             label_ids,
             label_size=2,
             sample_weight=None,
             scope='cls/seq_relationship',
             hidden_dropout_prob=0.1,
             initializer_range=0.02,
             trainable=True,
             **kwargs):
    '''Builds a wide-and-deep classification head.

    Wide feature ids are embedded, attended over with the deep
    representation as the query, and the attended summary is added to the
    deep representation before a softmax classifier.

    Outputs are stored in `self.preds['preds']`, `self.probs['probs']`,
    `self.losses['losses']` and `self.total_loss`.

    NOTE(review): the variable-scope nesting below was reconstructed from
    flattened source; confirm variable names against existing checkpoints.

    Args:
      is_training: bool. Classifier dropout is only applied when True.
      input_tensor: deep representation; its last dimension is the hidden
        size.
      n_wide_features: number of valid wide features per example, used to
        build the feature mask.
      wide_features: wide feature ids; the last dimension is the number
        of feature slots.
      label_ids: class labels.
      label_size: number of classes.
      sample_weight: optional per-example weight; cast to float32.
      scope: variable scope of the classifier weights.
      hidden_dropout_prob: dropout rate in front of the classifier.
      initializer_range: range handed to `util.create_initializer`.
      trainable: forwarded to every variable-creating call.
      **kwargs: forwarded to the parent constructor; `tsa_thresh` is also
        read here to optionally filter the loss by prediction entropy.
    '''
    super().__init__(**kwargs)

    hidden_size = input_tensor.shape.as_list()[-1]
    feature_size = wide_features.shape.as_list()[-1]

    with tf.variable_scope('wide'):
        feature_embeddings = tf.get_variable(
            name='feature_embeddings',
            shape=[feature_size + 1, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        wide_output = tf.gather(
            feature_embeddings, wide_features)  # [B, N, H]

    with tf.variable_scope('wide_and_deep'):
        query = tf.expand_dims(input_tensor, -1)  # [B, H, 1]
        scores = tf.matmul(wide_output, query)  # [B, N, 1]
        scores = tf.transpose(scores, [0, 2, 1])  # [B, 1, N]
        scores = tf.multiply(scores, 1.0 / math.sqrt(hidden_size))

        # Push the scores of unused feature slots far down before the
        # softmax.
        slot_mask = tf.sequence_mask(n_wide_features, feature_size)
        slot_mask = tf.cast(slot_mask, tf.float32)  # [B, N]
        slot_mask = tf.expand_dims(slot_mask, 1)  # [B, 1, N]
        scores += (1.0 - slot_mask) * -10000.0

        weights = tf.nn.softmax(scores, axis=-1)
        attended = tf.matmul(weights, wide_output)  # [B, 1, H]
        attended = attended[:, 0, :]  # [B, H]
        # (dropout on the attended output is currently disabled)
        input_tensor = util.layer_norm(attended + input_tensor,
                                       trainable=trainable)

    with tf.variable_scope(scope):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        keep_off = hidden_dropout_prob if is_training else 0.0
        output_layer = util.dropout(input_tensor, keep_off)
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        self.preds['preds'] = tf.argmax(logits, axis=-1)
        self.probs['probs'] = tf.nn.softmax(logits, axis=-1, name='probs')

        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(
            label_ids, depth=label_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(
            one_hot_labels * log_probs, axis=-1)

        if sample_weight is not None:
            per_example_loss = tf.cast(
                sample_weight, dtype=tf.float32) * per_example_loss

        thresh = kwargs.get('tsa_thresh')
        if thresh is not None:
            assert isinstance(
                thresh, float), ('`tsa_thresh` must be a float between 0 and 1.')
            # Entropy of the prediction normalised by log(1 / label_size);
            # only examples above the threshold keep their loss.
            uncertainty = tf.reduce_sum(
                self.probs['probs'] * tf.log(self.probs['probs']),
                axis=-1)
            uncertainty /= tf.log(1 / label_size)
            keep = tf.cast(tf.greater(uncertainty, thresh),
                           dtype=tf.float32)
            per_example_loss = keep * per_example_loss

        self.losses['losses'] = per_example_loss
        self.total_loss = tf.reduce_mean(per_example_loss)
def attention_ffn_block(layer_input,
                        hidden_size=768,
                        attention_mask=None,
                        num_attention_heads=1,
                        attention_head_size=64,
                        attention_probs_dropout_prob=0.0,
                        intermediate_size=3072,
                        intermediate_act_fn=None,
                        initializer_range=0.02,
                        hidden_dropout_prob=0.0,
                        use_einsum=True):
    """Builds one attention + feed-forward sub-block.

    Self-attention over `layer_input` is projected back to `hidden_size`
    and combined with the input through a residual connection and layer
    norm. A two-layer feed-forward network follows, again wrapped in a
    residual connection and layer norm.

    NOTE(review): the variable-scope nesting was reconstructed from
    flattened source (following the usual ALBERT layout, where the FFN
    "output" scope sits inside "intermediate" and both layer norms live
    at block level); confirm variable names against existing checkpoints.

    Args:
      layer_input: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      hidden_size: (optional) int, size of hidden layer.
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length] with values 1 or 0; forwarded to `attention_layer`.
      num_attention_heads: int. Number of attention heads.
      attention_head_size: int. Size of attention head.
      attention_probs_dropout_prob: float. Dropout probability for
        `attention_layer`.
      intermediate_size: int. Size of intermediate hidden layer.
      intermediate_act_fn: (optional) Activation function for the
        intermediate layer.
      initializer_range: float. Range of the weight initializer.
      hidden_dropout_prob: (optional) float. Dropout probability of the
        hidden layer.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.

    Returns:
      The output tensor of the block.
    """
    with tf.variable_scope("attention_1"):
        with tf.variable_scope("self"):
            attended = attention_layer(
                from_tensor=layer_input,
                to_tensor=layer_input,
                attention_mask=attention_mask,
                num_attention_heads=num_attention_heads,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                use_einsum=use_einsum)

        # Linear projection to `hidden_size`; the residual with
        # `layer_input` is added right after.
        with tf.variable_scope("output"):
            attended = dense_layer_3d_proj(
                attended,
                hidden_size,
                attention_head_size,
                util.create_initializer(initializer_range),
                None,
                use_einsum=use_einsum,
                name="dense")
            attended = util.dropout(attended, hidden_dropout_prob)
    attention_output = util.layer_norm(attended + layer_input)

    with tf.variable_scope("ffn_1"):
        with tf.variable_scope("intermediate"):
            expanded = dense_layer_2d(
                attention_output,
                intermediate_size,
                util.create_initializer(initializer_range),
                intermediate_act_fn,
                use_einsum=use_einsum,
                num_attention_heads=num_attention_heads,
                name="dense")
            with tf.variable_scope("output"):
                contracted = dense_layer_2d(
                    expanded,
                    hidden_size,
                    util.create_initializer(initializer_range),
                    None,
                    use_einsum=use_einsum,
                    num_attention_heads=num_attention_heads,
                    name="dense")
            contracted = util.dropout(contracted, hidden_dropout_prob)
    return util.layer_norm(contracted + attention_output)
def transformer_model(input_tensor,
                      attention_mask=None,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_hidden_groups=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      inner_group_num=1,
                      intermediate_act_fn="gelu",
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False,
                      use_einsum=True,
                      trainable=True):
    """Multi-headed, multi-layer Transformer with cross-layer sharing.

    Layers are assigned to `num_hidden_groups` groups; layers mapped to
    the same group reuse the same variables (the "transformer" scope is
    opened with `reuse=tf.AUTO_REUSE`). See
    https://arxiv.org/abs/1706.03762 for the underlying encoder.

    NOTE(review): the variable-scope nesting was reconstructed from
    flattened source; confirm variable names against existing checkpoints.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        input_width]. When `input_width != hidden_size` it is first
        projected by a dense layer named "embedding_hidden_mapping_in".
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        seq_length], with 1 for positions that can be attended to and 0 in
        positions that should not be.
      hidden_size: int. Hidden size of the Transformer.
      num_hidden_layers: int. Number of layers (blocks) in the Transformer.
      num_hidden_groups: int. Number of groups for the hidden layers;
        parameters in the same group are shared.
      num_attention_heads: int. Number of attention heads.
      intermediate_size: int. Size of the "intermediate" (feed-forward)
        layer.
      inner_group_num: int, number of inner repetitions of attention and
        ffn per layer.
      intermediate_act_fn: activation of the intermediate layer, given
        either as a callable or as a name (e.g. "gelu") that is resolved
        through `util.get_activation`.
      hidden_dropout_prob: float. Dropout probability for hidden layers.
      attention_probs_dropout_prob: float. Dropout probability of the
        attention probabilities.
      initializer_range: float. Range of the initializer (stddev of
        truncated normal).
      do_return_all_layers: whether to return the list of all block
        outputs instead of only the final one.
      use_einsum: bool. Whether to use einsum or reshape+matmul for dense
        layers.
      trainable: accepted for interface compatibility; it is not used by
        this function.

    Returns:
      The final hidden tensor, or the list of all block outputs when
      `do_return_all_layers` is True. If no block is built (zero layers
      or zero inner repetitions) the final tensor is the (possibly
      projected) input.

    Raises:
      ValueError: A Tensor shape or parameter is invalid.
    """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            "The hidden size (%d) is not a multiple of the number of attention "
            "heads (%d)" % (hidden_size, num_attention_heads))

    # The default is a name, not a callable; resolve it once so the dense
    # layers always receive a function, as documented.
    if isinstance(intermediate_act_fn, str):
        intermediate_act_fn = util.get_activation(intermediate_act_fn)

    attention_head_size = hidden_size // num_attention_heads
    input_shape = util.get_shape_list(input_tensor, expected_rank=3)
    input_width = input_shape[2]

    all_layer_outputs = []
    if input_width != hidden_size:
        prev_output = dense_layer_2d(
            input_tensor,
            hidden_size,
            util.create_initializer(initializer_range),
            None,
            use_einsum=use_einsum,
            name="embedding_hidden_mapping_in")
    else:
        prev_output = input_tensor

    with tf.variable_scope("transformer", reuse=tf.AUTO_REUSE):
        for layer_idx in range(num_hidden_layers):
            # Consecutive layers are spread evenly over the groups.
            group_idx = int(
                layer_idx / num_hidden_layers * num_hidden_groups)
            with tf.variable_scope("group_%d" % group_idx):
                # name_scope only: layers of one group must share
                # variables, so the layer index stays out of their names.
                with tf.name_scope("layer_%d" % layer_idx):
                    layer_output = prev_output
                    for inner_group_idx in range(inner_group_num):
                        with tf.variable_scope(
                                "inner_group_%d" % inner_group_idx):
                            layer_output = attention_ffn_block(
                                layer_input=layer_output,
                                hidden_size=hidden_size,
                                attention_mask=attention_mask,
                                num_attention_heads=num_attention_heads,
                                attention_head_size=attention_head_size,
                                attention_probs_dropout_prob=(
                                    attention_probs_dropout_prob),
                                intermediate_size=intermediate_size,
                                intermediate_act_fn=intermediate_act_fn,
                                initializer_range=initializer_range,
                                hidden_dropout_prob=hidden_dropout_prob,
                                use_einsum=use_einsum)
                            prev_output = layer_output
                            all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        return all_layer_outputs
    # `prev_output` equals `all_layer_outputs[-1]` whenever a block was
    # built, and avoids an IndexError when none was.
    return prev_output
def transformer_xl(inp_k, n_token, n_layer, d_model, n_head, d_head, d_inner,
                   dropout, dropatt, attn_type, bi_data, initializer,
                   is_training, mem_len=None, inp_q=None, mems=None,
                   same_length=False, clamp_len=-1, untie_r=False,
                   use_tpu=True, input_mask=None, perm_mask=None,
                   seg_id=None, reuse_len=None, ff_activation='relu',
                   target_mapping=None, use_bfloat16=False,
                   scope='transformer', tilda_embeddings=None, **kwargs):
    '''
    Defines a Transformer-XL computation graph with additional
    support for XLNet.

    Args:
      inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
      n_token: int, the vocab size.
      n_layer: int, the number of layers.
      d_model: int, the hidden size.
      n_head: int, the number of attention heads.
      d_head: int, the dimension size of each attention head.
      d_inner: int, the hidden size in feed-forward layers.
      dropout: float, dropout rate.
      dropatt: float, dropout rate on attention probabilities.
      attn_type: str, 'uni' (causal mask is built) or 'bi' (no causal mask).
      bi_data: bool, forwarded to `relative_positional_encoding`.
      initializer: A tf initializer used for all variables created here.
      is_training: bool, whether in training mode (enables dropout).
      mem_len: int, the number of tokens to cache.
      inp_q: float32 Tensor in shape [len, bsz].
        1 for tokens with losses and 0 for tokens without losses.
        When given, two-stream attention is used and the query stream is
        returned.
      mems: a list of float32 Tensors in shape [mem_len, bsz, d_model],
        memory from previous batches. The length of the list equals
        n_layer. If None, no memory is used.
      same_length: bool, whether to use the same attention length for
        each token.
      clamp_len: int, clamp all relative distances larger than clamp_len.
        -1 means no clamping.
      untie_r: bool, whether to untie the biases in attention (one bias
        per layer instead of a shared one).
      use_tpu: bool, forwarded to `embedding_lookup`.
      input_mask: Tensor in shape [len, bsz], the input mask.
        0 for real tokens and 1 for padding.
      perm_mask: float32 Tensor in shape [len, len, bsz].
        If perm_mask[i, j, k] = 0, i attend to j in batch k;
        if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
        If None, each position attends to all the others.
      seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
      reuse_len: int, the number of tokens in the current batch to be
        cached and reused in the future.
      ff_activation: str, activation type forwarded to `positionwise_ffn`.
      target_mapping: float32 Tensor in shape [num_predict, len, bsz].
        If target_mapping[i, j, k] = 1, the i-th predict in batch k is
        on the j-th token.
      use_bfloat16: bool, use bfloat16 instead of float32.
      scope: scope name for the computation graph.
      tilda_embeddings: forwarded to `embedding_lookup`.
      **kwargs: accepted and ignored.

    Returns:
      A tuple `(output, new_mems, lookup_table)`: the dropped-out final
      stream (query stream if `inp_q` is given, content stream otherwise),
      the per-layer cached memories, and the word embedding table.

    Raises:
      ValueError: `attn_type` is neither 'uni' nor 'bi'.
    '''
    tf_float = tf.bfloat16 if use_bfloat16 else tf.float32

    new_mems = []
    with tf.variable_scope(scope):
        if untie_r:
            r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
                                       dtype=tf_float,
                                       initializer=initializer)
            r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
                                       dtype=tf_float,
                                       initializer=initializer)
        else:
            r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
                                       dtype=tf_float,
                                       initializer=initializer)
            r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
                                       dtype=tf_float,
                                       initializer=initializer)

        bsz = tf.shape(inp_k)[1]
        qlen = tf.shape(inp_k)[0]
        mlen = tf.shape(mems[0])[0] if mems is not None else 0
        klen = mlen + qlen

        ##### Attention mask
        # causal attention mask
        if attn_type == 'uni':
            attn_mask = _create_mask(qlen, mlen, tf_float, same_length)
            attn_mask = attn_mask[:, :, None, None]
        elif attn_type == 'bi':
            attn_mask = None
        else:
            raise ValueError('Unsupported attention type: %s' % attn_type)

        # data mask: input mask & perm mask
        if input_mask is not None and perm_mask is not None:
            data_mask = input_mask[None] + perm_mask
        elif input_mask is not None and perm_mask is None:
            data_mask = input_mask[None]
        elif input_mask is None and perm_mask is not None:
            data_mask = perm_mask
        else:
            data_mask = None

        if data_mask is not None:
            # all mems can be attended to
            mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz],
                                 dtype=tf_float)
            # Cast to the compute dtype (not a hard-coded float32) so the
            # concat below also works when `use_bfloat16` is enabled.
            data_mask = tf.cast(data_mask, dtype=tf_float)
            data_mask = tf.concat([mems_mask, data_mask], 1)
            if attn_mask is None:
                attn_mask = data_mask[:, :, :, None]
            else:
                attn_mask += data_mask[:, :, :, None]

        if attn_mask is not None:
            # Binarize: any positive entry means "do not attend".
            attn_mask = tf.cast(attn_mask > 0, dtype=tf_float)

            # The content stream may additionally attend to itself, so the
            # diagonal of the query part is cleared before binarizing.
            non_tgt_mask = -tf.eye(qlen, dtype=tf_float)
            non_tgt_mask = tf.concat(
                [tf.zeros([qlen, mlen], dtype=tf_float), non_tgt_mask],
                axis=-1)
            non_tgt_mask = tf.cast(
                (attn_mask + non_tgt_mask[:, :, None, None]) > 0,
                dtype=tf_float)
        else:
            non_tgt_mask = None

        ##### Word embedding
        word_emb_k, lookup_table = embedding_lookup(
            x=inp_k,
            n_token=n_token,
            d_embed=d_model,
            initializer=initializer,
            use_tpu=use_tpu,
            dtype=tf_float,
            scope='word_embedding',
            tilda_embeddings=tilda_embeddings)

        if inp_q is not None:
            with tf.variable_scope('mask_emb'):
                mask_emb = tf.get_variable('mask_emb', [1, 1, d_model],
                                           dtype=tf_float)
                if target_mapping is not None:
                    word_emb_q = tf.tile(
                        mask_emb, [tf.shape(target_mapping)[0], bsz, 1])
                else:
                    inp_q_ext = inp_q[:, :, None]
                    word_emb_q = \
                        inp_q_ext * mask_emb + (1 - inp_q_ext) * word_emb_k
        output_h = tf.layers.dropout(word_emb_k, dropout,
                                     training=is_training)
        if inp_q is not None:
            output_g = tf.layers.dropout(word_emb_q, dropout,
                                         training=is_training)

        ##### Segment embedding
        if seg_id is not None:
            if untie_r:
                r_s_bias = tf.get_variable('r_s_bias',
                                           [n_layer, n_head, d_head],
                                           dtype=tf_float,
                                           initializer=initializer)
            else:
                # default case (tie)
                r_s_bias = tf.get_variable('r_s_bias', [n_head, d_head],
                                           dtype=tf_float,
                                           initializer=initializer)

            seg_embed = tf.get_variable('seg_embed',
                                        [n_layer, 2, n_head, d_head],
                                        dtype=tf_float,
                                        initializer=initializer)

            # Convert `seg_id` to one-hot `seg_mat`
            mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32)
            cat_ids = tf.concat([mem_pad, seg_id], 0)

            # `1` indicates not in the same segment [qlen x klen x bsz]
            seg_mat = tf.cast(
                tf.logical_not(tf.equal(seg_id[:, None], cat_ids[None, :])),
                tf.int32)
            seg_mat = tf.one_hot(seg_mat, 2, dtype=tf_float)
        else:
            seg_mat = None

        ##### Positional encoding
        pos_emb = relative_positional_encoding(qlen, klen, d_model,
                                               clamp_len, attn_type, bi_data,
                                               bsz=bsz, dtype=tf_float)
        pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)

        ##### Attention layers
        if mems is None:
            mems = [None] * n_layer

        for i in range(n_layer):
            # cache new mems
            new_mems.append(
                _cache_mem(output_h, mems[i], mem_len, reuse_len))

            # segment bias
            if seg_id is None:
                r_s_bias_i = None
                seg_embed_i = None
            else:
                r_s_bias_i = r_s_bias if not untie_r else r_s_bias[i]
                seg_embed_i = seg_embed[i]

            with tf.variable_scope('layer_{}'.format(i)):
                if inp_q is not None:
                    output_h, output_g = two_stream_rel_attn(
                        h=output_h,
                        g=output_g,
                        r=pos_emb,
                        r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
                        r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask_h=non_tgt_mask,
                        attn_mask_g=attn_mask,
                        mems=mems[i],
                        target_mapping=target_mapping,
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer)
                    # The feed-forward variables are created by the query
                    # stream first and then reused by the content stream.
                    reuse = True
                else:
                    reuse = False

                    output_h = rel_multihead_attn(
                        h=output_h,
                        r=pos_emb,
                        r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
                        r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
                        seg_mat=seg_mat,
                        r_s_bias=r_s_bias_i,
                        seg_embed=seg_embed_i,
                        attn_mask=non_tgt_mask,
                        mems=mems[i],
                        d_model=d_model,
                        n_head=n_head,
                        d_head=d_head,
                        dropout=dropout,
                        dropatt=dropatt,
                        is_training=is_training,
                        kernel_initializer=initializer,
                        reuse=reuse)

                if inp_q is not None:
                    output_g = positionwise_ffn(
                        inp=output_g,
                        d_model=d_model,
                        d_inner=d_inner,
                        dropout=dropout,
                        kernel_initializer=initializer,
                        activation_type=ff_activation,
                        is_training=is_training)

                output_h = positionwise_ffn(
                    inp=output_h,
                    d_model=d_model,
                    d_inner=d_inner,
                    dropout=dropout,
                    kernel_initializer=initializer,
                    activation_type=ff_activation,
                    is_training=is_training,
                    reuse=reuse)

        if inp_q is not None:
            output = tf.layers.dropout(output_g, dropout,
                                       training=is_training)
        else:
            output = tf.layers.dropout(output_h, dropout,
                                       training=is_training)

        return output, new_mems, lookup_table
def __init__(self,
             albert_config,
             is_training,
             input_ids,
             input_mask=None,
             segment_ids=None,
             scope='bert',
             drop_pooler=False,
             trainable=True,
             **kwargs):
    """Builds the ALBERT encoder graph.

    Args:
      albert_config: `AlbertConfig` instance. A deep copy is taken, so the
        caller's object is never modified.
      is_training: bool. When false, both dropout probabilities of the
        copied config are set to 0.0.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
        Defaults to all ones.
      segment_ids: (optional) int32 Tensor of shape
        [batch_size, seq_length]. Defaults to all zeros.
      scope: (optional) variable scope. Defaults to "bert".
      drop_pooler: bool. When true the pooled output is the raw first-token
        hidden state, without the dense + tanh pooler layer.
      trainable: bool. Passed to the embedding, encoder and pooler builders.
      **kwargs: only `use_tilda_embedding` is read; when truthy the existing
        `tilda_embeddings` variable is fetched from the root scope and
        handed to the embedding lookup.

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(albert_config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    batch_size, seq_length = util.get_shape_list(
        input_ids, expected_rank=2)[:2]

    if input_mask is None:
        input_mask = tf.ones(
            shape=[batch_size, seq_length], dtype=tf.int32)

    if segment_ids is None:
        segment_ids = tf.zeros(
            shape=[batch_size, seq_length], dtype=tf.int32)

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    if kwargs.get('use_tilda_embedding'):
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):

        with tf.variable_scope("embeddings"):
            # Word-id lookup; also exposes the embedding table.
            lookup = embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.embedding_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                tilda_embeddings=tilda_embeddings,
                trainable=trainable)
            (self.word_embedding_output,
             self.output_embedding_table) = lookup

            # Position and token-type embeddings are added on top of the
            # word embeddings by the post-processor, which also receives
            # the dropout probability.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.word_embedding_output,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                trainable=trainable)

        with tf.variable_scope("encoder"):
            # Stacked transformer; every layer output is kept.
            self.all_encoder_layers = transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=input_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_hidden_groups=config.num_hidden_groups,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                inner_group_num=config.inner_group_num,
                intermediate_act_fn=util.get_activation(
                    config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                do_return_all_layers=True,
                use_einsum=False,
                trainable=trainable)

        self.sequence_output = self.all_encoder_layers[-1]

        # The pooler reduces the sequence output to one vector per example
        # by taking the hidden state of the first token.
        with tf.variable_scope("pooler"):
            first_token = tf.squeeze(
                self.sequence_output[:, 0:1, :], axis=1)

            if drop_pooler:
                # trick: ignore the fully connected layer
                self.pooled_output = first_token
            else:
                self.pooled_output = tf.layers.dense(
                    first_token,
                    config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=util.create_initializer(
                        config.initializer_range),
                    trainable=trainable)
def two_stream_rel_attn(h, g, r, mems, r_w_bias, r_r_bias, seg_mat, r_s_bias,
                        seg_embed, attn_mask_h, attn_mask_g, target_mapping,
                        d_model, n_head, d_head, dropout, dropatt,
                        is_training, kernel_initializer, scope='rel_attn'):
    '''Two-stream attention with relative positional encoding.

    The content stream `h` and the query stream `g` share keys, values and
    all projection variables: the content stream creates them under `scope`
    and the query stream re-enters the same scope with `reuse=True`.

    Returns:
      A tuple `(output_h, output_g)` with the post-processed attention
      output of the content stream and of the query stream.
    '''

    def _project(tensor, name):
        # Head projection with the hyper-parameters shared by all heads.
        return head_projection(tensor, d_model, n_head, d_head,
                               kernel_initializer, name)

    def _attend(query_head, mask):
        # Core attention against the shared content keys and values.
        return rel_attn_core(query_head, k_head_h, v_head_h, k_head_r,
                             seg_embed, seg_mat, r_w_bias, r_r_bias,
                             r_s_bias, mask, dropatt, is_training, scale)

    scale = 1 / (d_head ** 0.5)

    with tf.variable_scope(scope, reuse=False):
        # Keys and values also cover the memory when one is supplied.
        has_mems = mems is not None and mems.shape.ndims > 1
        content = tf.concat([mems, h], 0) if has_mems else h

        k_head_h = _project(content, 'k')  # content-based key head
        v_head_h = _project(content, 'v')  # content-based value head
        k_head_r = _project(r, 'r')        # position-based key head

        ##### h-stream
        q_head_h = _project(h, 'q')        # content-stream query head
        attn_vec_h = _attend(q_head_h, attn_mask_h)
        output_h = post_attention(h, attn_vec_h, d_model, n_head, d_head,
                                  dropout, is_training, kernel_initializer)

    with tf.variable_scope(scope, reuse=True):
        ##### g-stream
        q_head_g = _project(g, 'q')        # query-stream query head

        if target_mapping is None:
            attn_vec_g = _attend(q_head_g, attn_mask_g)
        else:
            # Scatter the predicted positions to sequence positions,
            # attend, then gather them back.
            q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping)
            attn_vec_g = _attend(q_head_g, attn_mask_g)
            attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g,
                                   target_mapping)

        output_g = post_attention(g, attn_vec_g, d_model, n_head, d_head,
                                  dropout, is_training, kernel_initializer)

    return output_h, output_g
def __init__(self, xlnet_config, is_training, input_ids, seg_ids,
             input_mask, mems=None, perm_mask=None, target_mapping=None,
             inp_q=None, **kwargs):
    '''Builds the XLNet encoder graph with a fixed run configuration.

    Args:
      xlnet_config: XLNetConfig.
      is_training: bool, whether is training or not. Dropout rates are
        0.1 when training and 0.0 otherwise.
      input_ids: int32 Tensor in shape [len, bsz], the input token IDs.
      seg_ids: int32 Tensor in shape [len, bsz], the input segment IDs.
      input_mask: float32 Tensor in shape [len, bsz], the input mask.
        0 for real tokens and 1 for padding.
      mems: a list of float32 Tensors in shape [mem_len, bsz, d_model],
        memory from previous batches. The length of the list equals
        n_layer. If None, no memory is used.
      perm_mask: float32 Tensor in shape [len, len, bsz].
        If perm_mask[i, j, k] = 0, i attend to j in batch k;
        if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
        If None, each position attends to all the others.
      target_mapping: float32 Tensor in shape [num_predict, len, bsz].
        If target_mapping[i, j, k] = 1, the i-th predict in batch k is
        on the j-th token.
        Only used during pretraining for partial prediction.
        Set to None during finetuning.
      inp_q: float32 Tensor in shape [len, bsz].
        1 for tokens with losses and 0 for tokens without losses.
        Only used during pretraining for two-stream attention.
        Set to None during finetuning.
      **kwargs: only `use_tilda_embedding` is read; when truthy the
        existing `tilda_embeddings` variable is fetched from the root
        scope and handed to `transformer_xl`.
    '''
    drop_rate = 0.1 if is_training else 0.0
    run_config = XLNetRunConfig(is_training=is_training,
                                bi_data=False,
                                use_tpu=False,
                                use_bfloat16=False,
                                dropout=drop_rate,
                                dropatt=drop_rate,
                                init='normal',
                                init_range=0.1,
                                init_std=0.02,
                                clamp_len=-1)
    initializer = _get_initializer(run_config)

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    if kwargs.get('use_tilda_embedding'):
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope('model', reuse=tf.AUTO_REUSE):
        outputs = transformer_xl(
            # model hyper-parameters
            n_token=xlnet_config.n_token,
            initializer=initializer,
            attn_type='bi',
            n_layer=xlnet_config.n_layer,
            d_model=xlnet_config.d_model,
            n_head=xlnet_config.n_head,
            d_head=xlnet_config.d_head,
            d_inner=xlnet_config.d_inner,
            ff_activation=xlnet_config.ff_activation,
            untie_r=xlnet_config.untie_r,
            # run-time settings
            is_training=run_config.is_training,
            use_bfloat16=run_config.use_bfloat16,
            use_tpu=run_config.use_tpu,
            dropout=run_config.dropout,
            dropatt=run_config.dropatt,
            mem_len=run_config.mem_len,
            reuse_len=run_config.reuse_len,
            bi_data=run_config.bi_data,
            clamp_len=run_config.clamp_len,
            same_length=run_config.same_length,
            # input tensors
            inp_k=input_ids,
            seg_id=seg_ids,
            input_mask=input_mask,
            mems=mems,
            perm_mask=perm_mask,
            target_mapping=target_mapping,
            inp_q=inp_q,
            tilda_embeddings=tilda_embeddings)
    (self.output, self.new_mems, self.lookup_table) = outputs

    self.input_mask = input_mask
    self.initializer = initializer
    self.xlnet_config = xlnet_config
    self.run_config = run_config
def summarize_sequence(summary_type, hidden, d_model, n_head, d_head,
                       dropout, dropatt, input_mask, is_training,
                       initializer, scope=None, reuse=None, use_proj=True):
    '''Pools a sequence of hidden states into one summary vector.

    Different classification tasks may or may not share the same
    parameters to summarize the sequence features. If shared, one can keep
    the `scope` to the default value `None`. Otherwise, one should specify
    a different `scope` for each task.

    Args:
      summary_type: str, one of 'last', 'first', 'mean' or 'attn'.
      hidden: Tensor indexed along axis 0 by position; 'attn' reads the
        batch size from axis 1.
      input_mask: (optional) mask, only used by 'attn'.
      use_proj: bool, apply an extra dense + tanh projection.

    Returns:
      The summary Tensor after the optional projection and dropout.

    Raises:
      ValueError: `summary_type` is not supported.
    '''
    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'attn':
            # A learned bias acts as the single query attending over the
            # whole sequence.
            batch = tf.shape(hidden)[1]
            query = tf.get_variable('summary_bias', [d_model],
                                    dtype=hidden.dtype,
                                    initializer=initializer)
            query = tf.tile(query[None, None], [1, batch, 1])

            if input_mask is not None:
                input_mask = input_mask[None, :, :, None]

            attended = multihead_attn(query, hidden, hidden, input_mask,
                                      d_model, n_head, d_head, dropout,
                                      dropatt, is_training, initializer,
                                      residual=False)
            summary = attended[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'last':
            summary = hidden[-1]
        else:
            raise ValueError('Unsupported summary type %s' % summary_type)

        if use_proj:
            # use another projection as in BERT
            summary = tf.layers.dense(summary,
                                      d_model,
                                      activation=tf.tanh,
                                      kernel_initializer=initializer,
                                      name='summary')

        summary = tf.layers.dropout(summary,
                                    dropout,
                                    training=is_training,
                                    name='dropout')

    return summary
def _cls_self_attention_paper(self,
                              prev_output,
                              batch_size,
                              max_seq_length,
                              label_size,
                              attention_mask=None,
                              cls_hidden_size=128,
                              cls_num_attention_heads=2,
                              attention_probs_dropout_prob=0.1,
                              initializer_range=0.02,
                              dtype=tf.float32,
                              trainable=True):
    '''Classification head: projection, self-attention and two dense layers.

    `prev_output` is projected to `cls_hidden_size` (tanh), passed through
    one self-attention layer, and the attention output at position 0 is
    fed to a tanh dense layer followed by a linear layer of `label_size`
    units.

    Returns:
      The output of the final dense layer (`label_size` units).

    Raises:
      ValueError: `cls_hidden_size` is not a multiple of
        `cls_num_attention_heads`.
    '''
    if cls_hidden_size % cls_num_attention_heads != 0:
        raise ValueError(
            '`cls_hidden_size` (%d) is not a multiple of the number of '
            '`cls_num_attention_heads` (%d)'
            % (cls_hidden_size, cls_num_attention_heads))
    head_size = int(cls_hidden_size / cls_num_attention_heads)

    def _dense(inputs, units, **extra):
        # All dense layers of this head share initializer and trainability.
        return tf.layers.dense(
            inputs,
            units,
            kernel_initializer=util.create_initializer(initializer_range),
            trainable=trainable,
            **extra)

    with tf.variable_scope('project'):
        projected = _dense(prev_output, cls_hidden_size, activation='tanh')

    with tf.variable_scope('attention'):
        with tf.variable_scope('self'):
            (attention_output, _) = self.attention_layer(
                from_tensor=projected,
                to_tensor=projected,
                attention_mask=attention_mask,
                num_attention_heads=cls_num_attention_heads,
                size_per_head=head_size,
                attention_probs_dropout_prob=attention_probs_dropout_prob,
                initializer_range=initializer_range,
                do_return_2d_tensor=False,
                batch_size=batch_size,
                from_max_seq_length=max_seq_length,
                to_max_seq_length=max_seq_length,
                dtype=dtype,
                trainable=trainable)

    with tf.variable_scope('intermediate'):
        # Only the first position is used for classification.
        intermediate = _dense(
            attention_output[:, 0, :], cls_hidden_size, activation='tanh')

    with tf.variable_scope('output'):
        cls_output = _dense(intermediate, label_size)

    return cls_output
def __init__(self, bert_config, is_training, dilated_ids, label_ids,
             max_seq_length, spad_id=1, loop=3, sample_weight=None,
             scope='dilated', use_tilda_embedding=False, **kwargs):
    '''Builds the dilated language-model graph.

    In training mode a single forward pass is run and the masked
    cross-entropy loss is stored. In inference mode the model is applied
    `loop` times, each pass compacting the predicted ids and dilating them
    again as the next input.

    Args:
      bert_config: config object; `vocab_size` is read here and the object
        is passed on to `self._bert_forward`.
      is_training: bool, selects the training or the inference graph.
      dilated_ids: int Tensor of rank 2; id 0 marks padding/blank slots.
      label_ids: int Tensor with the target ids (training only).
      max_seq_length: int, sequence length before dilation.
      spad_id: int, id of the temporary "special padding" token used while
        compacting predictions.
      loop: int, number of inference passes.
      sample_weight: (optional) per-example weights; assumed to have shape
        [batch_size] like the per-example loss -- TODO confirm.
      scope: variable scope name.
      use_tilda_embedding: bool, fetch the existing `tilda_embeddings`
        variable from the root scope and hand it to `self._bert_forward`.
      **kwargs: accepted and ignored.
    '''
    super().__init__()

    dilated_mask = tf.cast(tf.not_equal(dilated_ids, 0), tf.float32)

    shape = util.get_shape_list(dilated_ids, expected_rank=2)
    batch_size = shape[0]
    dilated_seq_length = shape[1]

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):

        # forward once
        if is_training:
            logits = self._bert_forward(
                bert_config,
                dilated_ids,
                dilated_mask,
                batch_size,
                dilated_seq_length,
                tilda_embeddings=tilda_embeddings)

            self.preds['LM'] = tf.argmax(logits, axis=-1)

            # LM loss
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(
                label_ids, depth=bert_config.vocab_size)
            per_token_loss = -tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)

            input_length = tf.reduce_sum(dilated_mask, axis=-1) * 2
            label_mask = tf.sequence_mask(
                input_length, max_seq_length * 2, dtype=tf.float32)
            per_example_loss = \
                tf.reduce_sum(per_token_loss * label_mask, axis=-1) / \
                tf.reduce_sum(label_mask, axis=-1)
            if sample_weight is not None:
                # `per_example_loss` has one entry per example, so the
                # weights are applied element-wise. Expanding them to
                # [batch_size, 1] would broadcast the product to
                # [batch_size, batch_size].
                per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

            self.total_loss = tf.reduce_mean(per_example_loss)
            self.losses['LM'] = per_example_loss

        # forward loop
        else:

            def _forward(dilated_ids, dilated_mask):
                # One refinement pass: predict, compact, dilate again.
                logits = self._bert_forward(
                    bert_config,
                    dilated_ids,
                    dilated_mask,
                    batch_size,
                    dilated_seq_length,
                    tilda_embeddings=tilda_embeddings)
                output_ids = tf.argmax(logits, axis=-1)
                output_ids = tf.cast(output_ids, dtype=tf.int32)

                # special padding (using `spad` token): append one `spad`
                # per predicted 0 so every row keeps the same number of
                # non-zero ids after the zeros are dropped
                equal_zero = tf.cast(tf.equal(output_ids, 0), tf.int32)
                equal_zero = tf.reduce_sum(equal_zero, axis=-1)
                right_pad = spad_id * tf.sequence_mask(
                    equal_zero, dilated_seq_length, dtype=tf.int32)
                paded = tf.concat([output_ids, right_pad], axis=-1)

                # extract ids of length `max_seq_length`
                flattened_padded = tf.reshape(paded, [-1])
                # `tf.boolean_mask` is documented to take a boolean mask,
                # so the comparison result is passed without an int cast.
                is_valid = tf.greater(flattened_padded, 0)
                flattened_valid = tf.boolean_mask(flattened_padded, is_valid)
                valid = tf.reshape(
                    flattened_valid, [batch_size, dilated_seq_length])
                cutted_valid = valid[:, :max_seq_length]

                # replace `spad` token with `pad`
                non_spad_mask = tf.cast(
                    tf.not_equal(cutted_valid, spad_id), dtype=tf.int32)
                output_ids = cutted_valid * non_spad_mask
                output_length = tf.reduce_sum(non_spad_mask, axis=-1)

                # dilate: interleave every id with a blank (0) slot
                reshaped_ids = tf.reshape(
                    output_ids, [batch_size, max_seq_length, 1])
                reshaped_mask = tf.reshape(
                    tf.sequence_mask(output_length, max_seq_length,
                                     dtype=tf.int32),
                    [batch_size, max_seq_length, 1])
                concat_ids = tf.concat(
                    [reshaped_ids, tf.zeros_like(reshaped_ids)], axis=-1)
                concat_mask = tf.concat(
                    [reshaped_mask,
                     tf.zeros_like(reshaped_mask, dtype=tf.int32)],
                    axis=-1)
                dilated_ids = tf.reshape(
                    concat_ids, [batch_size, max_seq_length * 2])
                dilated_mask = tf.reshape(
                    concat_mask, [batch_size, max_seq_length * 2])

                return dilated_ids, dilated_mask

            for _ in range(loop):
                dilated_ids, dilated_mask = _forward(
                    dilated_ids, dilated_mask)

            self.preds['LM'] = dilated_ids
def __init__(self,
             bert_config,
             is_training,
             input_ids,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=True,
             scope=None,
             embedding_size=None,
             input_embeddings=None,
             input_reprs=None,
             update_embeddings=True,
             untied_embeddings=False):
    '''Constructor for BertModel.

    Args:
      bert_config: `BertConfig` instance. A deep copy is taken, so the
        caller's object is never modified.
      is_training: bool. true for training model, false for eval model.
        Controls whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      input_mask: (optional) int32 Tensor of shape
        [batch_size, seq_length]. Defaults to all ones.
      token_type_ids: int32 Tensor of shape [batch_size, seq_length].
        Required despite the `None` default: the batch size and sequence
        length are inferred from it.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to 'electra' for the
        encoder. NOTE(review): with `untied_embeddings=True` the embedding
        scope is built from `scope` directly, so it must be a string then.
      embedding_size: (optional) int. Defaults to `bert_config.hidden_size`.
      input_embeddings: accepted by the signature but not used here.
      input_reprs: (optional) precomputed embedding output. When given, the
        embedding lookup and post-processing are skipped (and
        `self.embedding_table` is not set).
      update_embeddings: bool. When false, gradients are stopped at the
        embedding output.
      untied_embeddings: bool. Place the embeddings under `scope` instead
        of under the shared 'electra' scope.

    Raises:
      ValueError: `token_type_ids` is None, the config is invalid or one of
        the input tensor shapes is invalid.
    '''
    # Validate before first use; previously the check came after the shape
    # inference below had already dereferenced the tensor.
    if token_type_ids is None:
        raise ValueError(
            '`token_type_ids` must be provided: it is used to infer the '
            'batch size and sequence length.')

    bert_config = copy.deepcopy(bert_config)
    if not is_training:
        bert_config.hidden_dropout_prob = 0.0
        bert_config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(token_type_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(
            shape=[batch_size, seq_length], dtype=tf.int32)

    if input_reprs is None:
        with tf.variable_scope(
                ((scope if untied_embeddings else 'electra')
                 + '/embeddings'),
                reuse=tf.AUTO_REUSE):
            # Perform embedding lookup on the word ids
            if embedding_size is None:
                embedding_size = bert_config.hidden_size
            (token_embeddings, self.embedding_table) = \
                embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=bert_config.vocab_size,
                    embedding_size=embedding_size,
                    initializer_range=bert_config.initializer_range,
                    word_embedding_name='word_embeddings',
                    use_one_hot_embeddings=use_one_hot_embeddings)

        with tf.variable_scope(
                ((scope if untied_embeddings else 'electra')
                 + '/embeddings'),
                reuse=tf.AUTO_REUSE):
            # Add positional embeddings and token type embeddings, then
            # layer normalize and perform dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=token_embeddings,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=bert_config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=bert_config.initializer_range,
                max_position_embeddings=(
                    bert_config.max_position_embeddings),
                dropout_prob=bert_config.hidden_dropout_prob)
    else:
        self.embedding_output = input_reprs
    if not update_embeddings:
        self.embedding_output = tf.stop_gradient(self.embedding_output)

    with tf.variable_scope(scope, default_name='electra'):
        # Project the embeddings when their width differs from the
        # encoder's hidden size.
        if self.embedding_output.shape[-1] != bert_config.hidden_size:
            self.embedding_output = tf.layers.dense(
                self.embedding_output, bert_config.hidden_size,
                name='embeddings_project')

        with tf.variable_scope('encoder'):
            # This converts a 2D mask of shape [batch_size, seq_length]
            # to a 3D mask of shape [batch_size, seq_length, seq_length]
            # which is used for the attention scores.
            attention_mask = create_attention_mask_from_input_mask(
                token_type_ids, input_mask)

            # Run the stacked transformer. Output shapes
            # attn_maps:
            #   [n_layers, batch_size, n_heads, seq_length, seq_length]
            (self.all_layer_outputs, self.attn_maps) = transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=bert_config.hidden_size,
                num_hidden_layers=bert_config.num_hidden_layers,
                num_attention_heads=bert_config.num_attention_heads,
                intermediate_size=bert_config.intermediate_size,
                intermediate_act_fn=util.get_activation(
                    bert_config.hidden_act),
                hidden_dropout_prob=bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    bert_config.attention_probs_dropout_prob),
                initializer_range=bert_config.initializer_range,
                do_return_all_layers=True)
            self.sequence_output = self.all_layer_outputs[-1]
            self.pooled_output = self.sequence_output[:, 0]
def _lm_forward(self, is_training, input_tensor, input_mask, label_ids,
                bert_config, batch_size, max_seq_length, prob, scope, name,
                sample_weight=None, hidden_dropout_prob=0.1,
                initializer_range=0.02):
    '''Builds a verifier head and a token prediction head on `input_tensor`.

    The verifier is a 2-way classifier per position whose label is
    `label_ids > 0`. The prediction head scores every position against
    `self.embedding_table`; its loss and its predictions are restricted to
    positions the verifier predicts as positive.

    Side effects: when `prob != 0` both mean losses are added to
    `self.total_loss`; `self.losses[name + '_loss']` receives the verifier
    per-example loss and `self.preds[name + '_preds']` the gated token
    predictions.

    Args:
      is_training: accepted by the signature but not used here.
      input_tensor: hidden states; assumed
        [batch_size, max_seq_length, hidden_size] -- TODO confirm.
      input_mask: mask over positions, cast to float32 here.
      label_ids: int Tensor of target ids per position.
      bert_config: config object (`initializer_range`, `hidden_size` and
        `vocab_size` are read).
      batch_size: int or scalar Tensor used for reshaping.
      max_seq_length: int used for reshaping.
      prob: number; the losses only contribute to `self.total_loss` when
        it is non-zero.
      scope: variable scope name.
      name: prefix of the keys written to `self.losses` / `self.preds`.
      sample_weight: (optional) per-example weights; assumed to have shape
        [batch_size] like the per-example losses -- TODO confirm.
      hidden_dropout_prob: accepted by the signature but not used here.
      initializer_range: accepted by the signature but not used here
        (`bert_config.initializer_range` is used instead).
    '''
    if sample_weight is not None:
        sample_weight = tf.cast(sample_weight, dtype=tf.float32)

    with tf.variable_scope(scope):

        with tf.variable_scope('verifier'):
            logits = tf.layers.dense(
                input_tensor, 2,
                kernel_initializer=util.create_initializer(
                    bert_config.initializer_range),
                trainable=True)
            verifier_label_ids = tf.cast(
                tf.greater(label_ids, 0), tf.int32)

            # loss
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(verifier_label_ids, depth=2)
            per_token_loss = -tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)

            input_mask = tf.cast(input_mask, tf.float32)
            # Same epsilon as the prediction loss below, so a fully masked
            # row yields 0 instead of 0/0.
            per_token_loss *= input_mask / (
                tf.reduce_sum(input_mask, keepdims=True, axis=-1) + 1e-6)
            per_example_loss = tf.reduce_sum(per_token_loss, axis=-1)
            if sample_weight is not None:
                # Element-wise weighting: expanding the weights to
                # [batch_size, 1] would broadcast the product to
                # [batch_size, batch_size].
                per_example_loss *= sample_weight

            if prob != 0:
                self.total_loss += tf.reduce_mean(per_example_loss)
            verifier_loss = per_example_loss
            verifier_preds = tf.argmax(logits, axis=-1)

        with tf.variable_scope('prediction'):

            with tf.variable_scope('intermediate'):
                logits = tf.layers.dense(
                    input_tensor, bert_config.hidden_size * 4,
                    kernel_initializer=util.create_initializer(
                        bert_config.initializer_range),
                    activation=util.gelu,
                    trainable=True)
            with tf.variable_scope('output'):
                logits = tf.layers.dense(
                    logits, bert_config.hidden_size,
                    kernel_initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=True)

            # Score every position against the embedding table.
            flattened = tf.reshape(
                logits,
                [batch_size * max_seq_length, bert_config.hidden_size])
            logits = tf.matmul(
                flattened, self.embedding_table, transpose_b=True)
            logits = tf.reshape(
                logits, [-1, max_seq_length, bert_config.vocab_size])

            # loss
            log_probs = tf.nn.log_softmax(logits, axis=-1)
            one_hot_labels = tf.one_hot(
                label_ids, depth=bert_config.vocab_size)
            per_token_loss = -tf.reduce_sum(
                one_hot_labels * log_probs, axis=-1)

            # Only positions accepted by the verifier contribute.
            input_mask *= tf.cast(verifier_preds, tf.float32)
            per_token_loss *= input_mask / (
                tf.reduce_sum(input_mask, keepdims=True, axis=-1) + 1e-6)
            per_example_loss = tf.reduce_sum(per_token_loss, axis=-1)
            if sample_weight is not None:
                per_example_loss *= sample_weight

            if prob != 0:
                self.total_loss += tf.reduce_mean(per_example_loss)
            self.losses[name + '_loss'] = verifier_loss
            self.preds[name + '_preds'] = \
                tf.argmax(logits, axis=-1) * verifier_preds
def _get_generator_output(self, inputs, sample_weight, generator):
    '''Masked language modeling softmax layer of the generator.

    Gathers the generator's hidden states at `inputs.masked_lm_positions`,
    transforms them with a dense layer and layer normalization, and scores
    them against the generator's embedding table plus an output bias.

    Args:
      inputs: Object exposing `masked_lm_positions`, `masked_lm_ids` and
        `masked_lm_weights`.
      sample_weight: Optional weights; when given they are cast to float32,
        expanded on the last axis and multiplied into the masked-LM weights.
      generator: Encoder exposing `get_sequence_output()` and
        `get_embedding_table()`.

    Returns:
      An `MLMOutput` namedtuple with fields `logits`, `probs`, `loss`,
      `per_example_loss` and `preds`. `loss` is the weighted sum of the
      per-position losses divided by the sum of the weights (plus 1e-6).
    '''

    def gather_indexes(sequence_tensor, positions):
        # Flatten the first two axes and pick rows at `positions`, after
        # shifting each example's positions by its row offset.
        dims = util.get_shape_list(sequence_tensor, 3)
        n_rows, n_steps, n_units = dims[0], dims[1], dims[2]

        row_starts = tf.range(0, n_rows, dtype=tf.int32) * n_steps
        row_starts = tf.reshape(row_starts, [-1, 1])
        flat_index = tf.reshape(positions + row_starts, [-1])
        flat_sequence = tf.reshape(
            sequence_tensor, [n_rows * n_steps, n_units])
        return tf.gather(flat_sequence, flat_index)

    masked_hidden = gather_indexes(
        generator.get_sequence_output(), inputs.masked_lm_positions)

    with tf.variable_scope('generator_predictions'):
        act_fn = util.get_activation(self.bert_config.hidden_act)
        kernel_init = util.create_initializer(
            self.bert_config.initializer_range)
        masked_hidden = tf.layers.dense(
            masked_hidden,
            units=self.config.embedding_size,
            activation=act_fn,
            kernel_initializer=kernel_init)
        masked_hidden = util.layer_norm(masked_hidden)

        vocab_bias = tf.get_variable(
            'output_bias',
            shape=[self.bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(
            masked_hidden, generator.get_embedding_table(),
            transpose_b=True)
        logits = tf.nn.bias_add(logits, vocab_bias)

        probs = tf.nn.softmax(logits, axis=-1, name='MLM_probs')
        preds = tf.argmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(inputs.masked_lm_ids, [-1])
        masked_lm_weights = inputs.masked_lm_weights
        if sample_weight is not None:
            sample_weight = tf.expand_dims(
                tf.cast(sample_weight, dtype=tf.float32), axis=-1)
            masked_lm_weights *= sample_weight
        label_weights = tf.reshape(masked_lm_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=self.bert_config.vocab_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(
            log_probs * one_hot_labels, axis=[-1])
        per_example_loss = label_weights * per_example_loss

        # Weighted mean; the epsilon guards against an all-zero weight sum.
        weighted_total = tf.reduce_sum(per_example_loss)
        weight_total = tf.reduce_sum(label_weights) + 1e-6
        loss = weighted_total / weight_total

        MLMOutput = collections.namedtuple(
            'MLMOutput',
            ['logits', 'probs', 'loss', 'per_example_loss', 'preds'])
        return MLMOutput(
            logits=logits,
            probs=probs,
            per_example_loss=per_example_loss,
            loss=loss,
            preds=preds)
def __init__(self, bert_config, is_training, input_ids, add_label_ids,
             del_label_ids, sample_weight=None, add_prob=0, del_prob=0,
             scope='bert', use_tilda_embedding=False, **kwargs):
    '''Builds the shared encoder plus the `add` and `del` heads.

    The input mask is derived from `input_ids` (id 0 is treated as
    padding). The encoder runs once via `self._bert_forward`; its output
    feeds `self._lm_forward` (scope `cls/add`) and `self._cls_forward`
    (scope `cls/del`), both of which accumulate into `self.total_loss`.

    Args:
      bert_config: BERT configuration. It is not modified: outside of
        training a private copy with zeroed dropout rates is used.
      is_training: Whether the graph is built for training.
      input_ids: Rank-2 tensor of token ids.
      add_label_ids: Labels passed to the `add` head.
      del_label_ids: Labels passed to the `del` head.
      sample_weight: Optional weights forwarded to both heads.
      add_prob: Forwarded to the `add` head as `prob`.
      del_prob: Forwarded to the `del` head as `prob`.
      scope: Variable scope of the model.
      use_tilda_embedding: If True, reuse the existing top-level variable
        `tilda_embeddings` (SMART algorithm) for the embedding lookup.
      **kwargs: Accepted for interface compatibility; not used here.
    '''
    import copy

    super().__init__()

    input_mask = tf.cast(tf.not_equal(input_ids, 0), tf.float32)

    shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = shape[0]
    max_seq_length = shape[1]

    if not is_training:
        # Work on a copy so that the caller's config keeps its dropout
        # rates (it may be reused to build a training graph later).
        bert_config = copy.deepcopy(bert_config)
        bert_config.hidden_dropout_prob = 0.0
        bert_config.attention_probs_dropout_prob = 0.0

    # Tilda embeddings for SMART algorithm
    tilda_embeddings = None
    if use_tilda_embedding:
        with tf.variable_scope('', reuse=True):
            tilda_embeddings = tf.get_variable('tilda_embeddings')

    with tf.variable_scope(scope):

        # forward once
        hidden = self._bert_forward(
            bert_config,
            input_ids,
            input_mask,
            batch_size,
            max_seq_length,
            tilda_embeddings=tilda_embeddings)

        # Both heads add their loss terms to this accumulator.
        self.total_loss = 0
        self._lm_forward(
            is_training,
            input_tensor=hidden,
            input_mask=input_mask,
            label_ids=add_label_ids,
            bert_config=bert_config,
            batch_size=batch_size,
            max_seq_length=max_seq_length,
            prob=add_prob,
            scope='cls/add',
            name='add',
            sample_weight=sample_weight)
        self._cls_forward(
            is_training,
            input_tensor=hidden,
            input_mask=input_mask,
            label_ids=del_label_ids,
            bert_config=bert_config,
            batch_size=batch_size,
            max_seq_length=max_seq_length,
            prob=del_prob,
            scope='cls/del',
            name='del',
            sample_weight=sample_weight)
def __init__(self, is_training, input_tensor, is_supervised, is_expanded,
             label_ids, label_size=2, sample_weight=None,
             scope='cls/seq_relationship', hidden_dropout_prob=0.1,
             initializer_range=0.02, trainable=True, global_step=None,
             num_train_steps=None, uda_softmax_temp=-1,
             uda_confidence_thresh=-1, tsa_schedule='linear', **kwargs):
    '''Builds a classifier with a supervised and a consistency (KL) loss.

    A linear classifier is applied to `input_tensor`. Rows are split by
    the `is_expanded` and `is_supervised` flags:

    * supervised loss: cross-entropy on the non-expanded, supervised rows,
      optionally masked by training signal annealing (TSA);
    * unsupervised loss: KL divergence between the predictions on the
      non-expanded, unsupervised rows and those on the expanded rows.

    Writes `self.preds['preds']`, `self.losses['supervised']`,
    `self.losses['unsupervised']` and `self.total_loss`.

    Args:
      is_training: Enables dropout and TSA.
      input_tensor: Features fed to the classifier; the static size of
        its last axis is the classifier input width.
      is_supervised: Flags selecting supervised rows (cast to float32).
      is_expanded: Flags selecting expanded rows (cast to float32).
      label_ids: Labels; filtered with `is_supervised`.
      label_size: Number of classes.
      sample_weight: Optional weights; filtered with the same flags as
        the loss they scale.
      scope: Variable scope of the classifier.
      hidden_dropout_prob: Dropout rate used when training.
      initializer_range: Passed to `util.create_initializer`.
      trainable: Whether the classifier variables are trainable.
      global_step: Forwarded to `get_tsa_threshold`.
      num_train_steps: Forwarded to `get_tsa_threshold`.
      uda_softmax_temp: Temperature for the target distribution; -1
        disables sharpening.
      uda_confidence_thresh: Minimum top probability for a row to count
        in the unsupervised loss; -1 disables the filter.
      tsa_schedule: TSA schedule name; a falsy value disables TSA.
      **kwargs: Forwarded to the parent constructor.
    '''
    super().__init__(**kwargs)

    is_supervised = tf.cast(is_supervised, tf.float32)
    is_expanded = tf.cast(is_expanded, tf.float32)

    hidden_size = input_tensor.shape.as_list()[-1]
    dropout_rate = hidden_dropout_prob if is_training else 0.0

    with tf.variable_scope(scope):
        cls_kernel = tf.get_variable(
            'output_weights',
            shape=[label_size, hidden_size],
            initializer=util.create_initializer(initializer_range),
            trainable=trainable)
        cls_bias = tf.get_variable(
            'output_bias',
            shape=[label_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        features = util.dropout(input_tensor, dropout_rate)
        logits = tf.matmul(features, cls_kernel, transpose_b=True)
        logits = tf.nn.bias_add(logits, cls_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        with tf.variable_scope('sup_loss'):

            # Select the non-expanded rows, then the supervised ones.
            sup_ori_log_probs = tf.boolean_mask(
                log_probs, mask=(1.0 - is_expanded), axis=0)
            sup_log_probs = tf.boolean_mask(
                sup_ori_log_probs, mask=is_supervised, axis=0)
            sup_label_ids = tf.boolean_mask(
                label_ids, mask=is_supervised, axis=0)

            self.preds['preds'] = tf.argmax(sup_ori_log_probs, axis=-1)

            one_hot_labels = tf.one_hot(
                sup_label_ids, depth=label_size, dtype=tf.float32)
            per_example_loss = -tf.reduce_sum(
                one_hot_labels * sup_log_probs, axis=-1)

            loss_mask = tf.ones_like(per_example_loss, dtype=tf.float32)
            correct_label_probs = tf.reduce_sum(
                one_hot_labels * tf.exp(sup_log_probs), axis=-1)

            if is_training and tsa_schedule:
                # TSA: drop examples whose gold-label probability already
                # exceeds the scheduled threshold.
                tsa_threshold = get_tsa_threshold(
                    tsa_schedule, global_step, num_train_steps,
                    1.0 / label_size, end=1)
                above_threshold = tf.greater(
                    correct_label_probs, tsa_threshold)
                keep = 1 - tf.cast(above_threshold, tf.float32)
                loss_mask = loss_mask * keep

            loss_mask = tf.stop_gradient(loss_mask)
            per_example_loss = per_example_loss * loss_mask

            if sample_weight is not None:
                sup_sample_weight = tf.boolean_mask(
                    sample_weight, mask=is_supervised, axis=0)
                per_example_loss *= tf.cast(
                    sup_sample_weight, dtype=tf.float32)

            # Normalize by the number of kept examples (at least 1).
            kept_count = tf.maximum(tf.reduce_sum(loss_mask), 1)
            sup_loss = tf.reduce_sum(per_example_loss) / kept_count
            self.losses['supervised'] = per_example_loss

        with tf.variable_scope('unsup_loss'):

            # Original (non-expanded, unsupervised) vs. expanded rows.
            ori_log_probs = tf.boolean_mask(
                sup_ori_log_probs, mask=(1.0 - is_supervised), axis=0)
            aug_log_probs = tf.boolean_mask(
                log_probs, mask=is_expanded, axis=0)
            sup_ori_logits = tf.boolean_mask(
                logits, mask=(1.0 - is_expanded), axis=0)
            ori_logits = tf.boolean_mask(
                sup_ori_logits, mask=(1.0 - is_supervised), axis=0)

            unsup_loss_mask = 1

            # The target distribution never receives gradients.
            if uda_softmax_temp == -1:
                tgt_ori_log_probs = tf.stop_gradient(ori_log_probs)
            else:
                tgt_ori_log_probs = tf.nn.log_softmax(
                    ori_logits / uda_softmax_temp, axis=-1)
                tgt_ori_log_probs = tf.stop_gradient(tgt_ori_log_probs)

            if uda_confidence_thresh != -1:
                largest_prob = tf.reduce_max(
                    tf.exp(ori_log_probs), axis=-1)
                unsup_loss_mask = tf.cast(
                    tf.greater(largest_prob, uda_confidence_thresh),
                    tf.float32)
                unsup_loss_mask = tf.stop_gradient(unsup_loss_mask)

            per_example_loss = kl_for_log_probs(
                tgt_ori_log_probs, aug_log_probs) * unsup_loss_mask

            if sample_weight is not None:
                unsup_sample_weight = tf.boolean_mask(
                    sample_weight, mask=(1.0 - is_supervised), axis=0)
                per_example_loss *= tf.cast(
                    unsup_sample_weight, dtype=tf.float32)

            unsup_loss = tf.reduce_mean(per_example_loss)
            self.losses['unsupervised'] = per_example_loss

        self.total_loss = sup_loss + unsup_loss
def _bert_forward(self, bert_config, input_ids, input_mask, batch_size,
                  max_seq_length, dtype=tf.float32, trainable=True,
                  tilda_embeddings=None):
    '''Runs the embedding layers and the transformer stack once.

    Stores the word embedding table in `self.embedding_table`.

    Args:
      bert_config: Supplies vocabulary size, hidden size, layer and head
        counts, dropout rates, activation and initializer range.
      input_ids: Token ids to embed.
      input_mask: Mask from which the attention mask is derived.
      batch_size: Batch size.
      max_seq_length: Sequence length.
      dtype: Dtype forwarded to the embedding and encoder layers.
      trainable: Whether the created variables are trainable.
      tilda_embeddings: Optional embedding tensor forwarded to
        `self.embedding_lookup`.

    Returns:
      The output of the last transformer layer.
    '''
    init_range = bert_config.initializer_range
    width = bert_config.hidden_size

    with tf.variable_scope('embeddings'):
        word_embeddings, table = self.embedding_lookup(
            input_ids=input_ids,
            vocab_size=bert_config.vocab_size,
            batch_size=batch_size,
            max_seq_length=max_seq_length,
            embedding_size=width,
            initializer_range=init_range,
            word_embedding_name='word_embeddings',
            dtype=dtype,
            trainable=trainable,
            tilda_embeddings=tilda_embeddings)
        self.embedding_table = table

        # Position embeddings only (no token types), followed by layer
        # normalization and dropout.
        encoder_input = self.embedding_postprocessor(
            input_tensor=word_embeddings,
            batch_size=batch_size,
            max_seq_length=max_seq_length,
            hidden_size=width,
            use_token_type=False,
            use_position_embeddings=True,
            position_embedding_name='position_embeddings',
            initializer_range=init_range,
            max_position_embeddings=bert_config.max_position_embeddings,
            dropout_prob=bert_config.hidden_dropout_prob,
            dtype=dtype,
            trainable=trainable)

    with tf.variable_scope('encoder'):
        pairwise_mask = self.create_attention_mask_from_input_mask(
            input_mask, batch_size, max_seq_length, dtype=dtype)

        # stacked transformers
        layer_outputs = self.transformer_model(
            input_tensor=encoder_input,
            batch_size=batch_size,
            max_seq_length=max_seq_length,
            attention_mask=pairwise_mask,
            hidden_size=width,
            num_hidden_layers=bert_config.num_hidden_layers,
            num_attention_heads=bert_config.num_attention_heads,
            intermediate_size=bert_config.intermediate_size,
            intermediate_act_fn=util.get_activation(
                bert_config.hidden_act),
            hidden_dropout_prob=bert_config.hidden_dropout_prob,
            attention_probs_dropout_prob=(
                bert_config.attention_probs_dropout_prob),
            initializer_range=init_range,
            dtype=dtype,
            trainable=trainable)

    return layer_outputs[-1]
def transformer_model(self, input_tensor, batch_size, max_seq_length,
                      attention_mask=None, hidden_size=768,
                      num_hidden_layers=12, num_attention_heads=12,
                      intermediate_size=3072,
                      intermediate_act_fn=util.gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02, dtype=tf.float32,
                      trainable=True):
    '''Stacks `num_hidden_layers` transformer encoder layers.

    Each layer is self-attention, a dense output projection with dropout,
    residual and layer norm, then a feed-forward block (`intermediate` +
    `output`) with dropout, residual and layer norm. The attention scores
    of every layer are collected in `self.attention_scores`.

    Args:
      input_tensor: Input hidden states; reshaped to a matrix internally.
      batch_size: Batch size.
      max_seq_length: Sequence length.
      attention_mask: Optional mask forwarded to `self.attention_layer`.
      hidden_size: Width of the hidden states.
      num_hidden_layers: Number of layers.
      num_attention_heads: Number of heads; must divide `hidden_size`.
      intermediate_size: Width of the feed-forward layer.
      intermediate_act_fn: Activation of the feed-forward layer.
      hidden_dropout_prob: Dropout rate after the dense projections.
      attention_probs_dropout_prob: Dropout rate forwarded to
        `self.attention_layer`.
      initializer_range: Passed to `util.create_initializer`.
      dtype: Dtype forwarded to the attention layer.
      trainable: Whether the created variables are trainable.

    Returns:
      A list with the output of every layer, each reshaped to
      `[batch_size, max_seq_length, hidden_size]`.

    Raises:
      ValueError: If `hidden_size` is not a multiple of
        `num_attention_heads`.
    '''
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            'The hidden size (%d) is not a multiple of the number '
            'of attention heads (%d)'
            % (hidden_size, num_attention_heads))
    head_width = int(hidden_size / num_attention_heads)

    self.attention_scores = []
    matrix_outputs = []
    hidden = util.reshape_to_matrix(input_tensor)

    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope('layer_%d' % layer_idx):
            residual = hidden

            with tf.variable_scope('attention'):
                with tf.variable_scope('self'):
                    context, scores = self.attention_layer(
                        from_tensor=residual,
                        to_tensor=residual,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=head_width,
                        attention_probs_dropout_prob=(
                            attention_probs_dropout_prob),
                        initializer_range=initializer_range,
                        do_return_2d_tensor=True,
                        batch_size=batch_size,
                        from_max_seq_length=max_seq_length,
                        to_max_seq_length=max_seq_length,
                        dtype=dtype,
                        trainable=trainable)
                    self.attention_scores.append(scores)

                with tf.variable_scope('output'):
                    attended = tf.layers.dense(
                        context,
                        hidden_size,
                        kernel_initializer=util.create_initializer(
                            initializer_range),
                        trainable=trainable)
                    attended = util.dropout(attended, hidden_dropout_prob)
                    attended = util.layer_norm(
                        attended + residual, trainable=trainable)

            # The activation is only applied to the `intermediate`
            # hidden layer.
            with tf.variable_scope('intermediate'):
                expanded = tf.layers.dense(
                    attended,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=util.create_initializer(
                        initializer_range),
                    trainable=trainable)

            # Down-project back to hidden_size then add the residual.
            with tf.variable_scope('output'):
                projected = tf.layers.dense(
                    expanded,
                    hidden_size,
                    kernel_initializer=util.create_initializer(
                        initializer_range),
                    trainable=trainable)
                projected = util.dropout(projected, hidden_dropout_prob)
                hidden = util.layer_norm(
                    projected + attended, trainable=trainable)

            matrix_outputs.append(hidden)

    # Restore the 3-D layout of every layer output.
    matrix_shape = [batch_size * max_seq_length, hidden_size]
    tensor_shape = [batch_size, max_seq_length, hidden_size]
    return [
        util.reshape_from_matrix(
            matrix_output, tensor_shape, original_shape=matrix_shape)
        for matrix_output in matrix_outputs
    ]
def layer_norm(input_tensor, center=True, scale=True, activation_fn=None,
               variables_collections=None, outputs_collections=None,
               begin_norm_axis=-1, begin_params_axis=-1, trainable=True):
    '''Applies layer normalization inside the `LayerNorm` variable scope.

    Moments are computed over the axes `begin_norm_axis ... R - 1` of a
    rank-`R` input, and the `beta` / `gamma` parameters have the shape
    `input_tensor.shape[begin_params_axis:]`.

    Args:
      input_tensor: A tensor whose rank is known at graph build time.
      center: If True, create `beta` (zeros) and add it as the offset.
      scale: If True, create `gamma` (ones) and multiply by it.
      activation_fn: Optional callable applied to the normalized output.
      variables_collections: Accepted but not used by this function.
      outputs_collections: Accepted but not used by this function.
      begin_norm_axis: First normalization axis; a negative value is
        counted from the end.
      begin_params_axis: First axis of the parameter shape.
      trainable: Whether `beta` and `gamma` are trainable.

    Returns:
      A tensor with the same static shape as `input_tensor`.

    Raises:
      ValueError: If the rank of `input_tensor` is unknown, if either
        axis is not smaller than the rank, or if
        `input_tensor.shape[begin_params_axis:]` is not fully defined.
    '''
    with tf.variable_scope('LayerNorm'):
        static_shape = input_tensor.shape
        rank = static_shape.ndims
        if rank is None:
            raise ValueError(
                'Inputs %s has undefined rank.' % input_tensor.name)

        param_dtype = input_tensor.dtype.base_dtype

        if begin_norm_axis < 0:
            begin_norm_axis += rank
        if begin_params_axis >= rank or begin_norm_axis >= rank:
            raise ValueError(
                'begin_params_axis (%d) and begin_norm_axis (%d) '
                'must be < rank(inputs) (%d)'
                % (begin_params_axis, begin_norm_axis, rank))

        params_shape = static_shape[begin_params_axis:]
        if not params_shape.is_fully_defined():
            raise ValueError(
                'Inputs %s: shape(inputs)[%s:] is not fully defined: %s'
                % (input_tensor.name, begin_params_axis, static_shape))

        # Offset and scale parameters are optional.
        beta = None
        if center:
            beta = tf.get_variable(
                'beta',
                shape=params_shape,
                dtype=param_dtype,
                initializer=tf.zeros_initializer(),
                trainable=trainable)
        gamma = None
        if scale:
            gamma = tf.get_variable(
                'gamma',
                shape=params_shape,
                dtype=param_dtype,
                initializer=tf.ones_initializer(),
                trainable=trainable)

        # Moments over the axes from `begin_norm_axis` to the last one.
        norm_axes = list(range(begin_norm_axis, rank))
        mean, variance = tf.nn.moments(
            input_tensor, norm_axes, keep_dims=True)

        # float16 needs a larger epsilon because of its limited
        # representable range.
        if param_dtype == tf.float16:
            variance_epsilon = 1e-3
        else:
            variance_epsilon = 1e-12

        normalized = tf.nn.batch_normalization(
            input_tensor, mean, variance,
            offset=beta,
            scale=gamma,
            variance_epsilon=variance_epsilon)
        normalized.set_shape(static_shape)

        if activation_fn is None:
            return normalized
        return activation_fn(normalized)
def __init__(self, bert_config, is_training, encoder, masked_lm_positions,
             masked_lm_ids, masked_lm_weights, next_sentence_labels,
             sample_weight=None, scope_lm='cls/predictions',
             scope_cls='cls/seq_relationship', trainable=True,
             use_nsp_loss=True, **kwargs):
    '''Builds the masked-LM head and the next-sentence-prediction head.

    Writes `self.losses['MLM_losses']`, `self.preds['MLM_preds']`,
    `self.losses['NSP_losses']`, `self.probs['NSP_probs']`,
    `self.preds['NSP_preds']` and `self.total_loss` (the MLM loss, plus
    the NSP loss when `use_nsp_loss` is True).

    Args:
      bert_config: Supplies `hidden_size`, `hidden_act`, `vocab_size` and
        `initializer_range`.
      is_training: Unused by this constructor.
      encoder: Encoder exposing `get_sequence_output()`,
        `get_embedding_table()` and `get_pooled_output()`.
      masked_lm_positions: Positions of the masked tokens per example.
      masked_lm_ids: Target ids of the masked tokens.
      masked_lm_weights: Weights of the masked tokens.
      next_sentence_labels: Binary labels of the NSP task.
      sample_weight: Optional weights, assumed to hold one value per
        example -- TODO confirm against the caller. They scale both the
        masked-LM weights and the NSP per-example loss.
      scope_lm: Variable scope of the masked-LM head.
      scope_cls: Variable scope of the NSP head.
      trainable: Whether the output bias/weight variables are trainable.
      use_nsp_loss: Whether the NSP loss contributes to `total_loss`.
      **kwargs: Forwarded to the parent constructor.
    '''
    super(BERTDecoder, self).__init__(**kwargs)

    def gather_indexes(sequence_tensor, positions):
        # Flatten the first two axes and gather the rows at `positions`,
        # offset by each example's start row.
        sequence_shape = util.get_shape_list(sequence_tensor, 3)
        batch_size = sequence_shape[0]
        seq_length = sequence_shape[1]
        width = sequence_shape[2]

        flat_offsets = tf.reshape(
            tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
        flat_positions = tf.reshape(positions + flat_offsets, [-1])
        flat_sequence_tensor = tf.reshape(
            sequence_tensor, [batch_size * seq_length, width])
        output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
        return output_tensor

    scalar_losses = []

    # masked language modeling
    input_tensor = gather_indexes(
        encoder.get_sequence_output(), masked_lm_positions)
    with tf.variable_scope(scope_lm):
        with tf.variable_scope('transform'):
            # NOTE(review): `trainable` is not forwarded to this dense
            # layer nor to the layer norm -- confirm whether intended.
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=util.get_activation(bert_config.hidden_act),
                kernel_initializer=util.create_initializer(
                    bert_config.initializer_range))
            input_tensor = util.layer_norm(input_tensor)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        # The output weights are tied to the input embedding table.
        logits = tf.matmul(
            input_tensor, encoder.get_embedding_table(), transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probs = tf.nn.softmax(logits, axis=-1, name='MLM_probs')
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(masked_lm_ids, [-1])
        if sample_weight is not None:
            # Keep the expanded copy under its own name: the NSP head
            # below needs the original, non-expanded `sample_weight`.
            mlm_sample_weight = tf.expand_dims(
                tf.cast(sample_weight, dtype=tf.float32), axis=-1)
            masked_lm_weights *= mlm_sample_weight
        label_weights = tf.reshape(masked_lm_weights, [-1])
        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        per_example_loss = -tf.reduce_sum(
            log_probs * one_hot_labels, axis=[-1])
        per_example_loss = label_weights * per_example_loss

        # Weighted mean; the epsilon guards an all-zero weight sum.
        numerator = tf.reduce_sum(per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

        scalar_losses.append(loss)
        self.losses['MLM_losses'] = per_example_loss
        self.preds['MLM_preds'] = tf.argmax(probs, axis=-1)

    # next sentence prediction
    with tf.variable_scope(scope_cls):
        output_weights = tf.get_variable(
            'output_weights',
            shape=[2, bert_config.hidden_size],
            initializer=util.create_initializer(
                bert_config.initializer_range),
            trainable=trainable)
        output_bias = tf.get_variable(
            'output_bias',
            shape=[2],
            initializer=tf.zeros_initializer(),
            trainable=trainable)

        logits = tf.matmul(
            encoder.get_pooled_output(), output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        labels = tf.reshape(next_sentence_labels, [-1])
        one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(
            one_hot_labels * log_probs, axis=-1)
        if sample_weight is not None:
            # Element-wise product with the non-expanded weights; a
            # column-shaped weight would broadcast the loss to a matrix.
            per_example_loss = (
                tf.cast(sample_weight, dtype=tf.float32) *
                per_example_loss)
        loss = tf.reduce_mean(per_example_loss)

        if use_nsp_loss:
            scalar_losses.append(loss)
        self.losses['NSP_losses'] = per_example_loss
        self.probs['NSP_probs'] = probs
        self.preds['NSP_preds'] = tf.argmax(probs, axis=-1)

    self.total_loss = tf.add_n(scalar_losses)
def dynamic_transformer_model(self, is_training, input_tensor, input_mask,
                              batch_size, max_seq_length, label_size,
                              attention_mask=None, hidden_size=768,
                              num_hidden_layers=12,
                              num_attention_heads=12,
                              intermediate_size=3072,
                              intermediate_act_fn=util.gelu,
                              hidden_dropout_prob=0.1,
                              attention_probs_dropout_prob=0.1,
                              initializer_range=0.02, dtype=tf.float32,
                              cls_model='self-attention',
                              cls_hidden_size=128,
                              cls_num_attention_heads=2, speed=0.1,
                              ignore_cls=None):
    '''Transformer stack with a child classifier in front of each layer.

    Before every layer a child classifier (variable scope `distill`)
    predicts from the current hidden states. In training every child is
    built. In inference only children whose index is not in `ignore_cls`
    are built; examples whose normalized uncertainty is below `speed` are
    removed from the batch before the next layer, and the function
    returns at the last kept child. The backbone layers are created with
    `trainable=False`, the child classifiers with `trainable=True`.

    Args:
      is_training: Whether the graph is built for training.
      input_tensor: Input hidden states.
      input_mask: Input mask, filtered along with the hidden states.
      batch_size: Batch size (re-derived after each child classifier).
      max_seq_length: Sequence length (re-derived likewise).
      label_size: Number of classes of the child classifiers.
      attention_mask: Attention mask of the first layer.
      hidden_size: Width of the hidden states.
      num_hidden_layers: Number of backbone layers.
      num_attention_heads: Number of heads; must divide `hidden_size`.
      intermediate_size: Width of the feed-forward layer.
      intermediate_act_fn: Activation of the feed-forward layer.
      hidden_dropout_prob: Dropout rate after the dense projections.
      attention_probs_dropout_prob: Dropout rate of the attention layers.
      initializer_range: Passed to the initializers.
      dtype: Dtype of the masks and the backbone attention.
      cls_model: One of `self-attention-paper`, `self-attention`, `fcn`.
      cls_hidden_size: Hidden size of attention-based child classifiers.
      cls_num_attention_heads: Heads of attention-based classifiers.
      speed: Uncertainty threshold for early exit in inference.
      ignore_cls: Indices of child classifiers to skip in inference;
        `None` means none are skipped.

    Returns:
      A tuple `(all_layer_outputs, all_layer_cls_outputs)`: the list of
      layer outputs and an `OrderedDict` mapping child index to its
      softmax output.

    Raises:
      ValueError: If `hidden_size` is not a multiple of
        `num_attention_heads`, or if `cls_model` is not recognized.
    '''
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            'The hidden size (%d) is not a multiple of the number of '
            'attention heads (%d)' % (hidden_size, num_attention_heads))
    attention_head_size = int(hidden_size / num_attention_heads)

    # The default `None` must behave like "ignore nothing"; the
    # membership tests below would raise TypeError on `None`.
    if ignore_cls is None:
        ignore_cls = []

    keep_cls = list(range(num_hidden_layers + 1))
    keep_cls = [
        cls_idx for cls_idx in keep_cls if cls_idx not in ignore_cls
    ]

    all_layer_outputs = []
    all_layer_cls_outputs = collections.OrderedDict()
    prev_output = input_tensor
    prev_mask = input_mask

    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope('layer_%d' % layer_idx):

            # build child classifier
            if is_training or layer_idx not in ignore_cls:
                with tf.variable_scope('distill'):

                    # FCN + Self_Attention + FCN + FCN
                    if cls_model == 'self-attention-paper':
                        cls_output = self._cls_self_attention_paper(
                            prev_output,
                            batch_size,
                            max_seq_length,
                            label_size,
                            attention_mask=attention_mask,
                            cls_hidden_size=cls_hidden_size,
                            cls_num_attention_heads=(
                                cls_num_attention_heads),
                            attention_probs_dropout_prob=(
                                attention_probs_dropout_prob),
                            initializer_range=initializer_range,
                            dtype=tf.float32,
                            trainable=True)

                    # Self_Attention + FCN
                    elif cls_model == 'self-attention':
                        cls_output = self._cls_self_attention(
                            prev_output,
                            batch_size,
                            max_seq_length,
                            label_size,
                            attention_mask=attention_mask,
                            cls_hidden_size=cls_hidden_size,
                            cls_num_attention_heads=(
                                cls_num_attention_heads),
                            attention_probs_dropout_prob=(
                                attention_probs_dropout_prob),
                            initializer_range=initializer_range,
                            dtype=tf.float32,
                            trainable=True)

                    # FCN
                    elif cls_model == 'fcn':
                        cls_output = self._cls_fcn(
                            prev_output,
                            label_size,
                            hidden_size=hidden_size,
                            initializer_range=initializer_range,
                            dtype=tf.float32,
                            trainable=True)

                    else:
                        raise ValueError(
                            'Invalid `cls_model = %s`. Pick one from '
                            '`self-attention-paper`, `self-attention` '
                            'and `fcn`' % cls_model)

                    # distill core: entropy of the child prediction,
                    # divided by log(1 / label_size).
                    layer_cls_output = tf.nn.softmax(
                        cls_output, axis=-1, name='cls_%d' % layer_idx)
                    uncertainty = tf.reduce_sum(
                        layer_cls_output * tf.log(layer_cls_output),
                        axis=-1)
                    uncertainty /= tf.log(1 / label_size)

                    # branching only in inference
                    if not is_training:

                        # last output
                        if layer_idx == keep_cls[-1]:
                            all_layer_outputs.append(prev_output)
                            all_layer_cls_outputs[layer_idx] = \
                                layer_cls_output
                            return (all_layer_outputs,
                                    all_layer_cls_outputs)

                        # Keep only the examples that are still too
                        # uncertain to exit at this layer.
                        mask = tf.less(uncertainty, speed)
                        unfinished_mask = \
                            (tf.ones_like(mask, dtype=dtype) -
                             tf.cast(mask, dtype=dtype))
                        prev_output = tf.boolean_mask(
                            prev_output, mask=unfinished_mask, axis=0)
                        prev_mask = tf.boolean_mask(
                            prev_mask, mask=unfinished_mask, axis=0)
                    all_layer_cls_outputs[layer_idx] = layer_cls_output

                    # new attention mask
                    input_shape = util.get_shape_list(prev_output)
                    batch_size = input_shape[0]
                    max_seq_length = input_shape[1]
                    attention_mask = \
                        self.create_attention_mask_from_input_mask(
                            prev_mask, batch_size, max_seq_length,
                            dtype=dtype)

            # original stream
            with tf.variable_scope('attention'):
                attention_heads = []
                with tf.variable_scope('self'):
                    (attention_head, _) = self.attention_layer(
                        from_tensor=prev_output,
                        to_tensor=prev_output,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=(
                            attention_probs_dropout_prob),
                        initializer_range=initializer_range,
                        do_return_2d_tensor=False,
                        batch_size=batch_size,
                        from_max_seq_length=max_seq_length,
                        to_max_seq_length=max_seq_length,
                        dtype=dtype,
                        trainable=False)
                    attention_heads.append(attention_head)

                attention_output = None
                if len(attention_heads) == 1:
                    attention_output = attention_heads[0]
                else:
                    attention_output = tf.concat(attention_heads, axis=-1)

                with tf.variable_scope('output'):
                    attention_output = tf.layers.dense(
                        attention_output,
                        hidden_size,
                        kernel_initializer=util.create_initializer(
                            initializer_range),
                        trainable=False)
                    attention_output = util.dropout(
                        attention_output, hidden_dropout_prob)
                    attention_output = util.layer_norm(
                        attention_output + prev_output, trainable=False)

            # The activation is only applied to the `intermediate`
            # hidden layer.
            with tf.variable_scope('intermediate'):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=util.create_initializer(
                        initializer_range),
                    trainable=False)

            # Down-project back to hidden_size then add the residual.
            with tf.variable_scope('output'):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=util.create_initializer(
                        initializer_range),
                    trainable=False)
                layer_output = util.dropout(
                    layer_output, hidden_dropout_prob)
                layer_output = util.layer_norm(
                    layer_output + attention_output, trainable=False)

            prev_output = layer_output
            all_layer_outputs.append(layer_output)

    return (all_layer_outputs, all_layer_cls_outputs)
def __init__(self, bert_config, is_training, input_tensor, sa_mask,
             label_ids, sample_weight=None, scope='sanet', alpha=0.5,
             hidden_dropout_prob=0.1, initializer_range=0.02,
             trainable=True, **kwargs):
    '''Builds a span-prediction head on top of a sentence attention layer.

    `input_tensor` attends to itself under `sa_mask`; the result is mixed
    with the input (`alpha * attention + (1 - alpha) * input`) and fed to
    a 2-way linear layer whose two outputs are the start and end logits
    over the sequence positions.

    Writes `self.probs['probs']`, `self.preds['preds']`,
    `self.total_loss` and `self.losses['losses']`.

    Args:
      bert_config: Supplies `num_attention_heads`,
        `hidden_dropout_prob` (used as the attention dropout rate) and
        `initializer_range` (for the attention layer).
      is_training: Attention dropout is only applied when True.
      input_tensor: Rank-3 hidden states.
      sa_mask: Attention mask; reshaped to
        `[batch_size, seq_length, seq_length]`.
      label_ids: Labels whose column 0 is the start position and column 1
        the end position.
      sample_weight: Optional per-example weights.
      scope: Variable scope of the head.
      alpha: Mixing weight of the sentence attention output.
      hidden_dropout_prob: Unused; the rate comes from `bert_config`.
      initializer_range: Initializer range of the output weights.
      trainable: Whether the created variables are trainable.
      **kwargs: Forwarded to the parent constructor.
    '''
    super().__init__(**kwargs)

    shape = util.get_shape_list(input_tensor)
    batch_size = shape[0]
    seq_length = shape[1]
    hidden_size = shape[2]
    sa_mask = tf.reshape(sa_mask, [batch_size, seq_length, seq_length])

    # Dropout must be disabled outside of training, otherwise the
    # predictions become stochastic at inference time.
    attention_dropout_prob = \
        bert_config.hidden_dropout_prob if is_training else 0.0

    with tf.variable_scope(scope):
        with tf.variable_scope('sentence_attention'):
            (sa_output, _) = self.attention_layer(
                from_tensor=input_tensor,
                to_tensor=input_tensor,
                attention_mask=sa_mask,
                num_attention_heads=bert_config.num_attention_heads,
                size_per_head=(
                    hidden_size // bert_config.num_attention_heads),
                attention_probs_dropout_prob=attention_dropout_prob,
                initializer_range=bert_config.initializer_range,
                do_return_2d_tensor=False,
                batch_size=batch_size,
                from_max_seq_length=seq_length,
                to_max_seq_length=seq_length,
                trainable=trainable)

        with tf.variable_scope('cls/mrc'):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[2, hidden_size],
                initializer=util.create_initializer(initializer_range),
                trainable=trainable)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[2],
                initializer=tf.zeros_initializer(),
                trainable=trainable)

        output_layer = alpha * sa_output + (1 - alpha) * input_tensor
        output_layer = tf.reshape(output_layer, [-1, hidden_size])
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        # Rearrange to [batch, 2, seq_length]: row 0 holds the start
        # logits, row 1 the end logits.
        logits = tf.reshape(logits, [-1, seq_length, 2])
        logits = tf.transpose(logits, [0, 2, 1])
        probs = tf.nn.softmax(logits, axis=-1, name='probs')
        self.probs['probs'] = probs
        self.preds['preds'] = tf.argmax(logits, axis=-1)

        start_one_hot_labels = tf.one_hot(
            label_ids[:, 0], depth=seq_length, dtype=tf.float32)
        end_one_hot_labels = tf.one_hot(
            label_ids[:, 1], depth=seq_length, dtype=tf.float32)
        start_log_probs = tf.nn.log_softmax(logits[:, 0, :], axis=-1)
        end_log_probs = tf.nn.log_softmax(logits[:, 1, :], axis=-1)

        # Average of the start and end cross-entropies.
        per_example_loss = (
            -0.5 * tf.reduce_sum(
                start_one_hot_labels * start_log_probs, axis=-1)
            - 0.5 * tf.reduce_sum(
                end_one_hot_labels * end_log_probs, axis=-1))
        if sample_weight is not None:
            per_example_loss *= tf.cast(sample_weight, dtype=tf.float32)

        self.total_loss = tf.reduce_mean(per_example_loss)
        self.losses['losses'] = per_example_loss
def __init__(self, bert_config, is_training, input_ids, input_mask,
             segment_ids, sample_weight=None, scope='bert',
             dtype=tf.float32, drop_pooler=False,
             cls_model='self-attention', label_size=2, speed=0.1,
             ignore_cls='0', **kwargs):
    '''Builds a frozen BERT backbone with distilled child classifiers.

    The embeddings, transformer layers, pooler and teacher classifier are
    created with `trainable=False`. In training, every child classifier
    is fitted to the teacher's output with a KL-divergence loss. In
    inference, the outputs of the kept classifiers are concatenated
    along axis 0 into `self.probs['probs']`.

    Args:
      bert_config: BERT configuration; a deep copy with zeroed dropout
        rates is used, the caller's object is left untouched.
      is_training: Whether the graph is built for training.
      input_ids: Rank-2 tensor of token ids.
      input_mask: Input mask.
      segment_ids: Token type ids.
      sample_weight: Optional per-example weights of the distillation
        loss.
      scope: Variable scope of the backbone.
      dtype: Dtype of the backbone.
      drop_pooler: If True, use the first token's hidden state directly
        instead of passing it through the pooler dense layer.
      cls_model: Type of the child classifiers.
      label_size: Number of classes.
      speed: Uncertainty threshold for early exit; must be truthy.
      ignore_cls: Child-classifier ids to skip, as a list or as a
        comma-separated string. A falsy value means no classifier is
        skipped.
      **kwargs: Accepted for interface compatibility; not used here.

    Raises:
      ValueError: If `ignore_cls` is neither a list nor a string, or if
        `speed` is falsy.
    '''
    super(FastBERTCLSDistillor, self).__init__()

    if not ignore_cls:
        ignore_cls = []
    if isinstance(ignore_cls, str):
        ignore_cls = ignore_cls.replace(' ', '').split(',')
        ignore_cls = list(map(int, ignore_cls))
    elif isinstance(ignore_cls, list):
        ignore_cls = list(map(int, ignore_cls))
    else:
        raise ValueError(
            '`ignore_cls` should be a list of child-classifier ids or '
            'a string seperated with commas.')

    if not speed:
        raise ValueError(
            '`speed` should be a float number between `0` and `1`.')

    # Dropout is disabled in this model; copy so that the caller's
    # config is not modified.
    bert_config = copy.deepcopy(bert_config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    input_shape = util.get_shape_list(input_ids, expected_rank=2)
    batch_size = input_shape[0]
    max_seq_length = input_shape[1]

    with tf.variable_scope(scope):
        with tf.variable_scope('embeddings'):

            (self.embedding_output, self.embedding_table) = \
                self.embedding_lookup(
                    input_ids=input_ids,
                    vocab_size=bert_config.vocab_size,
                    batch_size=batch_size,
                    max_seq_length=max_seq_length,
                    embedding_size=bert_config.hidden_size,
                    initializer_range=bert_config.initializer_range,
                    word_embedding_name='word_embeddings',
                    dtype=dtype,
                    trainable=False,
                    tilda_embeddings=None)

            # Add positional embeddings and token type embeddings
            # layer normalize and perform dropout.
            self.embedding_output = self.embedding_postprocessor(
                input_tensor=self.embedding_output,
                batch_size=batch_size,
                max_seq_length=max_seq_length,
                hidden_size=bert_config.hidden_size,
                use_token_type=True,
                segment_ids=segment_ids,
                token_type_vocab_size=bert_config.type_vocab_size,
                token_type_embedding_name='token_type_embeddings',
                use_position_embeddings=True,
                position_embedding_name='position_embeddings',
                initializer_range=bert_config.initializer_range,
                max_position_embeddings=(
                    bert_config.max_position_embeddings),
                dropout_prob=bert_config.hidden_dropout_prob,
                dtype=dtype,
                trainable=False)

        with tf.variable_scope('encoder'):
            attention_mask = self.create_attention_mask_from_input_mask(
                input_mask, batch_size, max_seq_length, dtype=dtype)

            # stacked transformers
            (self.all_encoder_layers, self.all_cls_layers) = \
                self.dynamic_transformer_model(
                    is_training,
                    input_tensor=self.embedding_output,
                    input_mask=input_mask,
                    batch_size=batch_size,
                    max_seq_length=max_seq_length,
                    label_size=label_size,
                    attention_mask=attention_mask,
                    hidden_size=bert_config.hidden_size,
                    num_hidden_layers=bert_config.num_hidden_layers,
                    num_attention_heads=bert_config.num_attention_heads,
                    intermediate_size=bert_config.intermediate_size,
                    intermediate_act_fn=util.get_activation(
                        bert_config.hidden_act),
                    hidden_dropout_prob=bert_config.hidden_dropout_prob,
                    attention_probs_dropout_prob=(
                        bert_config.attention_probs_dropout_prob),
                    initializer_range=bert_config.initializer_range,
                    dtype=dtype,
                    cls_model=cls_model,
                    speed=speed,
                    ignore_cls=ignore_cls)

        self.sequence_output = self.all_encoder_layers[-1]
        with tf.variable_scope('pooler'):
            first_token_tensor = self.sequence_output[:, 0, :]

            # trick: ignore the fully connected layer
            if drop_pooler:
                self.pooled_output = first_token_tensor
            else:
                self.pooled_output = tf.layers.dense(
                    first_token_tensor,
                    bert_config.hidden_size,
                    activation=tf.tanh,
                    kernel_initializer=util.create_initializer(
                        bert_config.initializer_range),
                    trainable=False)

    # teacher classifier
    # Distillation always needs the teacher's output, so it is built in
    # training even if its index is listed in `ignore_cls` (the child
    # classifiers follow the same rule).
    teacher_kept = bert_config.num_hidden_layers not in ignore_cls
    if is_training or teacher_kept:
        with tf.variable_scope('cls/seq_relationship'):
            output_weights = tf.get_variable(
                'output_weights',
                shape=[label_size, bert_config.hidden_size],
                initializer=util.create_initializer(
                    bert_config.initializer_range),
                trainable=False)
            output_bias = tf.get_variable(
                'output_bias',
                shape=[label_size],
                initializer=tf.zeros_initializer(),
                trainable=False)

            logits = tf.matmul(
                self.pooled_output, output_weights, transpose_b=True)
            logits = tf.nn.bias_add(logits, output_bias)
            probs = tf.nn.softmax(logits, axis=-1)

    # distillation
    if is_training:
        losses = []
        for cls_probs in self.all_cls_layers.values():

            # KL-Divergence
            per_example_loss = tf.reduce_sum(
                cls_probs * (tf.log(cls_probs) - tf.log(probs)), axis=-1)
            if sample_weight is not None:
                per_example_loss *= tf.cast(
                    sample_weight, dtype=tf.float32)
            loss = tf.reduce_mean(per_example_loss)
            losses.append(loss)

        distill_loss = tf.add_n(losses)
        self.total_loss = distill_loss
        self.losses['losses'] = distill_loss

    else:
        if teacher_kept:
            self.all_cls_layers[bert_config.num_hidden_layers] = probs
        self.probs['probs'] = tf.concat(
            list(self.all_cls_layers.values()), axis=0, name='probs')
def mlp(x, scope, n_state, *, hparams):
    '''Two-layer feed-forward block built under `scope`.

    Projects `x` to `n_state` units with `conv1d` (`c_fc`), applies
    `gelu`, then projects back to the static size of the last axis of `x`
    with `conv1d` (`c_proj`).

    Args:
      x: Input tensor whose last axis has a static size.
      scope: Variable scope of the block.
      n_state: Number of units of the inner projection.
      hparams: Keyword-only; not used by this function.

    Returns:
      The output of the second projection.
    '''
    with tf.variable_scope(scope):
        out_width = x.shape[-1].value
        inner = conv1d(x, 'c_fc', n_state)
        activated = gelu(inner)
        return conv1d(activated, 'c_proj', out_width)
def _build_forward(layer_input):
    '''Builds one encoder layer around the `Attention` module.

    The input is multiplied by `input_mask` (expanded on the last axis),
    passed through `Attention` as both query and source, then through a
    dense output projection and a feed-forward block, each followed by
    dropout, a residual connection and layer normalization.

    Relies on names from the enclosing scope (`self`, `input_mask`,
    `hidden_size`, `num_attention_heads`, `is_training`, ...).

    Args:
      layer_input: Hidden states entering the layer.

    Returns:
      The hidden states leaving the layer.
    '''
    with tf.variable_scope('attention'):
        with tf.variable_scope('self'):
            # Zero out the masked positions before attending.
            token_mask = tf.cast(
                tf.expand_dims(input_mask, axis=-1), dtype=tf.float32)
            layer_input = layer_input * token_mask

            if bool(self.nb_random_features):
                projection_type = True
            else:
                projection_type = None

            attention_layer = Attention(
                hidden_size=hidden_size,
                num_heads=num_attention_heads,
                attention_dropout=attention_probs_dropout_prob,
                kernel_transformation=self.kernel_transformation,
                numerical_stabilizer=0.001,
                causal=False,
                projection_matrix_type=projection_type,
                nb_random_features=self.nb_random_features)
            attention_layer.build(layer_input.shape)
            attended = attention_layer.call(
                layer_input,
                layer_input,
                bias=None,
                training=is_training,
                cache=None,
                decode_loop_step=None)

        with tf.variable_scope('output'):
            attended = tf.layers.dense(
                attended,
                hidden_size,
                kernel_initializer=util.create_initializer(
                    initializer_range),
                trainable=trainable)
            attended = util.dropout(attended, hidden_dropout_prob)
            attended = util.layer_norm(
                attended + layer_input, trainable=trainable)

    # The activation is only applied to the `intermediate`
    # hidden layer.
    with tf.variable_scope('intermediate'):
        expanded = tf.layers.dense(
            attended,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=util.create_initializer(
                initializer_range),
            trainable=trainable)

    # Down-project back to hidden_size then add the residual.
    with tf.variable_scope('output'):
        projected = tf.layers.dense(
            expanded,
            hidden_size,
            kernel_initializer=util.create_initializer(
                initializer_range),
            trainable=trainable)
        projected = util.dropout(projected, hidden_dropout_prob)
        projected = util.layer_norm(
            projected + attended, trainable=trainable)
    return projected