def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 image_vector, use_one_hot_embeddings, scope):
  """Creates a model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings,
      scope=scope)

  if FLAGS.ignore_image:
    logit = tf.layers.dense(
        model.get_pooled_output(),
        1,
        activation=tf.tanh,
        kernel_initializer=modeling.create_initializer(
            bert_config.initializer_range))
    logit = tf.squeeze(logit, axis=1)
  else:
    logit = tf.einsum(
        "ij,ij->i",
        tf.layers.dense(
            image_vector,
            bert_config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range)),
        model.get_pooled_output(),
        name="inner")

  return tf.stack([-logit, logit], axis=1)
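# Usage sketch for `create_model` above. This is not part of the original
# snippet: the placeholder shapes, the 2048-d image feature size, and the
# config path are illustrative assumptions; `modeling` is the standard BERT
# modeling module and `FLAGS.ignore_image` must be defined elsewhere.
def _example_build_graph():
  bert_config = modeling.BertConfig.from_json_file("PATH_TO/bert_config.json")
  input_ids = tf.placeholder(tf.int32, [None, None], name="input_ids")
  input_mask = tf.placeholder(tf.int32, [None, None], name="input_mask")
  segment_ids = tf.placeholder(tf.int32, [None, None], name="segment_ids")
  image_vector = tf.placeholder(tf.float32, [None, 2048], name="image_vector")
  logits = create_model(
      bert_config=bert_config,
      is_training=False,
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids,
      image_vector=image_vector,
      use_one_hot_embeddings=False,
      scope="bert")
  # Two-class scores: column 1 is the "match" logit, column 0 its negation.
  return tf.nn.softmax(logits, axis=-1)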
def __init__(self):
  self.X = tf.placeholder(tf.int32, [None, None])

  model = modeling.BertModel(
      config=bert_config,
      is_training=False,
      input_ids=self.X,
      use_one_hot_embeddings=False)

  output_layer = model.get_sequence_output()
  embedding = model.get_embedding_table()

  with tf.variable_scope('cls/predictions'):
    with tf.variable_scope('transform'):
      input_tensor = tf.layers.dense(
          output_layer,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    output_bias = tf.get_variable(
        'output_bias',
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, embedding, transpose_b=True)
    self.logits = tf.nn.bias_add(logits, output_bias)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         top_k_indices, truncation_factor):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs_student = tf.nn.log_softmax(logits, axis=-1)
    probs_student = tf.nn.softmax(logits, axis=-1)

    prob_shape = tf.shape(log_probs_student)
    new_shape = [prob_shape[0], truncation_factor]  # [batch_size*seq_len, truncation_factor]

    top_k_indices = tf.reshape(top_k_indices, new_shape)
    top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
    top_k_probs_student = tf.batch_gather(probs_student, top_k_indices)

  return top_k_log_probs_student, top_k_probs_student
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

  return logits
def __init__(self, config, is_training, scope_prefix=""):
  config = copy.deepcopy(config)
  if not is_training:
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0
  self._config = config
  self._initializer = modeling.create_initializer(config.initializer_range)
  self.scope_prefix = scope_prefix + "/" if scope_prefix else ""

  with tf.variable_scope(self.scope_prefix + "bert/embeddings"):
    self._embedding_table = tf.get_variable(
        name="word_embeddings",
        shape=[config.vocab_size, config.hidden_size],
        initializer=self.initializer)
    self._segment_table = tf.get_variable(
        name="segment_embeddings",
        shape=[config.max_segments, config.hidden_size],
        initializer=self.initializer)
    self._position_table = tf.get_variable(
        name="position_embeddings",
        shape=[config.max_positions, self.config.hidden_size],
        initializer=self.initializer)
    self._condition_position_table = tf.get_variable(
        name="condition_position_embeddings",
        shape=[config.max_conditions, self.config.hidden_size],
        initializer=self.initializer)
    self._image_region_table = tf.get_variable(
        name="image_region_embeddings",
        shape=[config.max_image_regions, self.config.hidden_size],
        initializer=self.initializer)
    self._image_order_table = tf.get_variable(
        name="image_order_embeddings",
        shape=[config.max_image_regions, self.config.hidden_size],
        initializer=self.initializer)
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, use_one_hot_embeddings):
  """Creates a classification model."""
  model = modeling.BertModel(
      config=bert_config,
      is_training=is_training,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=segment_ids,
      use_one_hot_embeddings=use_one_hot_embeddings)

  output_layer = model.get_pooled_output()
  hidden_size = output_layer.shape[-1].value
  num_labels = 2  # This is hardcoded for binary classification.

  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[num_labels, hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[num_labels], initializer=tf.zeros_initializer())

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    probabilities = tf.nn.softmax(logits, axis=-1)

    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)

  return (loss, per_example_loss, logits, probabilities)
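# Hypothetical driver for the binary classifier above (not from the original
# code): `bert_config` and the fixed sequence length of 128 are assumptions.
def _example_classifier_graph(bert_config):
  input_ids = tf.placeholder(tf.int32, [None, 128])
  input_mask = tf.placeholder(tf.int32, [None, 128])
  segment_ids = tf.placeholder(tf.int32, [None, 128])
  labels = tf.placeholder(tf.int32, [None])
  loss, per_example_loss, logits, probabilities = create_model(
      bert_config, True, input_ids, input_mask, segment_ids, labels,
      use_one_hot_embeddings=False)
  predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
  accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, labels), tf.float32))
  return loss, predictions, accuracy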
def get_masked_lm_output(self):
  self.input_tensor = self.gather_indexes()

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      self.input_tensor = tf.layers.dense(
          self.input_tensor,
          units=self.bert_config.hidden_size,
          activation=modeling.get_activation(self.bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              self.bert_config.initializer_range))
      self.input_tensor = modeling.layer_norm(self.input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[self.bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(self.input_tensor, self.output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    flat_masked_lm_ids = tf.reshape(self.masked_lm_ids, [-1])
    one_hot_labels = tf.one_hot(
        flat_masked_lm_ids, depth=self.bert_config.vocab_size, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    # TODO: dynamic gather from per_example_loss???
    loss = tf.reshape(per_example_loss,
                      [-1, tf.shape(self.masked_lm_positions)[1]])

  return loss
def _project(t, name):
  return dense_layer_3d(
      input_tensor=t,
      num_attention_heads=num_attention_heads,
      size_per_head=size_per_head,
      initializer=modeling.create_initializer(initializer_range),
      activation=None,
      name=name)
def compute_image_transformer(self,
                              input_ids,
                              input_image,
                              input_image_mask,
                              input_positions,
                              reuse=None):
  """Build the image transformer."""
  with tf.variable_scope(self.scope_prefix + "transformer", reuse=reuse):
    with tf.variable_scope("bridge"):
      image_emb = tf.layers.dense(
          inputs=input_image,
          units=self.config.hidden_size,
          activation=tf.nn.relu,
          kernel_initializer=modeling.create_initializer(
              self.config.initializer_range),
          reuse=reuse)

    with tf.variable_scope("embeddings"):
      input_emb = tf.gather(self.embedding_table, input_ids)
      image_emb = tf.concat([input_emb, image_emb], axis=1)
      batch_size = tensor_utils.shape(image_emb, 0)
      sequence_length = tensor_utils.shape(image_emb, 1)
      position_emb = tf.gather(self.image_region_table, input_positions)
      position_emb = tf.pad(position_emb, [[0, 0], [1, 0], [0, 0]])
      input_order = tf.range(tensor_utils.shape(image_emb, 1))
      input_order = tf.tile(
          tf.expand_dims(input_order, 0), [tensor_utils.shape(image_emb, 0), 1])
      order_emb = tf.gather(self.image_order_table, input_order)
      input_segment_id = tf.fill([batch_size, sequence_length], self.IMG)
      segment_emb = tf.gather(self.segment_table, input_segment_id)
      input_emb = image_emb + position_emb + order_emb + segment_emb
      input_emb = modeling.layer_norm_and_dropout(
          input_emb, self.config.hidden_dropout_prob)

    with tf.variable_scope("image/encoder"):
      sequence_output, output_cache = compute_transformer(
          input_tensor=input_emb,
          attention_mask=tf.expand_dims(input_image_mask, 1),
          hidden_size=self.config.hidden_size,
          num_hidden_layers=self.config.num_hidden_layers,
          num_attention_heads=self.config.num_attention_heads,
          intermediate_size=self.config.intermediate_size,
          intermediate_act_fn=modeling.get_activation(self.config.hidden_act),
          hidden_dropout_prob=self.config.hidden_dropout_prob,
          attention_probs_dropout_prob=(
              self.config.attention_probs_dropout_prob),
          initializer_range=self.config.initializer_range,
          input_cache=None)

  return sequence_output, output_cache
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_weights, truncated_masked_lm_probs_teacher,
                         top_k_indices, truncation_factor):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs_student = tf.nn.log_softmax(logits, axis=-1)

    label_weights = tf.reshape(label_weights, [-1])

    prob_shape = tf.shape(log_probs_student)
    new_shape = [prob_shape[0], truncation_factor]  # [batch_size*seq_len, truncation_factor]

    top_k_indices = tf.reshape(top_k_indices, new_shape)
    top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
    truncated_masked_lm_probs_teacher = tf.reshape(
        truncated_masked_lm_probs_teacher, new_shape)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(
        truncated_masked_lm_probs_teacher * top_k_log_probs_student, axis=[-1])
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

  return (loss, per_example_loss, log_probs_student)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    label_weights_flat = tf.reshape(label_weights, [-1])
    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # The `positions` tensor might be zero-padded (if the sequence is too
    # short to have the maximum number of predictions). The `label_weights`
    # tensor has a value of 1.0 for every real prediction and 0.0 for the
    # padding predictions.
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    numerator = tf.reduce_sum(label_weights_flat * per_example_loss)
    denominator = tf.reduce_sum(label_weights_flat) + 1e-5
    loss = numerator / denominator

    # Scale the weight-normalized loss back up by the batch size.
    batch_size = tf.cast(tf.shape(label_weights)[0], tf.float32)
    loss = batch_size * loss

  return (loss, per_example_loss, log_probs)
def forward(x, segment, masks, y, reuse=False, config=bert_config):
  with tf.variable_scope('bert', reuse=reuse):
    model = modeling.BertModel(
        config=config,
        is_training=training,
        input_ids=x,
        input_mask=masks,
        token_type_ids=segment,
        use_one_hot_embeddings=False)
    memory = model.get_sequence_output()

  with tf.variable_scope('bert', reuse=True):
    Y_seq_len = tf.count_nonzero(y, 1, dtype=tf.int32)
    y_masks = tf.sequence_mask(
        Y_seq_len, tf.reduce_max(Y_seq_len), dtype=tf.float32)
    model = modeling_decoder.BertModel(
        config=config,
        is_training=training,
        input_ids=y,
        input_mask=y_masks,
        memory=memory,
        memory_mask=masks,
        use_one_hot_embeddings=False)
    output_layer = model.get_sequence_output()
    embedding = model.get_embedding_table()

  with tf.variable_scope('cls/predictions', reuse=reuse):
    with tf.variable_scope('transform'):
      input_tensor = tf.layers.dense(
          output_layer,
          units=config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    output_bias = tf.get_variable(
        'output_bias',
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, embedding, transpose_b=True)

  return logits
def bert_module_fn(is_training):
  """Spec function for a token embedding module."""
  input_ids = tf.placeholder(
      shape=[None, None], dtype=tf.int32, name="input_ids")

  bert_config = modeling.BertConfig.from_json_file(config_path)
  model = modeling.BertModel(
      config=bert_config, is_training=is_training, input_ids=input_ids)

  output_layer = model.get_sequence_output()
  embedding = model.get_embedding_table()

  with tf.variable_scope('cls/predictions'):
    with tf.variable_scope('transform'):
      input_tensor = tf.layers.dense(
          output_layer,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    output_bias = tf.get_variable(
        'output_bias',
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, embedding, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

  config_file = tf.constant(
      value=config_path, dtype=tf.string, name="config_file")
  vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
  lower_case = tf.constant(do_lower_case)

  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
  tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

  input_map = {"input_ids": input_ids}
  output_map = {"logits": logits}
  output_info_map = {"vocab_file": vocab_file, "do_lower_case": lower_case}

  hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
  hub.add_signature(
      name="tokenization_info", inputs={}, outputs=output_info_map)
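# Sketch of how `bert_module_fn` might be exported as a TF-Hub module with the
# TF1-style tensorflow_hub API. This is not from the original script:
# `config_path`, `vocab_path`, and `do_lower_case` are assumed to be defined by
# the surrounding code, and the checkpoint/export paths are placeholders.
def _example_export_hub_module(checkpoint_path, export_path):
  tags_and_args = [({"train"}, dict(is_training=True)),
                   (set(), dict(is_training=False))]
  spec = hub.create_module_spec(bert_module_fn, tags_and_args=tags_and_args)
  spec.export(export_path, checkpoint_path=checkpoint_path)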
def get_next_sentence_output(bert_config, input_tensor):
  """Get loss and log probs for the next sentence prediction."""
  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    next_sentence_probs = tf.nn.softmax(logits, axis=-1)
    return next_sentence_probs
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    label_ids = tf.reshape(label_ids, [-1])
    one_hot_labels = tf.one_hot(
        label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

    # Per-position negative log-likelihood, reshaped back to
    # [batch_size, max_predictions_per_seq].
    per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
    loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
    # TODO: dynamic gather from per_example_loss???

  return loss
def get_next_sentence_output(bert_config, input_tensor, labels):
  """Get loss and log probs for the next sentence prediction."""
  # Simple binary classification. Note that 0 is "next sentence" and 1 is
  # "random sentence". This weight matrix is not used after pre-training.
  with tf.variable_scope("cls/seq_relationship"):
    output_weights = tf.get_variable(
        "output_weights",
        shape=[2, bert_config.hidden_size],
        initializer=modeling.create_initializer(bert_config.initializer_range))
    output_bias = tf.get_variable(
        "output_bias", shape=[2], initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)
    labels = tf.reshape(labels, [-1])
    one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, per_example_loss, log_probs)
def run_one_hot_embeddings(one_hot_input_ids, config):
  """Extract only the word embeddings of the original BERT model."""
  with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
    with tf.variable_scope("embeddings"):
      # Branched from modeling.embedding_lookup.
      embedding_table = tf.get_variable(
          name="word_embeddings",
          shape=[config.vocab_size, config.hidden_size],
          initializer=modeling.create_initializer(config.initializer_range))

      flat_input_ids = tf.reshape(one_hot_input_ids, [-1, config.vocab_size])
      output = tf.matmul(flat_input_ids, embedding_table)

      input_shape = modeling.get_shape_list(one_hot_input_ids)
      output = tf.reshape(output, input_shape[0:-1] + [config.hidden_size])
      return (output, embedding_table)
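# Hypothetical call site for `run_one_hot_embeddings` (not in the original):
# token ids are expanded to one-hot vectors so the lookup becomes a matmul
# against the word-embedding table, and only the word-embedding term is
# returned (no position or segment embeddings).
def _example_one_hot_lookup(input_ids, config):
  one_hot_input_ids = tf.one_hot(input_ids, depth=config.vocab_size,
                                 dtype=tf.float32)
  word_embeddings, embedding_table = run_one_hot_embeddings(
      one_hot_input_ids, config)  # [batch_size, seq_length, hidden_size]
  return word_embeddings, embedding_table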
def __init__(self, config, input_hidden, embedding_table):
  # Keep variable names the same as BERT.
  with tf.variable_scope("cls"):
    with tf.variable_scope("predictions"):
      with tf.variable_scope("transform"):
        self.transformed_output = tf.layers.dense(
            input_hidden,
            config.hidden_size,
            activation=modeling.get_activation(config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                config.initializer_range))
        self.transformed_output = modeling.layer_norm(self.transformed_output)

      output_bias = tf.Variable(
          tf.zeros([config.vocab_size]), name="output_bias")
      self.final_output = tf.add(
          tf.matmul(self.transformed_output, tf.transpose(embedding_table)),
          output_bias)
      self.probs = tf.nn.softmax(self.final_output, name='token_probs')
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         truncation_factor):
  """Get loss and log probs for the masked LM."""
  input_tensor = gather_indexes(input_tensor, positions)

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    masked_lm_probs = tf.nn.softmax(logits, axis=-1)

    trunc_masked_lm_probs, top_indices = tf.math.top_k(
        masked_lm_probs, k=truncation_factor, sorted=False)

    max_predictions_per_seq = positions.get_shape().as_list()[1]
    truncation_factor_ = top_indices.get_shape().as_list()[1]
    trunc_masked_lm_probs = tf.reshape(
        trunc_masked_lm_probs,
        [-1, max_predictions_per_seq, truncation_factor_])
    top_indices = tf.reshape(
        top_indices, [-1, max_predictions_per_seq, truncation_factor_])

  return trunc_masked_lm_probs, top_indices
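# Sketch of the teacher/student hand-off implied by the distillation variants
# of `get_masked_lm_output` in this section. The same-named functions appear to
# come from separate teacher and student pretraining scripts, so they are
# passed in here as callables; every name below is an illustrative assumption,
# not part of the original code.
def _example_distillation_loss(teacher_mlm_fn, student_mlm_fn, bert_config,
                               teacher_seq_output, student_seq_output,
                               embedding_table, masked_lm_positions,
                               masked_lm_weights, truncation_factor=10):
  # Teacher: keep only the top-k vocabulary entries per masked position.
  trunc_probs_teacher, top_indices = teacher_mlm_fn(
      bert_config, teacher_seq_output, embedding_table, masked_lm_positions,
      truncation_factor)
  # Student: gather its log-probabilities at the same indices and minimize the
  # truncated cross-entropy against the (fixed) teacher distribution.
  loss, per_example_loss, log_probs_student = student_mlm_fn(
      bert_config, student_seq_output, embedding_table, masked_lm_positions,
      masked_lm_weights, tf.stop_gradient(trunc_probs_teacher), top_indices,
      truncation_factor)
  return loss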
def __init__(self):
  BERT_CONFIG = "PATH_TO/multi_cased_L-12_H-768_A-12/bert_config.json"
  bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
  self.X = tf.placeholder(tf.int32, [None, None])

  model = modeling.BertModel(
      config=bert_config,
      is_training=False,
      input_ids=self.X,
      use_one_hot_embeddings=False)

  output_layer = model.get_sequence_output()
  embedding = model.get_embedding_table()
  output_layer = tf.reshape(output_layer, [-1, bert_config.hidden_size])

  with tf.variable_scope("cls/predictions"):
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          output_layer,
          units=bert_config.hidden_size,
          activation=modeling.get_activation(bert_config.hidden_act),
          kernel_initializer=modeling.create_initializer(
              bert_config.initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    output_bias = tf.get_variable(
        "output_bias",
        shape=[bert_config.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, embedding, transpose_b=True)
    self.logits = tf.nn.bias_add(logits, output_bias)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            mention_ids=None):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output
      tensor.
    mention_ids: (optional) Tensor of shape [batch_size, seq_length] marking
      mention tokens.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  # -----------------------------------------------------------
  # reader = pywrap_tensorflow.NewCheckpointReader(
  #     "gs://cloud-tpu-checkpoints/bert/uncased_L-12_H-768_A-12/bert_model.ckpt")
  # var_to_shape_map = reader.get_variable_to_shape_map()
  # for key in var_to_shape_map:
  #   if key == "bert/embeddings/position_embeddings":
  #     # Remove this if you want to print only variable names.
  #     position_embedding_value = reader.get_tensor(key)
  # position_embedding_512value = np.array(
  #     position_embedding_value[511] * np.ones([512, 1]), dtype=np.float32)
  # -----------------------------------------------------------

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if mention_ids is not None:
    token_type_table = tf.get_variable(
        name='mention_marker',
        shape=[1, width],
        initializer=tf.zeros_initializer())
    # Mention ids are 0/1 markers, so the float markers are multiplied directly
    # with the single-row embedding table instead of a one-hot lookup.
    flat_token_type_ids = tf.reshape(mention_ids, [-1, 1])
    flat_token_type_ids = tf.cast(flat_token_type_ids, tf.float32)
    token_type_embeddings = tf.matmul(flat_token_type_ids, token_type_table)
    # one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    # token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  # Initialize every 512-position block separately, extending the BERT-base
  # position table to longer sequences.
  if use_position_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      if seq_length <= 512:
        full_position_embeddings = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, width],
            initializer=create_initializer(initializer_range))
        # Since the position embedding table is a learned variable, we create
        # it using a (long) sequence length `max_position_embeddings`. The
        # actual sequence length might be shorter than this, for faster
        # training of tasks that do not have long sequences.
        #
        # So `full_position_embeddings` is effectively an embedding table for
        # position [0, 1, 2, ..., max_position_embeddings-1], and the current
        # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
        # perform a slice.
        position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                       [seq_length, -1])
      elif seq_length <= 1024:
        full_position_embeddings_former = tf.get_variable(
            name=position_embedding_name + "_former",
            shape=[512, width],
            initializer=create_initializer(initializer_range))
        full_position_embeddings_latter = tf.get_variable(
            name=position_embedding_name + "_latter",
            shape=[512, width],
            initializer=create_initializer(initializer_range))
        # initializer=position_embedding_512value)
        # full_position_embeddings_latter += position_embedding_512value
        full_position_embeddings_latter = tf.slice(
            full_position_embeddings_latter, [0, 0], [seq_length - 512, -1])
        position_embeddings = tf.concat(
            [full_position_embeddings_former, full_position_embeddings_latter],
            0,
            name="large_window_size_position_embeddings")
      else:
        full_position_embeddings_first = tf.get_variable(
            name=position_embedding_name + "_first",
            shape=[512, width],
            initializer=create_initializer(initializer_range))
        full_position_embeddings_second = tf.get_variable(
            name=position_embedding_name + "_second",
            shape=[512, width],
            initializer=create_initializer(initializer_range))
        # initializer=position_embedding_512value)
        full_position_embeddings_third = tf.get_variable(
            name=position_embedding_name + "_third",
            shape=[512, width],
            initializer=create_initializer(initializer_range))
        # initializer=position_embedding_512value)
        full_position_embeddings_third = tf.slice(
            full_position_embeddings_third, [0, 0], [seq_length - 1024, -1])
        position_embeddings = tf.concat(
            [full_position_embeddings_first, full_position_embeddings_second,
             full_position_embeddings_third],
            0,
            name="large_window_size_position_embeddings")

      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`),
      # so we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output
def __init__(self,
             config,
             use_one_hot_embeddings=True,
             num_labels=2,
             max_seq_length=128):
  """Constructor for BertModel.

  Args:
    config: `BertConfig` instance.
    use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
      embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
      it is much faster if this is True, on the CPU or GPU, it is faster if
      this is False.
    num_labels: int. Number of output classes.
    max_seq_length: int. Fixed sequence length of the input placeholders.

  Raises:
    ValueError: The config is invalid or one of the input tensor shapes is
      invalid.
  """
  self.input_ids = tf.placeholder(dtype=tf.int32, shape=(None, max_seq_length))
  self.input_mask = tf.placeholder(dtype=tf.int8, shape=(None, max_seq_length))

  config = copy.deepcopy(config)

  input_shape = modeling.get_shape_list(self.input_ids, expected_rank=2)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

  with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
    with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
      # Perform embedding lookup on the word ids.
      (self.embedding_output, self.embedding_table) = modeling.embedding_lookup(
          input_ids=self.input_ids,
          vocab_size=config.vocab_size,
          embedding_size=config.hidden_size,
          initializer_range=config.initializer_range,
          word_embedding_name="word_embeddings",
          use_one_hot_embeddings=use_one_hot_embeddings)

      # Add positional embeddings and token type embeddings, then layer
      # normalize and perform dropout.
      self.embedding_output = modeling.embedding_postprocessor(
          input_tensor=self.embedding_output,
          use_token_type=True,
          token_type_ids=token_type_ids,
          token_type_vocab_size=config.type_vocab_size,
          token_type_embedding_name="token_type_embeddings",
          use_position_embeddings=True,
          position_embedding_name="position_embeddings",
          initializer_range=config.initializer_range,
          max_position_embeddings=config.max_position_embeddings,
          dropout_prob=config.hidden_dropout_prob)

    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
      # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
      # mask of shape [batch_size, seq_length, seq_length] which is used
      # for the attention scores.
      attention_mask = modeling.create_attention_mask_from_input_mask(
          self.input_ids, self.input_mask)

      # Run the stacked transformer.
      # `sequence_output` shape = [batch_size, seq_length, hidden_size].
      self.all_encoder_layers = modeling.transformer_model(
          input_tensor=self.embedding_output,
          attention_mask=attention_mask,
          hidden_size=config.hidden_size,
          num_hidden_layers=config.num_hidden_layers,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          intermediate_act_fn=modeling.get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          do_return_all_layers=True)

    self.sequence_output = self.all_encoder_layers[-1]

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, hidden_size]. This is necessary for segment-level
    # (or segment-pair-level) classification tasks where we need a fixed
    # dimensional representation of the segment.
    with tf.variable_scope("pooler", reuse=tf.AUTO_REUSE):
      # We "pool" the model by simply taking the hidden state corresponding
      # to the first token. We assume that this has been pre-trained.
      first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
      self.pooled_output = tf.layers.dense(
          first_token_tensor,
          config.hidden_size,
          activation=tf.tanh,
          kernel_initializer=modeling.create_initializer(
              config.initializer_range))

  # Define output_weights and output_bias.
  hidden_size = self.pooled_output.shape[-1].value
  with tf.variable_scope("", reuse=tf.AUTO_REUSE):
    self.output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    self.output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())
def __init__(self, bert_config, tokenizer, cls, sep):
  _graph = tf.Graph()
  with _graph.as_default():
    self.X = tf.placeholder(tf.int32, [None, None])
    self.top_p = tf.placeholder(tf.float32, None)
    self.top_k = tf.placeholder(tf.int32, None)
    self.k = tf.placeholder(tf.int32, None)
    self.temperature = tf.placeholder(tf.float32, None)
    self.indices = tf.placeholder(tf.int32, [None, None])
    self._tokenizer = tokenizer
    self._cls = cls
    self._sep = sep

    self.model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=self.X,
        use_one_hot_embeddings=False)
    self.logits = self.model.get_pooled_output()
    output_layer = self.model.get_sequence_output()
    embedding = self.model.get_embedding_table()

    with tf.variable_scope('cls/predictions'):
      with tf.variable_scope('transform'):
        input_tensor = tf.layers.dense(
            output_layer,
            units=bert_config.hidden_size,
            activation=modeling.get_activation(bert_config.hidden_act),
            kernel_initializer=modeling.create_initializer(
                bert_config.initializer_range))
        input_tensor = modeling.layer_norm(input_tensor)

      output_bias = tf.get_variable(
          'output_bias',
          shape=[bert_config.vocab_size],
          initializer=tf.zeros_initializer())
      logits = tf.matmul(input_tensor, embedding, transpose_b=True)
      self._logits = tf.nn.bias_add(logits, output_bias)
      self._log_softmax = tf.nn.log_softmax(self._logits)

    logits = tf.gather_nd(self._logits, self.indices)
    logits = logits / self.temperature

    def nucleus():
      return top_p_logits(logits, self.top_p)

    def select_k():
      return top_k_logits(logits, self.top_k)

    logits = tf.cond(self.top_p > 0, nucleus, select_k)
    self.samples = tf.multinomial(
        logits, num_samples=self.k, output_dtype=tf.int32)

    self._sess = tf.InteractiveSession()
    self._sess.run(tf.global_variables_initializer())

    var_lists = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='bert')
    cls = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='cls')
    self._saver = tf.train.Saver(var_list=var_lists + cls)
    attns = _extract_attention_weights(
        bert_config.num_hidden_layers, tf.get_default_graph())
    self.attns = attns
def __init__(self,
             config,
             is_training,
             input_ids,
             image_embeddings,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=False,
             scope=None):
  """Constructor for a visually grounded BertModel.

  Args:
    config: `BertConfig` instance.
    is_training: bool. true for training model, false for eval model. Controls
      whether dropout will be applied.
    input_ids: int32 Tensor of shape [batch_size, seq_length].
    image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
    input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
    use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
      embeddings or tf.embedding_lookup() for the word embeddings.
    scope: (optional) variable scope. Defaults to "bert".

  Raises:
    ValueError: The config is invalid or one of the input tensor shapes is
      invalid.
  """
  config = copy.deepcopy(config)
  if not is_training:
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0

  text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
  batch_size = text_input_shape[0]
  text_seq_length = text_input_shape[1]

  if input_mask is None:
    input_mask = tf.ones(shape=[batch_size, text_seq_length], dtype=tf.int32)

  if token_type_ids is None:
    token_type_ids = tf.zeros(
        shape=[batch_size, text_seq_length], dtype=tf.int32)

  with tf.variable_scope(scope, default_name="bert"):
    with tf.variable_scope("embeddings"):
      # Perform embedding lookup on the word ids.
      (self.embedding_output, self.embedding_table) = modeling.embedding_lookup(
          input_ids=input_ids,
          vocab_size=config.vocab_size,
          embedding_size=config.hidden_size,
          initializer_range=config.initializer_range,
          word_embedding_name="word_embeddings",
          use_one_hot_embeddings=use_one_hot_embeddings)

      # Add positional embeddings and token type embeddings, then layer
      # normalize and perform dropout.
      self.embedding_output = modeling.embedding_postprocessor(
          input_tensor=self.embedding_output,
          use_token_type=True,
          token_type_ids=token_type_ids,
          token_type_vocab_size=config.type_vocab_size,
          token_type_embedding_name="token_type_embeddings",
          use_position_embeddings=True,
          position_embedding_name="position_embeddings",
          initializer_range=config.initializer_range,
          max_position_embeddings=config.max_position_embeddings,
          dropout_prob=config.hidden_dropout_prob)

      # Add the image embeddings to the rest of the input embeddings.
      self.embedding_output += tf.layers.dense(
          image_embeddings,
          config.hidden_size,
          activation=tf.tanh,
          kernel_initializer=modeling.create_initializer(
              config.initializer_range))

    with tf.variable_scope("encoder"):
      # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
      # mask of shape [batch_size, seq_length, seq_length] which is used
      # for the attention scores.
      attention_mask = modeling.create_attention_mask_from_input_mask(
          self.embedding_output, input_mask)

      # Run the stacked transformer.
      # `sequence_output` shape = [batch_size, seq_length, hidden_size].
      self.all_encoder_layers = modeling.transformer_model(
          input_tensor=self.embedding_output,
          attention_mask=attention_mask,
          hidden_size=config.hidden_size,
          num_hidden_layers=config.num_hidden_layers,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          intermediate_act_fn=modeling.get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          do_return_all_layers=True)

    self.sequence_output = self.all_encoder_layers[-1]

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, hidden_size]. This is necessary for segment-level
    # (or segment-pair-level) classification tasks where we need a fixed
    # dimensional representation of the segment.
    with tf.variable_scope("pooler"):
      # We "pool" the model by simply taking the hidden state corresponding
      # to the first token. We assume that this has been pre-trained.
      first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
      self.pooled_output = tf.layers.dense(
          first_token_tensor,
          config.hidden_size,
          activation=tf.tanh,
          kernel_initializer=modeling.create_initializer(
              config.initializer_range))
def build_attn_layers(self,
                      input_tensor,
                      attn_mask_concat,
                      intermediate_size=2048,
                      intermediate_act_fn=modeling.gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
  """See `attention_layer` defined in `bert/modeling.py`"""
  if not self.is_training:
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

  # Input tensor shape: [batch, arg_length, BERT_hidden_size];
  # for example, using default hparam values: [64, 128, 768].
  attention_head_size = int(self.hidden_size / self.num_attention_heads)
  input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)

  prev_output = input_tensor

  attention_type_split = self.attention_type.split("_")

  all_layer_outputs = []
  for layer_idx in range(self.num_hidden_layers):
    with tf.variable_scope(f"layer_{layer_idx}"):
      layer_input = prev_output

      if len(attention_type_split) == 3:
        indexer = layer_idx % 2
      else:  # len(attention_type_split) == 2
        indexer = 0
      layer_attn_type = attention_type_split[indexer]

      tf.logging.info(
          f"{layer_attn_type.capitalize()} Attention at {layer_idx}th Layer")

      attention_heads = []
      with tf.variable_scope(f"{layer_attn_type}_attn"):
        attention_head = self.build_attn_layer(
            input_tensor=input_tensor,
            attn_mask_concat=attn_mask_concat,
            layer_attn_type=layer_attn_type,
            num_attention_heads=self.num_attention_heads,
            size_per_head=attention_head_size,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            initializer_range=initializer_range,
            do_return_2d_tensor=False)
        attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
          attention_output = attention_heads[0]
        else:
          # In the case where we have other sequences, we just concatenate
          # them to the self-attention head before the projection.
          attention_output = tf.concat(attention_heads, axis=-1)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.variable_scope("output"):
          attention_output = tf.layers.dense(
              attention_output,
              self.hidden_size,
              kernel_initializer=modeling.create_initializer(initializer_range))
          attention_output = modeling.dropout(attention_output,
                                              hidden_dropout_prob)
          attention_output = modeling.layer_norm(attention_output + layer_input)

      # The activation is only applied to the "intermediate" hidden layer.
      with tf.variable_scope("intermediate"):
        intermediate_output = tf.layers.dense(
            attention_output,
            intermediate_size,
            activation=intermediate_act_fn,
            kernel_initializer=modeling.create_initializer(initializer_range))

      # Down-project back to `hidden_size` then add the residual.
      with tf.variable_scope("output"):
        layer_output = tf.layers.dense(
            intermediate_output,
            self.hidden_size,
            kernel_initializer=modeling.create_initializer(initializer_range))
        layer_output = modeling.dropout(layer_output, hidden_dropout_prob)
        layer_output = modeling.layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layer_outputs.append(layer_output)

  if do_return_all_layers:
    final_outputs = []
    for layer_output in all_layer_outputs:
      final_output = modeling.reshape_from_matrix(layer_output, input_shape)
      final_outputs.append(final_output)
    return final_outputs
  else:
    final_output = modeling.reshape_from_matrix(prev_output, input_shape)
    return final_output
def __init__(self, config, is_training, input_tensor, input_mask,
             token_type_ids):
  """Constructor for BertFlexEmbeddingModel.

  Args:
    config: `BertConfig` instance.
    is_training: bool. true for training model, false for eval model. Controls
      whether dropout will be applied.
    input_tensor: float32 Tensor of shape [batch_size, seq_length,
      hidden_size].
    input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].

  Raises:
    ValueError: The config is invalid or one of the input tensor shapes is
      invalid.
  """
  config = copy.deepcopy(config)
  if not is_training:
    config.hidden_dropout_prob = 0.0
    config.attention_probs_dropout_prob = 0.0

  with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
    with tf.variable_scope("embeddings"):
      # Add positional embeddings and token type embeddings, then layer
      # normalize and perform dropout.
      self.embedding_output = modeling.embedding_postprocessor(
          input_tensor=input_tensor,
          use_token_type=True,
          token_type_ids=token_type_ids,
          token_type_vocab_size=config.type_vocab_size,
          token_type_embedding_name="token_type_embeddings",
          use_position_embeddings=True,
          position_embedding_name="position_embeddings",
          initializer_range=config.initializer_range,
          max_position_embeddings=config.max_position_embeddings,
          dropout_prob=config.hidden_dropout_prob)

    with tf.variable_scope("encoder"):
      # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
      # mask of shape [batch_size, seq_length, seq_length] which is used
      # for the attention scores.
      attention_mask = modeling.create_attention_mask_from_input_mask(
          input_tensor, input_mask)

      # Run the stacked transformer.
      # `sequence_output` shape = [batch_size, seq_length, hidden_size].
      self.all_encoder_layers = modeling.transformer_model(
          input_tensor=self.embedding_output,
          attention_mask=attention_mask,
          hidden_size=config.hidden_size,
          num_hidden_layers=config.num_hidden_layers,
          num_attention_heads=config.num_attention_heads,
          intermediate_size=config.intermediate_size,
          intermediate_act_fn=modeling.get_activation(config.hidden_act),
          hidden_dropout_prob=config.hidden_dropout_prob,
          attention_probs_dropout_prob=config.attention_probs_dropout_prob,
          initializer_range=config.initializer_range,
          do_return_all_layers=True)

    self.sequence_output = self.all_encoder_layers[-1]

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, hidden_size]. This is necessary for segment-level
    # (or segment-pair-level) classification tasks where we need a fixed
    # dimensional representation of the segment.
    with tf.variable_scope("pooler"):
      # We "pool" the model by simply taking the hidden state corresponding
      # to the first token. We assume that this has been pre-trained.
      first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
      self.pooled_output = tf.layers.dense(
          first_token_tensor,
          config.hidden_size,
          activation=tf.tanh,
          kernel_initializer=modeling.create_initializer(
              config.initializer_range))
def __call__(self, features, hidden_feature, mode, problem_name):
  """Get loss and log probs for the masked LM.

  DO NOT CHANGE THE VARIABLE SCOPE.
  """
  seq_hidden_feature = hidden_feature['seq']
  positions = features['masked_lm_positions']
  input_tensor = gather_indexes(seq_hidden_feature, positions)
  output_weights = hidden_feature['embed_table']
  label_ids = features['masked_lm_ids']
  label_weights = features['masked_lm_weights']

  with tf.variable_scope("cls/predictions"):
    # We apply one more non-linear transformation before the output layer.
    # This matrix is not used after pre-training.
    with tf.variable_scope("transform"):
      input_tensor = tf.layers.dense(
          input_tensor,
          units=self.params.mask_lm_hidden_size,
          activation=modeling.get_activation(self.params.mask_lm_hidden_act),
          kernel_initializer=modeling.create_initializer(
              self.params.mask_lm_initializer_range))
      input_tensor = modeling.layer_norm(input_tensor)

    # The output weights are the same as the input embeddings, but there is
    # an output-only bias for each token.
    output_bias = tf.get_variable(
        "output_bias",
        shape=[self.params.vocab_size],
        initializer=tf.zeros_initializer())
    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    self.logits = logits
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    if mode == tf.estimator.ModeKeys.PREDICT:
      self.prob = log_probs
      return self.prob

    else:
      label_ids = tf.reshape(label_ids, [-1])
      label_weights = tf.reshape(label_weights, [-1])

      one_hot_labels = tf.one_hot(
          label_ids, depth=self.params.vocab_size, dtype=tf.float32)

      # The `positions` tensor might be zero-padded (if the sequence is too
      # short to have the maximum number of predictions). The `label_weights`
      # tensor has a value of 1.0 for every real prediction and 0.0 for the
      # padding predictions.
      per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
      numerator = tf.reduce_sum(label_weights * per_example_loss)
      denominator = tf.reduce_sum(label_weights) + 1e-5
      loss = numerator / denominator

      if mode == tf.estimator.ModeKeys.TRAIN:
        self.loss = loss
        return self.loss

      else:

        def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                      masked_lm_ids, masked_lm_weights):
          """Computes the loss and accuracy of the model."""
          masked_lm_log_probs = tf.reshape(
              masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
          masked_lm_predictions = tf.argmax(
              masked_lm_log_probs, axis=-1, output_type=tf.int32)
          masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
          masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
          masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
          masked_lm_accuracy = tf.metrics.accuracy(
              labels=masked_lm_ids,
              predictions=masked_lm_predictions,
              weights=masked_lm_weights)
          masked_lm_mean_loss = tf.metrics.mean(
              values=masked_lm_example_loss, weights=masked_lm_weights)

          return {
              "masked_lm_accuracy": masked_lm_accuracy,
              "masked_lm_loss": masked_lm_mean_loss,
          }

        eval_metrics = (metric_fn(per_example_loss, log_probs, label_ids,
                                  label_weights), loss)

        self.eval_metrics = eval_metrics
        return self.eval_metrics
def build(self, scope=None):
  if not self.is_training:
    self.hidden_dropout_prob = 0.0
    self.attention_probs_dropout_prob = 0.0

  with tf.variable_scope(scope, default_name="attentional_model"):
    self.build_input_pipeline()

    if self.is_bert_embedding:
      input_concat = self.embedding.get_bert_arg()
      mask_concat = self.embedding.get_attn_mask()
    else:
      self.embedding_table = self.init_embedding(self.embedding_placeholder)

      # Embedding lookup.
      with tf.variable_scope("embedding"):
        arg1 = tf.nn.embedding_lookup(self.embedding_table, self.arg1)
        arg2 = tf.nn.embedding_lookup(self.embedding_table, self.arg2)

      input_concat = tf.concat([arg1, arg2], axis=1)
      mask_concat = tf.concat([self.arg1_attn_mask, self.arg2_attn_mask],
                              axis=1)

    # If word_vector_width and hidden_size do not match, we need to project.
    if self.word_vector_width != self.hidden_size:
      with tf.variable_scope("bert_projection"):
        input_concat = tf.layers.dense(
            name="dense",
            inputs=input_concat,
            units=self.hidden_size,
            kernel_initializer=tf.truncated_normal_initializer(stddev=0.02),
            use_bias=False)

    # with tf.variable_scope("embedding_postprocess"):
    #   if not self.is_finetunable_bert_embedding:
    #     # Additional context encoding with segment_ids and positional
    #     # encoding ONLY when BERT is not being fine-tuned.
    #     batch_size = modeling.get_shape_list(input_concat, expected_rank=3)[0]
    #     segment_ids = tf.concat([
    #         tf.zeros([batch_size, self.max_arg_length], dtype=tf.int32),
    #         tf.ones([batch_size, self.max_arg_length], dtype=tf.int32)
    #     ], axis=1)
    #     input_concat = self.encode_concat_context(
    #         input_concat, segment_ids,
    #         hidden_dropout_prob=self.hidden_dropout_prob,
    #         use_segment_ids=True, use_position_embedding=True)

    with tf.variable_scope("encoder"):
      # Attention layers; for now keeping all encoder layers.
      self.all_encoder_layers = self.build_attn_layers(
          input_concat,
          attn_mask_concat=mask_concat,
          hidden_dropout_prob=self.hidden_dropout_prob,
          do_return_all_layers=True)
      self.sequence_output = self.all_encoder_layers[-1]

    with tf.variable_scope("pooler"):
      # See `CLS_ACTIONS` defined in `const.py`.
      pooled_tensor = self.apply_cls_pooling_fn(self.sequence_output)
      pooled_output = tf.layers.dense(
          pooled_tensor,
          self.hidden_size,
          activation=tf.tanh,
          kernel_initializer=modeling.create_initializer())

    logits = self.build_loss_op(pooled_output)

    self.preds = tf.cast(tf.argmax(logits, axis=-1), tf.int32, name="preds")
    self.correct = tf.cast(
        tf.equal(self.preds, self.label), "float", name="correct")
    self.acc = tf.reduce_mean(self.correct, name="acc")

    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('accuracy', self.acc)

    self.train_op = self.build_train_op()
def encode_concat_context(self,
                          input_tensor,
                          segment_ids,
                          segment_vocab_size=16,
                          max_position_embeddings=512,
                          hidden_dropout_prob=0.1,
                          initializer_range=0.02,
                          use_segment_ids=False,
                          use_position_embedding=False):
  """See `embedding_postprocessor` defined in `bert/modeling.py`"""
  input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_segment_ids:
    segment_table = tf.get_variable(
        name="segment_embeddings",
        shape=[segment_vocab_size, width],
        initializer=modeling.create_initializer(initializer_range))

    flat_segment_ids = tf.reshape(segment_ids, [-1])  # flatten
    one_hot_ids = tf.one_hot(flat_segment_ids, depth=segment_vocab_size)
    segment_embeddings = tf.matmul(one_hot_ids, segment_table)
    segment_embeddings = tf.reshape(segment_embeddings,
                                    [batch_size, seq_length, width])
    output += segment_embeddings

  if use_position_embedding:
    position_embeddings = tf.get_variable(
        name="position_embeddings",
        shape=[max_position_embeddings, width],
        initializer=modeling.create_initializer(initializer_range))
    # Since the position embedding table is a learned variable, we create it
    # using a (long) sequence length `max_position_embeddings`. The actual
    # sequence length might be shorter than this, for faster training of
    # tasks that do not have long sequences.
    #
    # So `full_position_embeddings` is effectively an embedding table
    # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
    # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
    # perform a slice.
    position_embeddings = tf.slice(position_embeddings, [0, 0],
                                   [seq_length, -1])

    num_dims = len(output.shape.as_list())

    # Only the last two dimensions are relevant (`seq_length` and `width`), so
    # we broadcast among the first dimensions, which is typically just
    # the batch size.
    position_broadcast_shape = []
    for _ in range(num_dims - 2):
      position_broadcast_shape.append(1)
    position_broadcast_shape.extend([seq_length, width])
    position_embeddings = tf.reshape(position_embeddings,
                                     position_broadcast_shape)
    output += position_embeddings

  output = modeling.layer_norm_and_dropout(output, hidden_dropout_prob)
  return output
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1,
                            mention_ids=None):
  """Performs various post-processing on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length,
      embedding_size].
    use_token_type: bool. Whether to add embeddings for `token_type_ids`.
    token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      Must be specified if `use_token_type` is True.
    token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
    token_type_embedding_name: string. The name of the embedding table variable
      for token type ids.
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output
      tensor.
    mention_ids: (optional) Tensor of shape [batch_size, seq_length] marking
      mention tokens.

  Returns:
    float tensor with same shape as `input_tensor`.

  Raises:
    ValueError: One of the tensor shapes or input values is invalid.
  """
  input_shape = get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor

  if use_token_type:
    if token_type_ids is None:
      raise ValueError("`token_type_ids` must be specified if"
                       "`use_token_type` is True.")
    token_type_table = tf.get_variable(
        name=token_type_embedding_name,
        shape=[token_type_vocab_size, width],
        initializer=create_initializer(initializer_range))
    # This vocab will be small so we always do one-hot here, since it is always
    # faster for a small vocabulary.
    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if mention_ids is not None:
    token_type_table = tf.get_variable(
        name='mention_marker',
        shape=[1, width],
        initializer=tf.zeros_initializer())
    # Mention ids are 0/1 markers, so the float markers are multiplied directly
    # with the single-row embedding table instead of a one-hot lookup.
    flat_token_type_ids = tf.reshape(mention_ids, [-1, 1])
    flat_token_type_ids = tf.cast(flat_token_type_ids, tf.float32)
    token_type_embeddings = tf.matmul(flat_token_type_ids, token_type_table)
    # one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
    # token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
    token_type_embeddings = tf.reshape(token_type_embeddings,
                                       [batch_size, seq_length, width])
    output += token_type_embeddings

  if use_position_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
    with tf.control_dependencies([assert_op]):
      full_position_embeddings = tf.get_variable(
          name=position_embedding_name,
          shape=[max_position_embeddings, width],
          initializer=create_initializer(initializer_range))
      # Since the position embedding table is a learned variable, we create it
      # using a (long) sequence length `max_position_embeddings`. The actual
      # sequence length might be shorter than this, for faster training of
      # tasks that do not have long sequences.
      #
      # So `full_position_embeddings` is effectively an embedding table
      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
      # perform a slice.
      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
                                     [seq_length, -1])
      num_dims = len(output.shape.as_list())

      # Only the last two dimensions are relevant (`seq_length` and `width`),
      # so we broadcast among the first dimensions, which is typically just
      # the batch size.
      position_broadcast_shape = []
      for _ in range(num_dims - 2):
        position_broadcast_shape.append(1)
      position_broadcast_shape.extend([seq_length, width])
      position_embeddings = tf.reshape(position_embeddings,
                                       position_broadcast_shape)
      output += position_embeddings

  output = layer_norm_and_dropout(output, dropout_prob)
  return output