def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get logits for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        return logits
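# Several snippets here call `gather_indexes` without defining it. In the
# reference BERT pre-training code it selects the hidden vectors at the masked
# positions from a [batch, seq_length, hidden] tensor. A minimal sketch along
# those lines (assuming the same `modeling.get_shape_list` helper) is:
def gather_indexes(sequence_tensor, positions):
    """Gathers the vectors at `positions` over a minibatch (sketch of the
    standard BERT helper)."""
    sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]

    # Offset of each example in the flattened [batch * seq_length, width] view.
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    # Result shape: [batch_size * num_positions, width].
    return tf.gather(flat_sequence_tensor, flat_positions)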
def __init__(self, config, input_embedding, input_mask=None):
    input_shape = modeling.get_shape_list(input_embedding, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32)

    # Keep variable names the same as BERT.
    with tf.variable_scope("bert"):
        with tf.variable_scope("encoder"):
            attention_mask = modeling.create_attention_mask_from_input_mask(
                input_embedding, input_mask)
            all_encoder_layers = modeling.transformer_model(
                input_tensor=input_embedding,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

    self.sequence_output = all_encoder_layers[-1]
def __init__(self):
    self.X = tf.placeholder(tf.int32, [None, None])

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=self.X,
        use_one_hot_embeddings=False)

    output_layer = model.get_sequence_output()
    embedding = model.get_embedding_table()

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                output_layer,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        self.logits = tf.nn.bias_add(logits, output_bias)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         top_k_indices, truncation_factor):
    """Get top-k log probs and probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        log_probs_student = tf.nn.log_softmax(logits, axis=-1)
        probs_student = tf.nn.softmax(logits, axis=-1)

        prob_shape = tf.shape(log_probs_student)
        # [batch_size * seq_len, truncation_factor]
        new_shape = [prob_shape[0], truncation_factor]

        top_k_indices = tf.reshape(top_k_indices, new_shape)
        top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
        top_k_probs_student = tf.batch_gather(probs_student, top_k_indices)

        return top_k_log_probs_student, top_k_probs_student
def get_masked_lm_output(self):
    self.input_tensor = self.gather_indexes()

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            self.input_tensor = tf.layers.dense(
                self.input_tensor,
                units=self.bert_config.hidden_size,
                activation=modeling.get_activation(self.bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    self.bert_config.initializer_range))
            self.input_tensor = modeling.layer_norm(self.input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[self.bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(self.input_tensor, self.output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_masked_lm_ids = tf.reshape(self.masked_lm_ids, [-1])
        one_hot_labels = tf.one_hot(
            flat_masked_lm_ids, depth=self.bert_config.vocab_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])

        # TODO: dynamic gather from per_example_loss???
        loss = tf.reshape(per_example_loss,
                          [-1, tf.shape(self.masked_lm_positions)[1]])
        return loss
def compute_image_transformer(
    self,
    input_ids,
    input_image,
    input_image_mask,
    input_positions,
    reuse=None,
):
    """Build the image transformer."""
    with tf.variable_scope(self.scope_prefix + "transformer", reuse=reuse):
        with tf.variable_scope("bridge"):
            image_emb = tf.layers.dense(
                inputs=input_image,
                units=self.config.hidden_size,
                activation=tf.nn.relu,
                kernel_initializer=modeling.create_initializer(
                    self.config.initializer_range),
                reuse=reuse)

        with tf.variable_scope("embeddings"):
            input_emb = tf.gather(self.embedding_table, input_ids)
            image_emb = tf.concat([input_emb, image_emb], axis=1)
            batch_size = tensor_utils.shape(image_emb, 0)
            sequence_length = tensor_utils.shape(image_emb, 1)

            position_emb = tf.gather(self.image_region_table, input_positions)
            position_emb = tf.pad(position_emb, [[0, 0], [1, 0], [0, 0]])

            input_order = tf.range(tensor_utils.shape(image_emb, 1))
            input_order = tf.tile(
                tf.expand_dims(input_order, 0),
                [tensor_utils.shape(image_emb, 0), 1])
            order_emb = tf.gather(self.image_order_table, input_order)

            input_segment_id = tf.fill([batch_size, sequence_length], self.IMG)
            segment_emb = tf.gather(self.segment_table, input_segment_id)

            input_emb = image_emb + position_emb + order_emb + segment_emb
            input_emb = modeling.layer_norm_and_dropout(
                input_emb, self.config.hidden_dropout_prob)

        with tf.variable_scope("image/encoder"):
            sequence_output, output_cache = compute_transformer(
                input_tensor=input_emb,
                attention_mask=tf.expand_dims(input_image_mask, 1),
                hidden_size=self.config.hidden_size,
                num_hidden_layers=self.config.num_hidden_layers,
                num_attention_heads=self.config.num_attention_heads,
                intermediate_size=self.config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(self.config.hidden_act),
                hidden_dropout_prob=self.config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    self.config.attention_probs_dropout_prob),
                initializer_range=self.config.initializer_range,
                input_cache=None)

        return sequence_output, output_cache
def build_bert_model(self, input_ids, input_mask, token_type_ids):
    with tf.variable_scope('bert'):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (embedding_output, _) = modeling.embedding_lookup(
                input_ids=input_ids,
                vocab_size=self.bert_config.vocab_size,
                embedding_size=self.bert_config.hidden_size,
                initializer_range=self.bert_config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=False)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            embedding_output = modeling.embedding_postprocessor(
                input_tensor=embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=self.bert_config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=self.bert_config.initializer_range,
                max_position_embeddings=self.bert_config.max_position_embeddings,
                dropout_prob=self.bert_config.hidden_dropout_prob)

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                input_ids, input_mask)

            # Run the stacked transformer, only fetching the final layer.
            # `final_layer` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=embedding_output,
                attention_mask=attention_mask,
                hidden_size=self.bert_config.hidden_size,
                num_hidden_layers=self.bert_config.num_hidden_layers,
                num_attention_heads=self.bert_config.num_attention_heads,
                intermediate_size=self.bert_config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(
                    self.bert_config.hidden_act),
                hidden_dropout_prob=self.bert_config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    self.bert_config.attention_probs_dropout_prob),
                initializer_range=self.bert_config.initializer_range,
                do_return_all_layers=True)

    self.sequence_output = self.all_encoder_layers[-1]
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_weights, truncated_masked_lm_probs_teacher,
                         top_k_indices, truncation_factor):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs_student = tf.nn.log_softmax(logits, axis=-1)

        label_weights = tf.reshape(label_weights, [-1])

        prob_shape = tf.shape(log_probs_student)
        # [batch_size * seq_len, truncation_factor]
        new_shape = [prob_shape[0], truncation_factor]

        top_k_indices = tf.reshape(top_k_indices, new_shape)
        top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
        truncated_masked_lm_probs_teacher = tf.reshape(
            truncated_masked_lm_probs_teacher, new_shape)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            truncated_masked_lm_probs_teacher * top_k_log_probs_student, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs_student)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights_flat = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights_flat * per_example_loss)
        denominator = tf.reduce_sum(label_weights_flat) + 1e-5
        loss = numerator / denominator

        # Scale the weighted-mean loss back up by the batch size.
        batch_size = tf.cast(tf.shape(label_weights)[0], tf.float32)
        print('==============')
        print(label_weights.shape)
        print('==============')
        loss = batch_size * loss

    return (loss, per_example_loss, log_probs)
def build(self, unused_input_shapes):
    """Implements build() for the layer."""
    self.lm_dense = tf.keras.layers.Dense(
        self.config.hidden_size,
        activation=modeling.get_activation(self.config.hidden_act),
        kernel_initializer=self.initializer)
    self.lm_bias = self.add_weight(
        shape=[self.config.vocab_size],
        name='lm_bias',
        initializer=tf.keras.initializers.Zeros())
    self.lm_layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12)
    self.next_sentence_dense = tf.keras.layers.Dense(
        self.num_next_sentence_label,
        kernel_initializer=self.initializer)
    super(BertPretrainLayer, self).build(unused_input_shapes)
def bert_module_fn(is_training):
    """Spec function for a token embedding module."""
    input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")

    bert_config = modeling.BertConfig.from_json_file(config_path)
    model = modeling.BertModel(
        config=bert_config, is_training=is_training, input_ids=input_ids)

    output_layer = model.get_sequence_output()
    embedding = model.get_embedding_table()

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                output_layer,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

    config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
    vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
    lower_case = tf.constant(do_lower_case)

    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

    input_map = {"input_ids": input_ids}
    output_map = {"logits": logits}
    output_info_map = {"vocab_file": vocab_file, "do_lower_case": lower_case}

    hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
    hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map)
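# The spec function above is only half of the TF-Hub export story: it still has
# to be wrapped in a module spec and instantiated. A minimal sketch of that
# wiring under the TF1 hub API, assuming `config_path`, `vocab_path`, and
# `do_lower_case` are defined as in the snippet (input ids below are
# hypothetical):
#
#   import tensorflow_hub as hub
#
#   spec = hub.create_module_spec(
#       bert_module_fn,
#       tags_and_args=[({"train"}, {"is_training": True}),
#                      (set(), {"is_training": False})])
#   module = hub.Module(spec, trainable=False)
#   outputs = module({"input_ids": tf.constant([[101, 2023, 103, 102]])},
#                    signature="tokens", as_dict=True)
#   logits = outputs["logits"]  # masked-LM logits over the vocabulary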
def forward(x, segment, masks, y, reuse=False, config=bert_config):
    with tf.variable_scope('bert', reuse=reuse):
        model = modeling.BertModel(
            config=config,
            is_training=training,
            input_ids=x,
            input_mask=masks,
            token_type_ids=segment,
            use_one_hot_embeddings=False)
        memory = model.get_sequence_output()

    with tf.variable_scope('bert', reuse=True):
        Y_seq_len = tf.count_nonzero(y, 1, dtype=tf.int32)
        y_masks = tf.sequence_mask(
            Y_seq_len, tf.reduce_max(Y_seq_len), dtype=tf.float32)
        model = modeling_decoder.BertModel(
            config=config,
            is_training=training,
            input_ids=y,
            input_mask=y_masks,
            memory=memory,
            memory_mask=masks,
            use_one_hot_embeddings=False)
        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()

    with tf.variable_scope('cls/predictions', reuse=reuse):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                output_layer,
                units=config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding, transpose_b=True)
    return logits
def compute_logits(self, target_emb, reuse=None):
    """Compute logits for word prediction."""
    with tf.variable_scope(self.scope_prefix + "cls/predictions", reuse=reuse):
        with tf.variable_scope("transform"):
            target_emb = tf.layers.dense(
                target_emb,
                units=self.config.hidden_size,
                activation=modeling.get_activation(self.config.hidden_act),
                kernel_initializer=self.initializer)
            target_emb = modeling.layer_norm(target_emb)
        output_bias = tf.get_variable(
            "output_bias",
            shape=[self.config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(target_emb, self.embedding_table, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get per-position loss for the masked LM."""
    print("input tensor before gather_indexes:", input_tensor)
    input_tensor = gather_indexes(input_tensor, positions)
    print("input tensor after gather_indexes:", input_tensor)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        print(label_ids)
        label_ids = tf.reshape(label_ids, [-1])
        print(label_ids)

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
        print(one_hot_labels)
        print(log_probs)

        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        print(per_example_loss)

        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        print('positions: ', positions)
        print('loss', loss)
        # TODO: dynamic gather from per_example_loss???
        return loss
def __init__(self, config, input_hidden, embedding_table):
    # Keep variable names the same as BERT.
    with tf.variable_scope("cls"):
        with tf.variable_scope("predictions"):
            with tf.variable_scope("transform"):
                self.transformed_output = tf.layers.dense(
                    input_hidden,
                    config.hidden_size,
                    activation=modeling.get_activation(config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
                self.transformed_output = modeling.layer_norm(
                    self.transformed_output)

            output_bias = tf.Variable(
                tf.zeros([config.vocab_size]), name="output_bias")
            self.final_output = tf.add(
                tf.matmul(self.transformed_output, tf.transpose(embedding_table)),
                output_bias)
            self.probs = tf.nn.softmax(self.final_output, name='token_probs')
def compute_transformer(
    self,
    input_ids,
    input_segment_id,
    input_positions,
    attention_mask,
    input_cache=None,
    reuse=None,
    conditional=False,
):
    """Build the full text transformer."""
    with tf.variable_scope(self.scope_prefix + "transformer", reuse=reuse):
        with tf.variable_scope("embeddings"):
            token_emb = tf.gather(self.embedding_table, input_ids)
            segment_embed = tf.gather(self.segment_table, input_segment_id)
            if conditional:
                position_table = self.condition_position_table
            else:
                position_table = self.position_table
            position_emb = tf.gather(position_table, input_positions)

            input_emb = token_emb + segment_embed + position_emb
            input_emb = modeling.layer_norm_and_dropout(
                input_emb, self.config.hidden_dropout_prob)

        with tf.variable_scope("encoder"):
            sequence_output, output_cache = compute_transformer(
                input_tensor=input_emb,
                attention_mask=attention_mask,
                hidden_size=self.config.hidden_size,
                num_hidden_layers=self.config.num_hidden_layers,
                num_attention_heads=self.config.num_attention_heads,
                intermediate_size=self.config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(self.config.hidden_act),
                hidden_dropout_prob=self.config.hidden_dropout_prob,
                attention_probs_dropout_prob=(
                    self.config.attention_probs_dropout_prob),
                initializer_range=self.config.initializer_range,
                input_cache=input_cache)

        return sequence_output, output_cache
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         truncation_factor):
    """Get truncated top-k probs and their vocab indices for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        masked_lm_probs = tf.nn.softmax(logits, axis=-1)
        trunc_masked_lm_probs, top_indices = tf.math.top_k(
            masked_lm_probs, k=truncation_factor, sorted=False)

        max_predictions_per_seq = positions.get_shape().as_list()[1]
        truncation_factor_ = top_indices.get_shape().as_list()[1]
        trunc_masked_lm_probs = tf.reshape(
            trunc_masked_lm_probs,
            [-1, max_predictions_per_seq, truncation_factor_])
        top_indices = tf.reshape(
            top_indices, [-1, max_predictions_per_seq, truncation_factor_])

        return trunc_masked_lm_probs, top_indices
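# The truncated distribution above pairs with the distillation-style loss in
# the earlier snippet: the teacher's top-k probabilities and vocab indices are
# gathered against the student's log-probabilities and combined as a truncated
# cross-entropy. A small NumPy sketch of just that arithmetic (purely
# hypothetical numbers, 2 masked positions, vocab of 5, truncation_factor of 2):
#
#   import numpy as np
#
#   teacher_probs = np.array([[0.6, 0.3], [0.5, 0.4]])    # top-k teacher probs
#   top_k_indices = np.array([[2, 0], [1, 4]])             # their vocab indices
#   student_log_probs = np.log(np.full((2, 5), 0.2))       # uniform student
#
#   # Equivalent of tf.batch_gather: student log-probs at the teacher's indices.
#   top_k_log_probs_student = np.take_along_axis(
#       student_log_probs, top_k_indices, axis=-1)
#
#   # Per-position loss: -sum_k p_teacher(k) * log p_student(k).
#   per_example_loss = -np.sum(teacher_probs * top_k_log_probs_student, axis=-1)
#   # ~[1.45, 1.45] nats for a uniform student over 5 tokens.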
def __init__(self):
    BERT_CONFIG = "PATH_TO/multi_cased_L-12_H-768_A-12/bert_config.json"
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

    self.X = tf.placeholder(tf.int32, [None, None])
    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=self.X,
        use_one_hot_embeddings=False)

    output_layer = model.get_sequence_output()
    embedding = model.get_embedding_table()
    output_layer = tf.reshape(output_layer, [-1, bert_config.hidden_size])

    with tf.variable_scope("cls/predictions"):
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                output_layer,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        self.logits = tf.nn.bias_add(logits, output_bias)
def __init__(
    self,
    bert_config,
    input_ids,
    input_mask,
    token_type_ids,
    Y,
    is_training=True,
):
    self.X = input_ids
    self.segment_ids = token_type_ids
    self.input_masks = input_mask
    self.Y = Y
    self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
    self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
    batch_size = tf.shape(self.X)[0]

    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=self.X,
        input_mask=self.input_masks,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=False)
    print(bert_config.__dict__)

    BASE_PARAMS = defaultdict(
        lambda: None,
        default_batch_size=2048,
        default_batch_size_tpu=32768,
        max_length=bert_config.max_position_embeddings,
        initializer_gain=1.0,
        vocab_size=bert_config.vocab_size,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_heads=bert_config.num_attention_heads,
        filter_size=bert_config.intermediate_size,
        layer_postprocess_dropout=0.1,
        attention_dropout=0.1,
        relu_dropout=0.1,
        label_smoothing=0.1,
        learning_rate=1.0,
        learning_rate_decay_rate=1.0,
        learning_rate_warmup_steps=16000,
        optimizer_adam_beta1=0.9,
        optimizer_adam_beta2=0.997,
        optimizer_adam_epsilon=1e-09,
        extra_decode_length=50,
        beam_size=4,
        alpha=0.6,
        use_tpu=False,
        static_batch=False,
        allow_ffn_pad=True,
    )

    self.decoder_stack = DecoderStack(BASE_PARAMS, is_training)
    attention_bias = model_utils.get_padding_bias(self.X)

    output_layer = model.get_sequence_output()
    pooled_output = model.get_pooled_output()
    embedding = model.get_embedding_table()

    with tf.name_scope('decode'):
        mask = tf.to_float(tf.not_equal(self.Y, 0))
        decoder_inputs = tf.gather(embedding, self.Y)
        decoder_inputs *= tf.expand_dims(mask, -1)
        with tf.name_scope('shift_targets'):
            decoder_inputs = tf.pad(
                decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
        with tf.name_scope('add_pos_encoding'):
            length = tf.shape(decoder_inputs)[1]
            decoder_inputs += model_utils.get_position_encoding(
                length, BASE_PARAMS['hidden_size'])
        if is_training:
            decoder_inputs = tf.nn.dropout(
                decoder_inputs, 1 - BASE_PARAMS['layer_postprocess_dropout'])

        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs = self.decoder_stack(
            decoder_inputs,
            output_layer,
            decoder_self_attention_bias,
            attention_bias,
        )

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                outputs,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        self.training_logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        print(self.training_logits)
def __init__(self, bert_config, tokenizer, cls, sep):
    _graph = tf.Graph()
    with _graph.as_default():
        self.X = tf.placeholder(tf.int32, [None, None])
        self.top_p = tf.placeholder(tf.float32, None)
        self.top_k = tf.placeholder(tf.int32, None)
        self.k = tf.placeholder(tf.int32, None)
        self.temperature = tf.placeholder(tf.float32, None)
        self.indices = tf.placeholder(tf.int32, [None, None])
        self._tokenizer = tokenizer
        self._cls = cls
        self._sep = sep

        self.model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        self.logits = self.model.get_pooled_output()
        output_layer = self.model.get_sequence_output()
        embedding = self.model.get_embedding_table()

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer())
            logits = tf.matmul(input_tensor, embedding, transpose_b=True)
            self._logits = tf.nn.bias_add(logits, output_bias)
            self._log_softmax = tf.nn.log_softmax(self._logits)

        logits = tf.gather_nd(self._logits, self.indices)
        logits = logits / self.temperature

        def nucleus():
            return top_p_logits(logits, self.top_p)

        def select_k():
            return top_k_logits(logits, self.top_k)

        logits = tf.cond(self.top_p > 0, nucleus, select_k)
        self.samples = tf.multinomial(
            logits, num_samples=self.k, output_dtype=tf.int32)

        self._sess = tf.InteractiveSession()
        self._sess.run(tf.global_variables_initializer())

        var_lists = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='bert')
        cls = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='cls')
        self._saver = tf.train.Saver(var_list=var_lists + cls)
        attns = _extract_attention_weights(
            bert_config.num_hidden_layers, tf.get_default_graph())
        self.attns = attns
def __init__(self,
             config,
             is_training,
             input_ids,
             image_embeddings,
             input_mask=None,
             token_type_ids=None,
             use_one_hot_embeddings=False,
             scope=None):
    """Constructor for a visually grounded BertModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      image_embeddings: float32 Tensor of shape [batch_size, seq_length, depth].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings.
      scope: (optional) variable scope. Defaults to "bert".

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    text_input_shape = modeling.get_shape_list(input_ids, expected_rank=2)
    batch_size = text_input_shape[0]
    text_seq_length = text_input_shape[1]

    if input_mask is None:
        input_mask = tf.ones(shape=[batch_size, text_seq_length], dtype=tf.int32)
    if token_type_ids is None:
        token_type_ids = tf.zeros(shape=[batch_size, text_seq_length], dtype=tf.int32)

    with tf.variable_scope(scope, default_name="bert"):
        with tf.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (self.embedding_output,
             self.embedding_table) = modeling.embedding_lookup(
                 input_ids=input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

            # Add the projected image embeddings to the rest of the input
            # embeddings.
            self.embedding_output += tf.layers.dense(
                image_embeddings,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                self.embedding_output, input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
def __init__(self, config, is_training, input_tensor, input_mask, token_type_ids):
    """Constructor for BertFlexEmbeddingModel.

    Args:
      config: `BertConfig` instance.
      is_training: bool. true for training model, false for eval model. Controls
        whether dropout will be applied.
      input_tensor: float32 Tensor of shape [batch_size, seq_length, hidden_size].
      input_mask: (optional) int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    config = copy.deepcopy(config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("bert", reuse=tf.compat.v1.AUTO_REUSE):
        with tf.variable_scope("embeddings"):
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=input_tensor,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope("encoder"):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                input_tensor, input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))
def __init__(
    self,
    input_ids,
    input_mask,
    token_type_ids,
    Y,
    learning_rate=2e-5,
    is_training=True,
):
    self.X = input_ids
    self.segment_ids = token_type_ids
    self.input_masks = input_mask
    self.Y = Y
    self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
    self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
    batch_size = tf.shape(self.X)[0]

    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=self.X,
        input_mask=self.input_masks,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=False)

    self.decoder_stack = DecoderStack(BASE_PARAMS, is_training)
    attention_bias = model_utils.get_padding_bias(self.X)

    output_layer = model.get_sequence_output()
    pooled_output = model.get_pooled_output()
    embedding = model.get_embedding_table()

    with tf.name_scope('decode'):
        mask = tf.to_float(tf.not_equal(self.Y, 0))
        decoder_inputs = tf.gather(embedding, self.Y)
        decoder_inputs *= tf.expand_dims(mask, -1)
        with tf.name_scope('shift_targets'):
            decoder_inputs = tf.pad(
                decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
        with tf.name_scope('add_pos_encoding'):
            length = tf.shape(decoder_inputs)[1]
            decoder_inputs += model_utils.get_position_encoding(
                length, BASE_PARAMS['hidden_size'])
        if is_training:
            decoder_inputs = tf.nn.dropout(
                decoder_inputs, 1 - BASE_PARAMS['layer_postprocess_dropout'])

        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs = self.decoder_stack(
            decoder_inputs,
            output_layer,
            decoder_self_attention_bias,
            attention_bias,
        )

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                outputs,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        self.training_logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        print(self.training_logits)
def create_mask_model(bert_config, is_training, input_ids, input_mask,
                      segment_ids, mask_positions, use_one_hot_embeddings):
    """Creates a classification model."""
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # Get the logits for the start and end predictions.
    final_hidden = model.get_sequence_output()

    final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3)
    batch_size = final_hidden_shape[0]
    seq_length = final_hidden_shape[1]
    hidden_size = final_hidden_shape[2]

    output_weights = tf.get_variable(
        "cls/nq/output_weights", [2, hidden_size + 12],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "cls/nq/output_bias", [2], initializer=tf.zeros_initializer())

    final_hidden_matrix = tf.reshape(final_hidden,
                                     [batch_size * seq_length, hidden_size])

    # Append the mask-position indicator (padded out to 12 extra channels) to
    # every token representation.
    mask_positions_matrix = tf.cast(
        tf.reshape(mask_positions, [batch_size * seq_length, 1]), dtype=tf.float32)
    padding = tf.zeros([batch_size * seq_length, 11], dtype=tf.float32)
    mask_positions_matrix = tf.concat([mask_positions_matrix, padding], axis=-1)
    final_hidden_matrix = tf.concat([final_hidden_matrix, mask_positions_matrix],
                                    axis=-1)
    final_hidden_matrix = tf.reshape(final_hidden_matrix,
                                     [batch_size, seq_length, hidden_size + 12])

    attention_mask = modeling.create_attention_mask_from_input_mask(
        input_ids, input_mask)
    config = bert_config
    all_encoder_layers = modeling.transformer_model(
        input_tensor=final_hidden_matrix,
        attention_mask=attention_mask,
        hidden_size=config.hidden_size + 12,  # input hidden size
        num_hidden_layers=1,  # config.num_hidden_layers,
        num_attention_heads=config.num_attention_heads,
        intermediate_size=config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(config.hidden_act),
        hidden_dropout_prob=config.hidden_dropout_prob,
        attention_probs_dropout_prob=config.attention_probs_dropout_prob,
        initializer_range=config.initializer_range,
        do_return_all_layers=True)

    transformer_output_matrix = all_encoder_layers[-1]
    transformer_output_matrix = tf.reshape(
        transformer_output_matrix, [batch_size * seq_length, hidden_size + 12])

    logits = tf.matmul(transformer_output_matrix, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    logits = tf.reshape(logits, [batch_size, seq_length, 2])
    logits = tf.transpose(logits, [2, 0, 1])

    unstacked_logits = tf.unstack(logits, axis=0)
    (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1])

    # Get the logits for the answer type prediction.
    answer_type_output_layer = model.get_pooled_output()
    answer_type_hidden_size = answer_type_output_layer.shape[-1].value

    num_answer_types = 5  # YES, NO, UNKNOWN, SHORT, LONG
    answer_type_output_weights = tf.get_variable(
        "answer_type_output_weights", [num_answer_types, answer_type_hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    answer_type_output_bias = tf.get_variable(
        "answer_type_output_bias", [num_answer_types],
        initializer=tf.zeros_initializer())
    answer_type_logits = tf.matmul(
        answer_type_output_layer, answer_type_output_weights, transpose_b=True)
    answer_type_logits = tf.nn.bias_add(answer_type_logits, answer_type_output_bias)

    return (start_logits, end_logits, answer_type_logits)
def main(args):
    bert_config = modeling.BertConfig.from_json_file(args.config)
    bert_config.hidden_dropout_prob = 0.0
    bert_config.attention_probs_dropout_prob = 0.0

    batch_size = args.batch_size
    avg_seq_len = args.avg_seq_length
    max_seq_len = args.max_seq_length
    tf_dtype = tf.float16 if args.precision == 'fp16' else tf.float32

    # fake input array length
    input_len = np.random.randint(
        low=2 * avg_seq_len - max_seq_len,
        high=max_seq_len + 1,
        size=(batch_size),
        dtype=np.int32)
    valid_word_num = sum(input_len)

    # fake input id and mask
    input_ids = np.random.randint(
        low=0, high=bert_config.vocab_size,
        size=(batch_size, max_seq_len), dtype=np.int32)
    input_mask = np.zeros((batch_size, max_seq_len), dtype=np.int32)
    for b_idx, s_len in enumerate(input_len):
        input_mask[b_idx][:s_len] = 1

    input_ids_tensor = tf.convert_to_tensor(input_ids, dtype=tf.int32)
    input_mask_tensor = tf.convert_to_tensor(input_mask, dtype=tf.int32)

    # fake embedding output
    embed_output = np.random.randn(batch_size, max_seq_len, bert_config.hidden_size)
    input_tensor = tf.convert_to_tensor(embed_output, dtype=tf_dtype)

    # keep attention_mask for compatibility
    att_mask = np.tile(input_mask, max_seq_len)
    att_mask = att_mask.reshape(batch_size, max_seq_len, max_seq_len)
    attention_mask = tf.convert_to_tensor(att_mask, dtype=tf_dtype)

    # input info
    valid_word_num = sum(input_len)
    print("Valid word num : {}/{}, avg sequence length : {:.6} ".format(
        valid_word_num, batch_size * max_seq_len, valid_word_num / batch_size))

    # bert with standard transformer
    std_bert = modeling.transformer_model(
        input_tensor=input_tensor,
        attention_mask=attention_mask,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=modeling.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        initializer_range=bert_config.initializer_range,
        do_return_all_layers=False)

    config = tf.ConfigProto()
    config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1
    with tf.Session(config=config) as sess:
        # init weights
        sess.run(tf.global_variables_initializer())

        # get transformer weights
        all_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        transformer_vars = [v for v in all_vars if v.name.startswith('layer')]
        weights_value = sess.run(transformer_vars)

        # bert with effective transformer
        et_bert = effective_transformer.get_sequence_output(
            max_batch_size=batch_size,
            max_seq_length=max_seq_len,
            config=bert_config,
            attention_mask=attention_mask,
            input_mask=input_mask_tensor,
            from_tensor=input_tensor,
            weights_value=weights_value,
        )

        # diff
        val1 = sess.run(std_bert).reshape(-1, 768)
        val2 = sess.run(et_bert).reshape(-1, 768)
        diff = []
        for b_idx, s_len in enumerate(input_len):
            for w_idx in range(s_len):
                idx = b_idx * args.max_seq_length + w_idx
                diff.append(np.fabs(val1[idx] - val2[idx]).max())
        print("max diff : {:.6}, avg diff : {:.6}.".format(
            max(diff), sum(diff) / len(diff)))

        def time_inference(output_tensor):
            iter_num = 128
            # warm up
            for i in range(10):
                sess.run(output_tensor)
            beg = datetime.now()
            for i in range(iter_num):
                sess.run(output_tensor)
            end = datetime.now()
            return (end - beg).total_seconds() * 1000 / iter_num  # ms

        print("xla cost : {:.6} ms".format(time_inference(std_bert)))
        print("et cost : {:.6} ms".format(time_inference(et_bert)))
def __call__(self, features, hidden_feature, mode, problem_name):
    """Get loss and log probs for the masked LM.

    DO NOT CHANGE THE VARIABLE SCOPE.
    """
    seq_hidden_feature = hidden_feature['seq']
    positions = features['masked_lm_positions']
    input_tensor = gather_indexes(seq_hidden_feature, positions)
    output_weights = hidden_feature['embed_table']
    label_ids = features['masked_lm_ids']
    label_weights = features['masked_lm_weights']

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=self.params.mask_lm_hidden_size,
                activation=modeling.get_activation(self.params.mask_lm_hidden_act),
                kernel_initializer=modeling.create_initializer(
                    self.params.mask_lm_initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[self.params.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        self.logits = logits
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            self.prob = log_probs
            return self.prob
        else:
            label_ids = tf.reshape(label_ids, [-1])
            label_weights = tf.reshape(label_weights, [-1])

            one_hot_labels = tf.one_hot(
                label_ids, depth=self.params.vocab_size, dtype=tf.float32)

            # The `positions` tensor might be zero-padded (if the sequence is too
            # short to have the maximum number of predictions). The `label_weights`
            # tensor has a value of 1.0 for every real prediction and 0.0 for the
            # padding predictions.
            per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
            numerator = tf.reduce_sum(label_weights * per_example_loss)
            denominator = tf.reduce_sum(label_weights) + 1e-5
            loss = numerator / denominator

            if mode == tf.estimator.ModeKeys.TRAIN:
                self.loss = loss
                return self.loss
            else:

                def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                              masked_lm_ids, masked_lm_weights):
                    """Computes the loss and accuracy of the model."""
                    masked_lm_log_probs = tf.reshape(
                        masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
                    masked_lm_predictions = tf.argmax(
                        masked_lm_log_probs, axis=-1, output_type=tf.int32)
                    masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
                    masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                    masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                    masked_lm_accuracy = tf.metrics.accuracy(
                        labels=masked_lm_ids,
                        predictions=masked_lm_predictions,
                        weights=masked_lm_weights)
                    masked_lm_mean_loss = tf.metrics.mean(
                        values=masked_lm_example_loss, weights=masked_lm_weights)
                    return {
                        "masked_lm_accuracy": masked_lm_accuracy,
                        "masked_lm_loss": masked_lm_mean_loss,
                    }

                eval_metrics = (metric_fn(per_example_loss, log_probs, label_ids,
                                          label_weights), loss)
                self.eval_metrics = eval_metrics
                return self.eval_metrics
def aggregate_embedding(embeddings,
                        segment_idx,
                        aggregator,
                        config=None,
                        aux=None,
                        name=None):
    # segment_idx denotes different needles, rather than rows.
    if aggregator == 'segment_sqrt_n':
        denom = to_col(
            tf.sqrt(
                tf.to_float(tf.segment_sum(tf.ones_like(segment_idx), segment_idx))))
        output_layer = tf.div_no_nan(
            tf.segment_sum(embeddings, segment_idx), denom, name=name)
    elif aggregator in ['segment_sum', 'segment_mean']:
        output_layer = getattr(tf, aggregator)(embeddings, segment_idx, name=name)
    else:
        del embeddings
        assert aggregator.startswith('transformer')

        # Optional overrides can be appended to the aggregator name after '^',
        # as comma-separated key@value pairs.
        flags = {}
        if '^' in aggregator:
            flags = [
                kv.split('@')
                for kv in filter(None, aggregator.split('^')[1].split(','))
            ]
            flags = {k: eval(v) for k, v in flags}

        assert config is not None and aux is not None
        needle_pos = aux['needle_pos']
        embedding_output = aux['sequence_output']
        batch_idx2 = aux['batch_idx2']  # different rows.
        is_training = aux['is_training']
        attention_mask = get_dense_mask(needle_pos, batch_idx2,
                                        tf.shape(embedding_output)[:2])

        with tf.variable_scope('final_transformer'):
            all_encoder_layers = modeling.transformer_model(
                input_tensor=embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,  # this must agree with input width.
                num_hidden_layers=flags.get('num_hidden_layers', 1),
                num_attention_heads=flags.get('num_attention_heads',
                                              config.num_attention_heads),
                intermediate_size=flags.get('intermediate_size',
                                            config.intermediate_size),
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=flags.get('hidden_dropout_prob',
                                              config.hidden_dropout_prob) *
                int(is_training),
                attention_probs_dropout_prob=int(is_training) *
                flags.get('attention_probs_dropout_prob',
                          config.attention_probs_dropout_prob),
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
        first_token_tensor = all_encoder_layers[-1][:, 0, :]
        output_layer = tf.layers.dense(
            first_token_tensor,
            config.hidden_size,
            activation=tf.tanh,
            kernel_initializer=modeling.create_initializer(
                config.initializer_range))

    return output_layer
def __init__(self,
             config,
             use_one_hot_embeddings=True,
             num_labels=2,
             max_seq_length=128):
    """Constructor for BertModel.

    Args:
      config: `BertConfig` instance.
      use_one_hot_embeddings: (optional) bool. Whether to use one-hot word
        embeddings or tf.embedding_lookup() for the word embeddings. On the TPU,
        it is much faster if this is True, on the CPU or GPU, it is faster if
        this is False.
      num_labels: number of labels for the classifier output layer.
      max_seq_length: static sequence length of the input placeholders.

    Raises:
      ValueError: The config is invalid or one of the input tensor shapes
        is invalid.
    """
    self.input_ids = tf.placeholder(dtype=tf.int32, shape=(None, max_seq_length))
    self.input_mask = tf.placeholder(dtype=tf.int8, shape=(None, max_seq_length))

    config = copy.deepcopy(config)

    input_shape = modeling.get_shape_list(self.input_ids, expected_rank=2)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32)

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            # Perform embedding lookup on the word ids.
            (self.embedding_output,
             self.embedding_table) = modeling.embedding_lookup(
                 input_ids=self.input_ids,
                 vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = modeling.embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            attention_mask = modeling.create_attention_mask_from_input_mask(
                self.input_ids, self.input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            self.all_encoder_layers = modeling.transformer_model(
                input_tensor=self.embedding_output,
                attention_mask=attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

        self.sequence_output = self.all_encoder_layers[-1]

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        with tf.variable_scope("pooler", reuse=tf.AUTO_REUSE):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token. We assume that this has been pre-trained.
            first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
            self.pooled_output = tf.layers.dense(
                first_token_tensor,
                config.hidden_size,
                activation=tf.tanh,
                kernel_initializer=modeling.create_initializer(
                    config.initializer_range))

    # Define output_weights and output_bias for the classifier head.
    hidden_size = self.pooled_output.shape[-1].value
    with tf.variable_scope("", reuse=tf.AUTO_REUSE):
        self.output_weights = tf.get_variable(
            "output_weights", [num_labels, hidden_size],
            initializer=tf.truncated_normal_initializer(stddev=0.02))
        self.output_bias = tf.get_variable(
            "output_bias", [num_labels], initializer=tf.zeros_initializer())
def create_bilstm_classification_model(bert_config, is_training, response_input_ids, response_input_mask, response_segment_ids, response_text_len, response_labels, random_forward_input_ids, random_forward_input_mask, random_forward_segment_ids, random_forward_text_len, random_backward_input_ids, random_backward_input_mask, random_backward_segment_ids, random_backward_text_len, random_labels, swap_forward_input_ids, swap_forward_input_mask, swap_forward_segment_ids, swap_forward_text_len, swap_backward_input_ids, swap_backward_input_mask, swap_backward_segment_ids, swap_backward_text_len, swap_labels, nli_forward_input_ids, nli_forward_input_mask, nli_forward_segment_ids, nli_forward_text_len, nli_backward_input_ids, nli_backward_input_mask, nli_backward_segment_ids, nli_backward_text_len, nli_labels, num_nli_labels, use_one_hot_embeddings, l2_reg_lambda=0.1, dropout_rate=1.0, lstm_size=None, num_layers=1): config = copy.deepcopy(bert_config) if not is_training: config.hidden_dropout_prob = 0.0 config.attention_probs_dropout_prob = 0.0 with tf.variable_scope("bert", reuse=tf.AUTO_REUSE): with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE): (response_embedding_output, response_embedding_table) = modeling.embedding_lookup( input_ids=response_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) response_embedding_output = modeling.embedding_postprocessor( input_tensor=response_embedding_output, use_token_type=not config.roberta, token_type_ids=response_segment_ids, token_type_vocab_size=config.type_vocab_size, token_type_embedding_name="token_type_embeddings", use_position_embeddings=True, position_embedding_name="position_embeddings", initializer_range=config.initializer_range, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob, roberta=config.roberta) # random detection # Perform embedding lookup on the word ids. (random_foward_embedding_output, random_forward_embedding_table) = modeling.embedding_lookup( input_ids=random_forward_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) # Perform embedding lookup on the word ids. (random_backward_embedding_output, random_backward_embedding_table) = modeling.embedding_lookup( input_ids=random_backward_input_ids, vocab_size=config.vocab_size, embedding_size=config.hidden_size, initializer_range=config.initializer_range, word_embedding_name="word_embeddings", use_one_hot_embeddings=use_one_hot_embeddings) # Add positional embeddings and token type embeddings, then layer # normalize and perform dropout. 
            # random detection
            # Perform embedding lookup on the word ids.
            (random_foward_embedding_output, random_forward_embedding_table) = modeling.embedding_lookup(
                input_ids=random_forward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            # Perform embedding lookup on the word ids.
            (random_backward_embedding_output, random_backward_embedding_table) = modeling.embedding_lookup(
                input_ids=random_backward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            random_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            random_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # swap detection
            (swap_foward_embedding_output, swap_forward_embedding_table) = modeling.embedding_lookup(
                input_ids=swap_forward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            (swap_backward_embedding_output, swap_backward_embedding_table) = modeling.embedding_lookup(
                input_ids=swap_backward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            swap_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            swap_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            # # generic detection
            # (generic_foward_embedding_output, generic_forward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_forward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # (generic_backward_embedding_output, generic_backward_embedding_table) = modeling.embedding_lookup(
            #     input_ids=generic_backward_input_ids,
            #     vocab_size=config.vocab_size,
            #     embedding_size=config.hidden_size,
            #     initializer_range=config.initializer_range,
            #     word_embedding_name="word_embeddings",
            #     use_one_hot_embeddings=use_one_hot_embeddings)
            # generic_foward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_foward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_forward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)
            # generic_backward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_backward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_backward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)

            # nli detection
            (nli_foward_embedding_output, nli_forward_embedding_table) = modeling.embedding_lookup(
                input_ids=nli_forward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            (nli_backward_embedding_output, nli_backward_embedding_table) = modeling.embedding_lookup(
                input_ids=nli_backward_input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=use_one_hot_embeddings)

            nli_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            nli_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            response_attention_mask = modeling.create_attention_mask_from_input_mask(
                response_input_ids, response_input_mask)
            # [batch_size, from_seq_length, to_seq_length]
            # mask future tokens
            diag_vals = tf.ones_like(response_attention_mask[0, :, :])
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
            future_masks = tf.tile(tf.expand_dims(tril, 0),
                                   [tf.shape(response_attention_mask)[0], 1, 1])
            response_attention_mask = tf.math.multiply(response_attention_mask,
                                                       future_masks)
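For the response branch, the usual padding-based attention mask is multiplied by a lower-triangular "future" mask, so position i can only attend to positions at or before i; this is what lets the otherwise bidirectional encoder be trained as a left-to-right LM over the response. A NumPy sketch of the same mask construction (the function name is illustrative):

import numpy as np

def causal_attention_mask(input_mask):
    """input_mask: [batch, seq_len] of 0/1 padding flags -> [batch, seq_len, seq_len]."""
    batch, seq_len = input_mask.shape
    # padding mask broadcast to [batch, from_seq, to_seq],
    # analogous to create_attention_mask_from_input_mask
    pad_mask = np.tile(input_mask[:, None, :], (1, seq_len, 1)).astype(np.float32)
    # lower-triangular future mask, tiled over the batch
    tril = np.tril(np.ones((seq_len, seq_len), dtype=np.float32))
    return pad_mask * tril[None, :, :]

mask = causal_attention_mask(np.array([[1, 1, 1, 0]]))
print(mask[0])
# row i has ones only at positions j <= i that are not padding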
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            response_all_encoder_layers = modeling.transformer_model(
                input_tensor=response_embedding_output,
                attention_mask=response_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # random detection
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            random_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_forward_input_ids, random_forward_input_mask)
            random_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_backward_input_ids, random_backward_input_mask)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            random_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_foward_embedding_output,
                attention_mask=random_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            random_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_backward_embedding_output,
                attention_mask=random_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # swap detection
            swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_forward_input_ids, swap_forward_input_mask)
            swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_backward_input_ids, swap_backward_input_mask)

            swap_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_foward_embedding_output,
                attention_mask=swap_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
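Inside the encoder, the [batch, from_seq, to_seq] mask is not multiplied into the attention scores; in the reference BERT implementation that this modeling module follows, it is converted into an additive bias (0 where attention is allowed, roughly -10000 where it is masked) that is added to the raw scores before the softmax. A small illustrative sketch of that step, not code from this file:

import numpy as np

def masked_softmax(scores, attention_mask):
    """scores: [batch, from_seq, to_seq] raw attention logits; attention_mask: same shape, 0/1."""
    adder = (1.0 - attention_mask) * -10000.0      # large negative bias on masked positions
    scores = scores + adder
    scores -= scores.max(axis=-1, keepdims=True)   # numerical stability
    weights = np.exp(scores)
    return weights / weights.sum(axis=-1, keepdims=True)

rng = np.random.default_rng(3)
s = rng.normal(size=(1, 4, 4))
m = np.tril(np.ones((4, 4)))[None]                 # causal mask from the previous sketch
print(np.round(masked_softmax(s, m)[0], 3))        # upper triangle is ~0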
            swap_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_backward_embedding_output,
                attention_mask=swap_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # # generic detection
            # generic_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
            #     generic_forward_input_ids, generic_forward_input_mask)
            # generic_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
            #     generic_backward_input_ids, generic_backward_input_mask)
            # generic_forward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_foward_embedding_output,
            #     attention_mask=generic_forward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)
            # generic_backward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_backward_embedding_output,
            #     attention_mask=generic_backward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)

            # nli detection
            nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_forward_input_ids, nli_forward_input_mask)
            nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_backward_input_ids, nli_backward_input_mask)

            nli_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_foward_embedding_output,
                attention_mask=nli_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            nli_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_backward_embedding_output,
                attention_mask=nli_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
    # Use the second-to-last transformer layer as the feature representation
    # for each branch.
    random_forward_embedding = random_forward_all_encoder_layers[-2]
    random_backward_embedding = random_backward_all_encoder_layers[-2]
    swap_forward_embedding = swap_forward_all_encoder_layers[-2]
    swap_backward_embedding = swap_backward_all_encoder_layers[-2]
    # generic_forward_embedding = generic_forward_all_encoder_layers[-2]
    # generic_backward_embedding = generic_backward_all_encoder_layers[-2]
    nli_forward_embedding = nli_forward_all_encoder_layers[-2]
    nli_backward_embedding = nli_backward_all_encoder_layers[-2]
    response_embedding = response_all_encoder_layers[-2]

    response_embedding_shape = modeling.get_shape_list(response_embedding,
                                                       expected_rank=3)
    with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE):
        response_logits = tf.layers.dense(response_embedding,
                                          config.hidden_size,
                                          activation=None)
        response_logits = modeling.gelu(response_logits)
        response_logits = modeling.layer_norm(response_logits)
        response_outputs = tf.layers.dense(response_logits,
                                           config.vocab_size,
                                           activation=None,
                                           use_bias=True,
                                           bias_initializer=tf.zeros_initializer())

        response_one_hot = tf.one_hot(response_labels,
                                      depth=config.vocab_size,
                                      dtype=tf.float32)

        lm_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=response_one_hot, logits=response_outputs)

        sequence_mask = tf.sequence_mask(response_text_len,
                                         maxlen=response_embedding_shape[1],
                                         dtype=tf.float32)

        masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask)

        final_lm_loss = tf.reduce_mean(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))
        perplexity = tf.exp(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))
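Two details worth noting here: the features handed to the heads are the second-to-last encoder layer (all_encoder_layers[-2]) rather than the final one, and the LM head computes a per-token cross-entropy, zeroes out positions beyond each response's true length with tf.sequence_mask, and averages over that length; perplexity is the exponential of the per-sequence average and, as written, remains a per-example tensor (there is no reduce_mean around the tf.exp). A NumPy sketch of the loss and perplexity bookkeeping, with toy shapes and illustrative names:

import numpy as np

def lm_loss_and_perplexity(token_logits, labels, text_len):
    """token_logits: [batch, seq, vocab]; labels: [batch, seq] token ids; text_len: [batch] true lengths."""
    batch, seq_len, vocab = token_logits.shape
    # per-token cross-entropy (softmax cross-entropy against one-hot labels)
    logits = token_logits - token_logits.max(axis=-1, keepdims=True)
    log_probs = logits - np.log(np.exp(logits).sum(axis=-1, keepdims=True))
    per_token = -np.take_along_axis(log_probs, labels[..., None], axis=-1)[..., 0]  # [batch, seq]
    # mask padded positions, then average over each sequence's true length
    mask = (np.arange(seq_len)[None, :] < text_len[:, None]).astype(np.float32)
    per_seq = (per_token * mask).sum(axis=1) / text_len.astype(np.float32)
    return per_seq.mean(), np.exp(per_seq)   # scalar loss, per-sequence perplexity

rng = np.random.default_rng(4)
loss, ppl = lm_loss_and_perplexity(rng.normal(size=(2, 5, 11)),
                                   rng.integers(0, 11, size=(2, 5)),
                                   np.array([5, 3]))
print(float(loss), ppl.shape)   # scalar, (2,)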
    random_forward_embedding_shape = modeling.get_shape_list(random_forward_embedding,
                                                             expected_rank=3)
    random_backward_embedding_shape = modeling.get_shape_list(random_backward_embedding,
                                                              expected_rank=3)
    assert random_forward_embedding_shape[2] == random_backward_embedding_shape[2]
    # Switch embeddings and masks to time-major layout before the LSTM.
    random_forward_embedding = tf.transpose(random_forward_embedding, [1, 0, 2])
    random_backward_embedding = tf.transpose(random_backward_embedding, [1, 0, 2])
    random_forward_input_mask = tf.cast(
        tf.transpose(random_forward_input_mask, [1, 0]), tf.float32)
    random_backward_input_mask = tf.cast(
        tf.transpose(random_backward_input_mask, [1, 0]), tf.float32)

    swap_forward_embedding_shape = modeling.get_shape_list(swap_forward_embedding,
                                                           expected_rank=3)
    swap_backward_embedding_shape = modeling.get_shape_list(swap_backward_embedding,
                                                            expected_rank=3)
    assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2]
    swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2])
    swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2])
    swap_forward_input_mask = tf.cast(
        tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32)
    swap_backward_input_mask = tf.cast(
        tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32)

    # generic_forward_embedding_shape = modeling.get_shape_list(generic_forward_embedding, expected_rank=3)
    # generic_backward_embedding_shape = modeling.get_shape_list(generic_backward_embedding, expected_rank=3)
    # assert generic_forward_embedding_shape[2] == generic_backward_embedding_shape[2]
    # generic_forward_embedding = tf.transpose(generic_forward_embedding, [1, 0, 2])
    # generic_backward_embedding = tf.transpose(generic_backward_embedding, [1, 0, 2])
    # generic_forward_input_mask = tf.cast(tf.transpose(generic_forward_input_mask, [1, 0]), tf.float32)
    # generic_backward_input_mask = tf.cast(tf.transpose(generic_backward_input_mask, [1, 0]), tf.float32)

    nli_forward_embedding_shape = modeling.get_shape_list(nli_forward_embedding,
                                                          expected_rank=3)
    nli_backward_embedding_shape = modeling.get_shape_list(nli_backward_embedding,
                                                           expected_rank=3)
    assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2]
    nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2])
    nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2])
    nli_forward_input_mask = tf.cast(
        tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32)
    nli_backward_input_mask = tf.cast(
        tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32)

    model = HadeModel(
        x_random_forward=random_forward_embedding,
        x_random_mask_forward=random_forward_input_mask,
        x_random_length_forward=random_forward_text_len,
        x_random_backward=random_backward_embedding,
        x_random_mask_backward=random_backward_input_mask,
        x_random_length_backward=random_backward_text_len,
        y_random=random_labels,
        x_swap_forward=swap_forward_embedding,
        x_swap_mask_forward=swap_forward_input_mask,
        x_swap_length_forward=swap_forward_text_len,
        x_swap_backward=swap_backward_embedding,
        x_swap_mask_backward=swap_backward_input_mask,
        x_swap_length_backward=swap_backward_text_len,
        y_swap=swap_labels,
        # x_generic_forward=generic_forward_embedding,
        # x_generic_mask_forward=generic_forward_input_mask,
        # x_generic_length_forward=generic_forward_text_len,
        # x_generic_backward=generic_backward_embedding,
        # x_generic_mask_backward=generic_backward_input_mask,
        # x_generic_length_backward=generic_backward_text_len,
        # y_generic=generic_labels,  # disabled: generic_labels is not an argument of this function
        x_nli_forward=nli_forward_embedding,
        x_nli_mask_forward=nli_forward_input_mask,
        x_nli_length_forward=nli_forward_text_len,
        x_nli_backward=nli_backward_embedding,
        x_nli_mask_backward=nli_backward_input_mask,
        x_nli_length_backward=nli_backward_text_len,
        y_nli=nli_labels,
        embedding_dim=random_forward_embedding_shape[2],
        num_nli_labels=num_nli_labels,
        hidden_size=lstm_size,
        l2_reg_lambda=l2_reg_lambda,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        is_training=is_training)

    random_prob, swap_prob, nli_prob, total_cost = model.create_model()
    return random_prob, swap_prob, nli_prob, total_cost, final_lm_loss, perplexity
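Before being handed to HadeModel, every encoder output is transposed from batch-major [batch, seq_len, hidden] to time-major [seq_len, batch, hidden], and the corresponding masks to [seq_len, batch]; presumably the downstream LSTM in HadeModel (whose code is not shown here) consumes time-major inputs, which avoids an extra transpose inside the recurrent loop. A tiny NumPy sketch of the layout change, with toy shapes:

import numpy as np

rng = np.random.default_rng(5)
embedding = rng.normal(size=(2, 7, 16))           # [batch, seq_len, hidden]
input_mask = np.ones((2, 7), dtype=np.int32)       # [batch, seq_len]

time_major_embedding = np.transpose(embedding, (1, 0, 2))               # [seq_len, batch, hidden]
time_major_mask = np.transpose(input_mask, (1, 0)).astype(np.float32)   # [seq_len, batch]

print(time_major_embedding.shape, time_major_mask.shape)  # (7, 2, 16) (7, 2)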