def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Get logits for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        return logits
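# Several snippets above and below call a `gather_indexes` helper without defining
# it. The following is a minimal sketch of that helper as it appears in the
# original BERT pre-training code (TF1.x assumed): it selects the hidden vectors
# at the masked positions and flattens them to
# [batch_size * num_positions, hidden_size].
def gather_indexes(sequence_tensor, positions):
    """Gathers the vectors at the specific positions over a minibatch."""
    sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3)
    batch_size = sequence_shape[0]
    seq_length = sequence_shape[1]
    width = sequence_shape[2]

    # Offset each example's positions so they index into the flattened batch.
    flat_offsets = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    flat_positions = tf.reshape(positions + flat_offsets, [-1])
    flat_sequence_tensor = tf.reshape(sequence_tensor,
                                      [batch_size * seq_length, width])
    output_tensor = tf.gather(flat_sequence_tensor, flat_positions)
    return output_tensor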
def encode_block(bert_config, input_ids, input_masks, segment_ids,
                 use_one_hot_embeddings, num_vec, is_training):
    """Encode text and get multi-vector representations."""
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        model = modeling.BertModel(
            config=bert_config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_masks,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
            scope="bert")
        emb_dim = bert_config.hidden_size

        # [batch_size, num_vec, hidden_size], [batch_size, num_vec]
        output_layer, mask = get_multi_vectors(model, input_masks, num_vec)
        output_layer.set_shape([None, None, emb_dim])

        if FLAGS.projection_size > 0:
            with tf.variable_scope("projected_layer", reuse=tf.AUTO_REUSE):
                output_layer = tf.layers.dense(output_layer, FLAGS.projection_size)
            emb_dim = FLAGS.projection_size
            output_layer.set_shape([None, None, emb_dim])

        if FLAGS.layer_norm:
            output_layer = modeling.layer_norm(output_layer)
        else:
            output_layer = tf.math.l2_normalize(output_layer, axis=-1)
        return output_layer, mask
def __init__(self):
    self.X = tf.placeholder(tf.int32, [None, None])

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=self.X,
        use_one_hot_embeddings=False)

    output_layer = model.get_sequence_output()
    embedding = model.get_embedding_table()

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                output_layer,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        self.logits = tf.nn.bias_add(logits, output_bias)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         top_k_indices, truncation_factor):
    """Get truncated (top-k) log probs and probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        log_probs_student = tf.nn.log_softmax(logits, axis=-1)
        probs_student = tf.nn.softmax(logits, axis=-1)

        prob_shape = tf.shape(log_probs_student)
        # [batch_size * seq_len, truncation_factor]
        new_shape = [prob_shape[0], truncation_factor]

        top_k_indices = tf.reshape(top_k_indices, new_shape)
        top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
        top_k_probs_student = tf.batch_gather(probs_student, top_k_indices)

        return top_k_log_probs_student, top_k_probs_student
def get_masked_lm_output(self):
    self.input_tensor = self.gather_indexes()

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            self.input_tensor = tf.layers.dense(
                self.input_tensor,
                units=self.bert_config.hidden_size,
                activation=modeling.get_activation(self.bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    self.bert_config.initializer_range))
            self.input_tensor = modeling.layer_norm(self.input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[self.bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(self.input_tensor, self.output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        flat_masked_lm_ids = tf.reshape(self.masked_lm_ids, [-1])
        one_hot_labels = tf.one_hot(
            flat_masked_lm_ids, depth=self.bert_config.vocab_size, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        # TODO: dynamic gather from per_example_loss???
        loss = tf.reshape(per_example_loss,
                          [-1, tf.shape(self.masked_lm_positions)[1]])
        return loss
def mlp(net, weights, biases):
    # `expt_flags` and `is_training` are taken from the enclosing scope.
    for i, (w, b) in enumerate(zip(weights, biases)):
        dropout_rate = float(expt_flags.get('mlp_dropout_rate', 0.0))
        if dropout_rate > 0.0 and is_training:
            net = modeling.dropout(net, dropout_rate)
        if eval(expt_flags.get('mlp_layer_norm', 'False')):
            net = modeling.layer_norm(net)
        net = tf.nn.bias_add(tf.matmul(net, w, transpose_b=True), b)
        if i < len(weights) - 1:
            net = modeling.gelu(net)
    return net
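# A hypothetical usage sketch (not from the source): build the weight/bias lists
# for a two-layer classification head and apply the `mlp` helper above. Shapes
# follow the convention in `mlp` (weights are applied with transpose_b=True, so
# each weight is [out_dim, in_dim]); `pooled_output` is assumed to come from a
# BertModel, and `hidden_size`/`num_labels` are illustrative values.
hidden_size = 768
num_labels = 2
weights = [
    tf.get_variable("mlp_w0", [hidden_size, hidden_size],
                    initializer=modeling.create_initializer(0.02)),
    tf.get_variable("mlp_w1", [num_labels, hidden_size],
                    initializer=modeling.create_initializer(0.02)),
]
biases = [
    tf.get_variable("mlp_b0", [hidden_size], initializer=tf.zeros_initializer()),
    tf.get_variable("mlp_b1", [num_labels], initializer=tf.zeros_initializer()),
]
logits = mlp(pooled_output, weights, biases)  # [batch_size, num_labels]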
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_weights, truncated_masked_lm_probs_teacher,
                         top_k_indices, truncation_factor):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs_student = tf.nn.log_softmax(logits, axis=-1)

        label_weights = tf.reshape(label_weights, [-1])

        prob_shape = tf.shape(log_probs_student)
        # [batch_size * seq_len, truncation_factor]
        new_shape = [prob_shape[0], truncation_factor]

        top_k_indices = tf.reshape(top_k_indices, new_shape)
        top_k_log_probs_student = tf.batch_gather(log_probs_student, top_k_indices)
        truncated_masked_lm_probs_teacher = tf.reshape(
            truncated_masked_lm_probs_teacher, new_shape)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(
            truncated_masked_lm_probs_teacher * top_k_log_probs_student, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs_student)
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids, label_weights):
    """Get loss and log probs for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights_flat = tf.reshape(label_weights, [-1])
        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights_flat * per_example_loss)
        denominator = tf.reduce_sum(label_weights_flat) + 1e-5
        loss = numerator / denominator

        # Scale the weighted-mean loss back up by the batch size.
        batch_size = tf.cast(tf.shape(label_weights)[0], tf.float32)
        print(label_weights.shape)
        loss = batch_size * loss

    return (loss, per_example_loss, log_probs)
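# A minimal numeric check (not from the source) of the weighted-mean reduction
# used above: padded prediction slots get label weight 0.0 and therefore do not
# contribute to the loss. Plain NumPy, no TensorFlow needed.
import numpy as np

per_example_loss = np.array([2.3, 1.1, 0.7, 5.0])  # last slot is padding
label_weights = np.array([1.0, 1.0, 1.0, 0.0])

numerator = np.sum(label_weights * per_example_loss)
denominator = np.sum(label_weights) + 1e-5
loss = numerator / denominator   # ~= (2.3 + 1.1 + 0.7) / 3
print(loss)                      # -> ~1.3667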
def forward(x, segment, masks, y, reuse=False, config=bert_config):
    with tf.variable_scope('bert', reuse=reuse):
        model = modeling.BertModel(
            config=config,
            is_training=training,
            input_ids=x,
            input_mask=masks,
            token_type_ids=segment,
            use_one_hot_embeddings=False)
        memory = model.get_sequence_output()

    with tf.variable_scope('bert', reuse=True):
        Y_seq_len = tf.count_nonzero(y, 1, dtype=tf.int32)
        y_masks = tf.sequence_mask(
            Y_seq_len, tf.reduce_max(Y_seq_len), dtype=tf.float32)

        model = modeling_decoder.BertModel(
            config=config,
            is_training=training,
            input_ids=y,
            input_mask=y_masks,
            memory=memory,
            memory_mask=masks,
            use_one_hot_embeddings=False)
        output_layer = model.get_sequence_output()
        embedding = model.get_embedding_table()

    with tf.variable_scope('cls/predictions', reuse=reuse):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                output_layer,
                units=config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding, transpose_b=True)
    return logits
def bert_module_fn(is_training):
    """Spec function for a token embedding module."""
    input_ids = tf.placeholder(shape=[None, None], dtype=tf.int32, name="input_ids")

    bert_config = modeling.BertConfig.from_json_file(config_path)
    model = modeling.BertModel(
        config=bert_config, is_training=is_training, input_ids=input_ids)

    output_layer = model.get_sequence_output()
    embedding = model.get_embedding_table()

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                output_layer,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

    config_file = tf.constant(value=config_path, dtype=tf.string, name="config_file")
    vocab_file = tf.constant(value=vocab_path, dtype=tf.string, name="vocab_file")
    lower_case = tf.constant(do_lower_case)

    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, config_file)
    tf.add_to_collection(tf.GraphKeys.ASSET_FILEPATHS, vocab_file)

    input_map = {"input_ids": input_ids}
    output_map = {"logits": logits}
    output_info_map = {"vocab_file": vocab_file, "do_lower_case": lower_case}

    hub.add_signature(name="tokens", inputs=input_map, outputs=output_map)
    hub.add_signature(name="tokenization_info", inputs={}, outputs=output_info_map)
def compute_logits(self, target_emb, reuse=None):
    """Compute logits for word prediction."""
    with tf.variable_scope(self.scope_prefix + "cls/predictions", reuse=reuse):
        with tf.variable_scope("transform"):
            target_emb = tf.layers.dense(
                target_emb,
                units=self.config.hidden_size,
                activation=modeling.get_activation(self.config.hidden_act),
                kernel_initializer=self.initializer)
            target_emb = modeling.layer_norm(target_emb)
        output_bias = tf.get_variable(
            "output_bias",
            shape=[self.config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(target_emb, self.embedding_table, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
    return logits
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         label_ids):
    """Get loss and log probs for the masked LM."""
    print("input tensor before gather_indexes:", input_tensor)
    input_tensor = gather_indexes(input_tensor, positions)
    print("input tensor after gather_indexes:", input_tensor)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        print(label_ids)
        label_ids = tf.reshape(label_ids, [-1])
        print(label_ids)

        one_hot_labels = tf.one_hot(
            label_ids, depth=bert_config.vocab_size, dtype=tf.float32)
        print(one_hot_labels)
        print(log_probs)

        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        print(per_example_loss)

        loss = tf.reshape(per_example_loss, [-1, tf.shape(positions)[1]])
        print('positions: ', positions)
        print('loss', loss)
        # TODO: dynamic gather from per_example_loss???
        return loss
def __init__(self, config, input_hidden, embedding_table):
    # Keep variable names the same as BERT.
    with tf.variable_scope("cls"):
        with tf.variable_scope("predictions"):
            with tf.variable_scope("transform"):
                self.transformed_output = tf.layers.dense(
                    input_hidden,
                    config.hidden_size,
                    activation=modeling.get_activation(config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        config.initializer_range))
                self.transformed_output = modeling.layer_norm(
                    self.transformed_output)

            output_bias = tf.Variable(
                tf.zeros([config.vocab_size]), name="output_bias")
            self.final_output = tf.add(
                tf.matmul(self.transformed_output, tf.transpose(embedding_table)),
                output_bias)
            self.probs = tf.nn.softmax(self.final_output, name='token_probs')
def get_masked_lm_output(bert_config, input_tensor, output_weights, positions,
                         truncation_factor):
    """Get truncated (top-k) probs and vocabulary indices for the masked LM."""
    input_tensor = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)

        masked_lm_probs = tf.nn.softmax(logits, axis=-1)
        trunc_masked_lm_probs, top_indices = tf.math.top_k(
            masked_lm_probs, k=truncation_factor, sorted=False)

        max_predictions_per_seq = positions.get_shape().as_list()[1]
        truncation_factor_ = top_indices.get_shape().as_list()[1]
        trunc_masked_lm_probs = tf.reshape(
            trunc_masked_lm_probs, [-1, max_predictions_per_seq, truncation_factor_])
        top_indices = tf.reshape(
            top_indices, [-1, max_predictions_per_seq, truncation_factor_])

        return trunc_masked_lm_probs, top_indices
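# Hypothetical wiring (not from the source) showing how the two distillation
# helpers above fit together: the teacher-side variant returns truncated top-k
# probabilities and their vocabulary indices, and the student-side variant
# consumes them to compute a truncated cross-entropy against the teacher
# distribution. `get_teacher_topk_output` / `get_student_distill_loss` are
# stand-in names for the two `get_masked_lm_output` variants above (they would
# need distinct names in one module); the `teacher_*` / `student_*` tensors are
# assumed placeholders for the respective model outputs.
trunc_teacher_probs, top_k_indices = get_teacher_topk_output(
    teacher_config, teacher_sequence_output, teacher_embedding_table,
    masked_lm_positions, truncation_factor)

loss, per_example_loss, student_log_probs = get_student_distill_loss(
    student_config, student_sequence_output, student_embedding_table,
    masked_lm_positions, masked_lm_weights,
    tf.stop_gradient(trunc_teacher_probs), top_k_indices, truncation_factor)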
def __init__(self):
    BERT_CONFIG = "PATH_TO/multi_cased_L-12_H-768_A-12/bert_config.json"
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

    self.X = tf.placeholder(tf.int32, [None, None])

    model = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=self.X,
        use_one_hot_embeddings=False)

    output_layer = model.get_sequence_output()
    embedding = model.get_embedding_table()
    output_layer = tf.reshape(output_layer, [-1, bert_config.hidden_size])

    with tf.variable_scope("cls/predictions"):
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                output_layer,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        self.logits = tf.nn.bias_add(logits, output_bias)
def __init__(self, bert_config, input_ids, input_mask, token_type_ids, Y,
             is_training=True):
    self.X = input_ids
    self.segment_ids = token_type_ids
    self.input_masks = input_mask
    self.Y = Y
    self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
    self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
    batch_size = tf.shape(self.X)[0]

    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=self.X,
        input_mask=self.input_masks,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=False)
    print(bert_config.__dict__)

    BASE_PARAMS = defaultdict(
        lambda: None,
        default_batch_size=2048,
        default_batch_size_tpu=32768,
        max_length=bert_config.max_position_embeddings,
        initializer_gain=1.0,
        vocab_size=bert_config.vocab_size,
        hidden_size=bert_config.hidden_size,
        num_hidden_layers=bert_config.num_hidden_layers,
        num_heads=bert_config.num_attention_heads,
        filter_size=bert_config.intermediate_size,
        layer_postprocess_dropout=0.1,
        attention_dropout=0.1,
        relu_dropout=0.1,
        label_smoothing=0.1,
        learning_rate=1.0,
        learning_rate_decay_rate=1.0,
        learning_rate_warmup_steps=16000,
        optimizer_adam_beta1=0.9,
        optimizer_adam_beta2=0.997,
        optimizer_adam_epsilon=1e-09,
        extra_decode_length=50,
        beam_size=4,
        alpha=0.6,
        use_tpu=False,
        static_batch=False,
        allow_ffn_pad=True)

    self.decoder_stack = DecoderStack(BASE_PARAMS, is_training)
    attention_bias = model_utils.get_padding_bias(self.X)

    output_layer = model.get_sequence_output()
    pooled_output = model.get_pooled_output()
    embedding = model.get_embedding_table()

    with tf.name_scope('decode'):
        mask = tf.to_float(tf.not_equal(self.Y, 0))
        decoder_inputs = tf.gather(embedding, self.Y)
        decoder_inputs *= tf.expand_dims(mask, -1)
        with tf.name_scope('shift_targets'):
            decoder_inputs = tf.pad(
                decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
        with tf.name_scope('add_pos_encoding'):
            length = tf.shape(decoder_inputs)[1]
            decoder_inputs += model_utils.get_position_encoding(
                length, BASE_PARAMS['hidden_size'])
        if is_training:
            decoder_inputs = tf.nn.dropout(
                decoder_inputs, 1 - BASE_PARAMS['layer_postprocess_dropout'])

        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs = self.decoder_stack(
            decoder_inputs,
            output_layer,
            decoder_self_attention_bias,
            attention_bias)

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                outputs,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        self.training_logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        print(self.training_logits)
def __init__(self, bert_config, tokenizer, cls, sep):
    _graph = tf.Graph()
    with _graph.as_default():
        self.X = tf.placeholder(tf.int32, [None, None])
        self.top_p = tf.placeholder(tf.float32, None)
        self.top_k = tf.placeholder(tf.int32, None)
        self.k = tf.placeholder(tf.int32, None)
        self.temperature = tf.placeholder(tf.float32, None)
        self.indices = tf.placeholder(tf.int32, [None, None])
        self._tokenizer = tokenizer
        self._cls = cls
        self._sep = sep

        self.model = modeling.BertModel(
            config=bert_config,
            is_training=False,
            input_ids=self.X,
            use_one_hot_embeddings=False)
        self.logits = self.model.get_pooled_output()
        output_layer = self.model.get_sequence_output()
        embedding = self.model.get_embedding_table()

        with tf.variable_scope('cls/predictions'):
            with tf.variable_scope('transform'):
                input_tensor = tf.layers.dense(
                    output_layer,
                    units=bert_config.hidden_size,
                    activation=modeling.get_activation(bert_config.hidden_act),
                    kernel_initializer=modeling.create_initializer(
                        bert_config.initializer_range))
                input_tensor = modeling.layer_norm(input_tensor)

            output_bias = tf.get_variable(
                'output_bias',
                shape=[bert_config.vocab_size],
                initializer=tf.zeros_initializer())
            logits = tf.matmul(input_tensor, embedding, transpose_b=True)
            self._logits = tf.nn.bias_add(logits, output_bias)
            self._log_softmax = tf.nn.log_softmax(self._logits)

        logits = tf.gather_nd(self._logits, self.indices)
        logits = logits / self.temperature

        def nucleus():
            return top_p_logits(logits, self.top_p)

        def select_k():
            return top_k_logits(logits, self.top_k)

        logits = tf.cond(self.top_p > 0, nucleus, select_k)
        self.samples = tf.multinomial(
            logits, num_samples=self.k, output_dtype=tf.int32)

        self._sess = tf.InteractiveSession()
        self._sess.run(tf.global_variables_initializer())

        var_lists = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='bert')
        cls = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='cls')
        self._saver = tf.train.Saver(var_list=var_lists + cls)
        attns = _extract_attention_weights(
            bert_config.num_hidden_layers, tf.get_default_graph())
        self.attns = attns
def create_model(bert_config, is_training, fewshot_num_examples_per_class,
                 input_ids, input_mask, segment_ids, use_one_hot_embeddings,
                 tokenizer=None, class_examples_combiner="max"):
    """Creates a classification model."""
    if not is_training:
        bert_config.hidden_dropout_prob = 0.0
        bert_config.attention_probs_dropout_prob = 0.0

    # Unroll fewshot batches to extract BERT representations.
    fewshot_size = input_ids.shape[1].value
    sequence_length = input_ids.shape[2].value
    bert_input_ids = tf.reshape(input_ids, [-1, sequence_length])
    bert_input_mask = tf.reshape(input_mask, [-1, sequence_length])
    bert_segment_ids = tf.reshape(segment_ids, [-1, sequence_length])
    tf.logging.info("shapes %s %s %s" % (bert_input_ids.shape,
                                         bert_input_mask.shape,
                                         bert_segment_ids.shape))

    model = modeling.BertModel(
        config=bert_config,
        is_training=FLAGS.train_bert_model if is_training else False,
        input_ids=bert_input_ids,
        input_mask=bert_input_mask,
        token_type_ids=bert_segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    # [batch_size, fewshot_size * seq_len, hidden_size]
    output_layer = model.get_sequence_output()
    tf.logging.info("BERT model output shape %s", output_layer.shape)

    # The "pooler" converts the encoded sequence tensor of shape
    # [batch_size, seq_length, hidden_size] to a tensor of shape
    # [batch_size, 2*hidden_size].
    with tf.variable_scope("cls/entity_relation"):
        # [batch_size, fewshot_size, 2 * hidden_size]
        output_layer = extract_relation_representations(
            output_layer, bert_input_ids, tokenizer)
        output_layer = modeling.layer_norm(output_layer)

    def _combine_multi_example_logits(logits):
        """Combine per-example logits into a per-class logit."""
        logits = tf.reshape(
            logits, [-1, fewshot_num_classes, fewshot_num_examples_per_class, 1])
        if class_examples_combiner == "max":
            logits = tf.reduce_max(logits, axis=2)
        if class_examples_combiner == "mean":
            logits = tf.reduce_mean(logits, axis=2)
        if class_examples_combiner == "logsumexp":
            logits = tf.reduce_logsumexp(logits, axis=2)
        if class_examples_combiner == "min":
            logits = tf.reduce_min(logits, axis=2)
        if class_examples_combiner == "sigmoid_mean":
            logits = tf.sigmoid(logits)
            logits = tf.reduce_mean(logits, axis=2)
        return logits

    fewshot_num_classes = int(
        (fewshot_size - 1) / fewshot_num_examples_per_class)
    hidden_size = output_layer.shape[-1].value

    with tf.variable_scope("loss"):
        # [batch_size, fewshot_size, hidden_size]
        output_weights = tf.reshape(output_layer, [-1, fewshot_size, hidden_size])

        # Extract query representation from output.
        # [batch_size, 1, hidden_size]
        output_layer = tf.reshape(output_weights[:, 0, :], [-1, 1, hidden_size])

        # Remove query from targets.
        # [batch_size, fewshot_size - 1, hidden_size]
        output_weights = output_weights[:, 1:, :]

        # Dot product based distance metric.
        # [batch_size, fewshot_size - 1, 1]
        logits = tf.matmul(output_layer, output_weights, transpose_b=True)

        if fewshot_num_examples_per_class > 1:
            # [batch_size, fewshot_num_classes, 1]
            logits = _combine_multi_example_logits(logits)

        # [batch_size, fewshot_num_classes]
        logits = tf.reshape(logits, [-1, fewshot_num_classes])

    return logits
def build_attn_layers(self,
                      input_tensor,
                      attn_mask_concat,
                      intermediate_size=2048,
                      intermediate_act_fn=modeling.gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      initializer_range=0.02,
                      do_return_all_layers=False):
    """See `attention_layer` defined in `bert/modeling.py`."""
    if not self.is_training:
        hidden_dropout_prob = 0.0
        attention_probs_dropout_prob = 0.0

    # Input tensor shape: [batch, arg_length, BERT_hidden_size];
    # for example, using default hparam values: [64, 128, 768].
    attention_head_size = int(self.hidden_size / self.num_attention_heads)
    input_shape = modeling.get_shape_list(input_tensor, expected_rank=3)
    prev_output = input_tensor

    attention_type_split = self.attention_type.split("_")

    all_layer_outputs = []
    for layer_idx in range(self.num_hidden_layers):
        with tf.variable_scope(f"layer_{layer_idx}"):
            layer_input = prev_output

            if len(attention_type_split) == 3:
                indexer = layer_idx % 2
            else:  # len(attention_type_split) == 2
                indexer = 0
            layer_attn_type = attention_type_split[indexer]

            tf.logging.info(
                f"{layer_attn_type.capitalize()} Attention at {layer_idx}th Layer")

            attention_heads = []
            with tf.variable_scope(f"{layer_attn_type}_attn"):
                attention_head = self.build_attn_layer(
                    input_tensor=input_tensor,
                    attn_mask_concat=attn_mask_concat,
                    layer_attn_type=layer_attn_type,
                    num_attention_heads=self.num_attention_heads,
                    size_per_head=attention_head_size,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    initializer_range=initializer_range,
                    do_return_2d_tensor=False)
                attention_heads.append(attention_head)

            attention_output = None
            if len(attention_heads) == 1:
                attention_output = attention_heads[0]
            else:
                # In the case where we have other sequences, we just concatenate
                # them to the self-attention head before the projection.
                attention_output = tf.concat(attention_heads, axis=-1)

            # Run a linear projection of `hidden_size` then add a residual
            # with `layer_input`.
            with tf.variable_scope("output"):
                attention_output = tf.layers.dense(
                    attention_output,
                    self.hidden_size,
                    kernel_initializer=modeling.create_initializer(
                        initializer_range))
                attention_output = modeling.dropout(attention_output,
                                                    hidden_dropout_prob)
                attention_output = modeling.layer_norm(attention_output +
                                                       layer_input)

            # The activation is only applied to the "intermediate" hidden layer.
            with tf.variable_scope("intermediate"):
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=modeling.create_initializer(
                        initializer_range))

            # Down-project back to `hidden_size` then add the residual.
            with tf.variable_scope("output"):
                layer_output = tf.layers.dense(
                    intermediate_output,
                    self.hidden_size,
                    kernel_initializer=modeling.create_initializer(
                        initializer_range))
                layer_output = modeling.dropout(layer_output, hidden_dropout_prob)
                layer_output = modeling.layer_norm(layer_output + attention_output)

            prev_output = layer_output
            all_layer_outputs.append(layer_output)

    if do_return_all_layers:
        final_outputs = []
        for layer_output in all_layer_outputs:
            final_output = modeling.reshape_from_matrix(layer_output, input_shape)
            final_outputs.append(final_output)
        return final_outputs
    else:
        final_output = modeling.reshape_from_matrix(prev_output, input_shape)
        return final_output
def __call__(self, features, hidden_feature, mode, problem_name):
    """Get loss and log probs for the masked LM.

    DO NOT CHANGE THE VARIABLE SCOPE.
    """
    seq_hidden_feature = hidden_feature['seq']
    positions = features['masked_lm_positions']
    input_tensor = gather_indexes(seq_hidden_feature, positions)
    output_weights = hidden_feature['embed_table']
    label_ids = features['masked_lm_ids']
    label_weights = features['masked_lm_weights']

    with tf.variable_scope("cls/predictions"):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=self.params.mask_lm_hidden_size,
                activation=modeling.get_activation(self.params.mask_lm_hidden_act),
                kernel_initializer=modeling.create_initializer(
                    self.params.mask_lm_initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[self.params.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        self.logits = logits
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        if mode == tf.estimator.ModeKeys.PREDICT:
            self.prob = log_probs
            return self.prob

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.reshape(label_weights, [-1])

        one_hot_labels = tf.one_hot(
            label_ids, depth=self.params.vocab_size, dtype=tf.float32)

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

        if mode == tf.estimator.ModeKeys.TRAIN:
            self.loss = loss
            return self.loss

        def metric_fn(masked_lm_example_loss, masked_lm_log_probs,
                      masked_lm_ids, masked_lm_weights):
            """Computes the loss and accuracy of the model."""
            masked_lm_log_probs = tf.reshape(
                masked_lm_log_probs, [-1, masked_lm_log_probs.shape[-1]])
            masked_lm_predictions = tf.argmax(
                masked_lm_log_probs, axis=-1, output_type=tf.int32)
            masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])
            masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
            masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
            masked_lm_accuracy = tf.metrics.accuracy(
                labels=masked_lm_ids,
                predictions=masked_lm_predictions,
                weights=masked_lm_weights)
            masked_lm_mean_loss = tf.metrics.mean(
                values=masked_lm_example_loss, weights=masked_lm_weights)
            return {
                "masked_lm_accuracy": masked_lm_accuracy,
                "masked_lm_loss": masked_lm_mean_loss,
            }

        eval_metrics = (metric_fn(per_example_loss, log_probs, label_ids,
                                  label_weights), loss)
        self.eval_metrics = eval_metrics
        return self.eval_metrics
def compute_transformer(input_tensor, attention_mask, hidden_size,
                        num_hidden_layers, num_attention_heads,
                        intermediate_size, intermediate_act_fn,
                        hidden_dropout_prob, attention_probs_dropout_prob,
                        initializer_range, input_cache):
    """Multi-headed, multi-layer Transformer."""
    attention_mask = tf.cast(attention_mask, tf.float32)
    attention_head_size = int(hidden_size / num_attention_heads)
    prev_output = input_tensor

    if input_cache is not None:
        input_cache = TransformerCache(
            keys=tf.unstack(input_cache.keys, axis=2),
            values=tf.unstack(input_cache.values, axis=2))

    output_cache = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope("layer_%d" % layer_idx):
            layer_input = prev_output
            with tf.variable_scope("attention"):
                with tf.variable_scope("self"):
                    if input_cache is not None:
                        layer_input_cache = TransformerCache(
                            keys=input_cache.keys[layer_idx],
                            values=input_cache.values[layer_idx])
                    else:
                        layer_input_cache = None
                    attention_output, layer_output_cache = attention_layer(
                        from_tensor=layer_input,
                        to_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        attention_probs_dropout_prob=attention_probs_dropout_prob,
                        initializer_range=initializer_range,
                        input_cache=layer_input_cache)
                    output_cache.append(layer_output_cache)
                with tf.variable_scope("output"):
                    attention_output = dense_layer_3d_proj(
                        attention_output, hidden_size, num_attention_heads,
                        attention_head_size,
                        modeling.create_initializer(initializer_range),
                        None, "dense")
                    attention_output = modeling.dropout(attention_output,
                                                        hidden_dropout_prob)
                    attention_output = modeling.layer_norm(attention_output +
                                                           layer_input)
            with tf.variable_scope("intermediate"):
                intermediate_output = dense_layer_2d(
                    attention_output, intermediate_size,
                    modeling.create_initializer(initializer_range),
                    intermediate_act_fn, "dense")
            with tf.variable_scope("output"):
                layer_output = dense_layer_2d(
                    intermediate_output, hidden_size,
                    modeling.create_initializer(initializer_range), None, "dense")
                layer_output = modeling.dropout(layer_output, hidden_dropout_prob)
                layer_output = modeling.layer_norm(layer_output + attention_output)
                prev_output = layer_output

    # [batch_size, seq_len, num_layers, num_heads, head_size]
    output_cache = TransformerCache(
        keys=tf.stack([c.keys for c in output_cache], 2),
        values=tf.stack([c.values for c in output_cache], 2))

    return prev_output, output_cache
def __init__(self, input_ids, input_mask, token_type_ids, Y,
             learning_rate=2e-5, is_training=True):
    self.X = input_ids
    self.segment_ids = token_type_ids
    self.input_masks = input_mask
    self.Y = Y
    self.X_seq_len = tf.count_nonzero(self.X, 1, dtype=tf.int32)
    self.Y_seq_len = tf.count_nonzero(self.Y, 1, dtype=tf.int32)
    batch_size = tf.shape(self.X)[0]

    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=self.X,
        input_mask=self.input_masks,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=False)

    self.decoder_stack = DecoderStack(BASE_PARAMS, is_training)
    attention_bias = model_utils.get_padding_bias(self.X)

    output_layer = model.get_sequence_output()
    pooled_output = model.get_pooled_output()
    embedding = model.get_embedding_table()

    with tf.name_scope('decode'):
        mask = tf.to_float(tf.not_equal(self.Y, 0))
        decoder_inputs = tf.gather(embedding, self.Y)
        decoder_inputs *= tf.expand_dims(mask, -1)
        with tf.name_scope('shift_targets'):
            decoder_inputs = tf.pad(
                decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
        with tf.name_scope('add_pos_encoding'):
            length = tf.shape(decoder_inputs)[1]
            decoder_inputs += model_utils.get_position_encoding(
                length, BASE_PARAMS['hidden_size'])
        if is_training:
            decoder_inputs = tf.nn.dropout(
                decoder_inputs, 1 - BASE_PARAMS['layer_postprocess_dropout'])

        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs = self.decoder_stack(
            decoder_inputs,
            output_layer,
            decoder_self_attention_bias,
            attention_bias)

    with tf.variable_scope('cls/predictions'):
        with tf.variable_scope('transform'):
            input_tensor = tf.layers.dense(
                outputs,
                units=bert_config.hidden_size,
                activation=modeling.get_activation(bert_config.hidden_act),
                kernel_initializer=modeling.create_initializer(
                    bert_config.initializer_range))
            input_tensor = modeling.layer_norm(input_tensor)

        output_bias = tf.get_variable(
            'output_bias',
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        self.training_logits = tf.matmul(input_tensor, embedding, transpose_b=True)
        print(self.training_logits)
def create_bilstm_classification_model(
        bert_config, is_training,
        response_input_ids, response_input_mask, response_segment_ids,
        response_text_len, response_labels,
        random_forward_input_ids, random_forward_input_mask,
        random_forward_segment_ids, random_forward_text_len,
        random_backward_input_ids, random_backward_input_mask,
        random_backward_segment_ids, random_backward_text_len, random_labels,
        swap_forward_input_ids, swap_forward_input_mask,
        swap_forward_segment_ids, swap_forward_text_len,
        swap_backward_input_ids, swap_backward_input_mask,
        swap_backward_segment_ids, swap_backward_text_len, swap_labels,
        nli_forward_input_ids, nli_forward_input_mask,
        nli_forward_segment_ids, nli_forward_text_len,
        nli_backward_input_ids, nli_backward_input_mask,
        nli_backward_segment_ids, nli_backward_text_len, nli_labels,
        num_nli_labels, use_one_hot_embeddings,
        l2_reg_lambda=0.1, dropout_rate=1.0, lstm_size=None, num_layers=1):
    config = copy.deepcopy(bert_config)
    if not is_training:
        config.hidden_dropout_prob = 0.0
        config.attention_probs_dropout_prob = 0.0

    with tf.variable_scope("bert", reuse=tf.AUTO_REUSE):
        with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
            (response_embedding_output,
             response_embedding_table) = modeling.embedding_lookup(
                 input_ids=response_input_ids, vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            response_embedding_output = modeling.embedding_postprocessor(
                input_tensor=response_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=response_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # Random detection.
            # Perform embedding lookup on the word ids.
            (random_foward_embedding_output,
             random_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_forward_input_ids, vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            # Perform embedding lookup on the word ids.
            (random_backward_embedding_output,
             random_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=random_backward_input_ids, vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            random_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            random_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=random_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=random_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # Swap detection.
            (swap_foward_embedding_output,
             swap_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_forward_input_ids, vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            (swap_backward_embedding_output,
             swap_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=swap_backward_input_ids, vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            swap_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            swap_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=swap_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=swap_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

            # # Generic detection (disabled).
            # (generic_foward_embedding_output,
            #  generic_forward_embedding_table) = modeling.embedding_lookup(
            #      input_ids=generic_forward_input_ids, vocab_size=config.vocab_size,
            #      embedding_size=config.hidden_size,
            #      initializer_range=config.initializer_range,
            #      word_embedding_name="word_embeddings",
            #      use_one_hot_embeddings=use_one_hot_embeddings)
            # (generic_backward_embedding_output,
            #  generic_backward_embedding_table) = modeling.embedding_lookup(
            #      input_ids=generic_backward_input_ids, vocab_size=config.vocab_size,
            #      embedding_size=config.hidden_size,
            #      initializer_range=config.initializer_range,
            #      word_embedding_name="word_embeddings",
            #      use_one_hot_embeddings=use_one_hot_embeddings)
            # generic_foward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_foward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_forward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)
            # generic_backward_embedding_output = modeling.embedding_postprocessor(
            #     input_tensor=generic_backward_embedding_output,
            #     use_token_type=not config.roberta,
            #     token_type_ids=generic_backward_segment_ids,
            #     token_type_vocab_size=config.type_vocab_size,
            #     token_type_embedding_name="token_type_embeddings",
            #     use_position_embeddings=True,
            #     position_embedding_name="position_embeddings",
            #     initializer_range=config.initializer_range,
            #     max_position_embeddings=config.max_position_embeddings,
            #     dropout_prob=config.hidden_dropout_prob,
            #     roberta=config.roberta)

            # NLI detection.
            (nli_foward_embedding_output,
             nli_forward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_forward_input_ids, vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            (nli_backward_embedding_output,
             nli_backward_embedding_table) = modeling.embedding_lookup(
                 input_ids=nli_backward_input_ids, vocab_size=config.vocab_size,
                 embedding_size=config.hidden_size,
                 initializer_range=config.initializer_range,
                 word_embedding_name="word_embeddings",
                 use_one_hot_embeddings=use_one_hot_embeddings)
            nli_foward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_foward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_forward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)
            nli_backward_embedding_output = modeling.embedding_postprocessor(
                input_tensor=nli_backward_embedding_output,
                use_token_type=not config.roberta,
                token_type_ids=nli_backward_segment_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob,
                roberta=config.roberta)

        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            # [batch_size, from_seq_length, to_seq_length]
            response_attention_mask = modeling.create_attention_mask_from_input_mask(
                response_input_ids, response_input_mask)
            # Mask future tokens.
            diag_vals = tf.ones_like(response_attention_mask[0, :, :])
            tril = tf.linalg.LinearOperatorLowerTriangular(diag_vals).to_dense()
            future_masks = tf.tile(tf.expand_dims(tril, 0),
                                   [tf.shape(response_attention_mask)[0], 1, 1])
            response_attention_mask = tf.math.multiply(response_attention_mask,
                                                       future_masks)

            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            response_all_encoder_layers = modeling.transformer_model(
                input_tensor=response_embedding_output,
                attention_mask=response_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # Random detection.
            # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
            # mask of shape [batch_size, seq_length, seq_length] which is used
            # for the attention scores.
            random_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_forward_input_ids, random_forward_input_mask)
            random_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                random_backward_input_ids, random_backward_input_mask)
            # Run the stacked transformer.
            # `sequence_output` shape = [batch_size, seq_length, hidden_size].
            random_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_foward_embedding_output,
                attention_mask=random_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            random_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=random_backward_embedding_output,
                attention_mask=random_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # Swap detection.
            swap_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_forward_input_ids, swap_forward_input_mask)
            swap_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                swap_backward_input_ids, swap_backward_input_mask)
            swap_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_foward_embedding_output,
                attention_mask=swap_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            swap_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=swap_backward_embedding_output,
                attention_mask=swap_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

            # # Generic detection (disabled).
            # generic_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
            #     generic_forward_input_ids, generic_forward_input_mask)
            # generic_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
            #     generic_backward_input_ids, generic_backward_input_mask)
            # generic_forward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_foward_embedding_output,
            #     attention_mask=generic_forward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)
            # generic_backward_all_encoder_layers = modeling.transformer_model(
            #     input_tensor=generic_backward_embedding_output,
            #     attention_mask=generic_backward_attention_mask,
            #     hidden_size=config.hidden_size,
            #     num_hidden_layers=config.num_hidden_layers,
            #     num_attention_heads=config.num_attention_heads,
            #     intermediate_size=config.intermediate_size,
            #     intermediate_act_fn=modeling.get_activation(config.hidden_act),
            #     hidden_dropout_prob=config.hidden_dropout_prob,
            #     attention_probs_dropout_prob=config.attention_probs_dropout_prob,
            #     initializer_range=config.initializer_range,
            #     do_return_all_layers=True)

            # NLI detection.
            nli_forward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_forward_input_ids, nli_forward_input_mask)
            nli_backward_attention_mask = modeling.create_attention_mask_from_input_mask(
                nli_backward_input_ids, nli_backward_input_mask)
            nli_forward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_foward_embedding_output,
                attention_mask=nli_forward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)
            nli_backward_all_encoder_layers = modeling.transformer_model(
                input_tensor=nli_backward_embedding_output,
                attention_mask=nli_backward_attention_mask,
                hidden_size=config.hidden_size,
                num_hidden_layers=config.num_hidden_layers,
                num_attention_heads=config.num_attention_heads,
                intermediate_size=config.intermediate_size,
                intermediate_act_fn=modeling.get_activation(config.hidden_act),
                hidden_dropout_prob=config.hidden_dropout_prob,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_all_layers=True)

    # Take the second-to-last encoder layer as the contextual embedding.
    random_forward_embedding = random_forward_all_encoder_layers[-2]
    random_backward_embedding = random_backward_all_encoder_layers[-2]
    swap_forward_embedding = swap_forward_all_encoder_layers[-2]
    swap_backward_embedding = swap_backward_all_encoder_layers[-2]
    # generic_forward_embedding = generic_forward_all_encoder_layers[-2]
    # generic_backward_embedding = generic_backward_all_encoder_layers[-2]
    nli_forward_embedding = nli_forward_all_encoder_layers[-2]
    nli_backward_embedding = nli_backward_all_encoder_layers[-2]
    response_embedding = response_all_encoder_layers[-2]

    response_embedding_shape = modeling.get_shape_list(response_embedding,
                                                       expected_rank=3)
    with tf.variable_scope("lm_head", reuse=tf.AUTO_REUSE):
        response_logits = tf.layers.dense(response_embedding, config.hidden_size,
                                          activation=None)
        response_logits = modeling.gelu(response_logits)
        response_logits = modeling.layer_norm(response_logits)
        response_outputs = tf.layers.dense(
            response_logits, config.vocab_size, activation=None, use_bias=True,
            bias_initializer=tf.zeros_initializer())

        response_one_hot = tf.one_hot(response_labels, depth=config.vocab_size,
                                      dtype=tf.float32)
        lm_cost = tf.nn.softmax_cross_entropy_with_logits(
            labels=response_one_hot, logits=response_outputs)
        sequence_mask = tf.sequence_mask(response_text_len,
                                         maxlen=response_embedding_shape[1],
                                         dtype=tf.float32)
        masked_lm_cost = tf.math.multiply(lm_cost, sequence_mask)
        final_lm_loss = tf.reduce_mean(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))
        perplexity = tf.exp(
            tf.math.divide(tf.reduce_sum(masked_lm_cost, axis=1),
                           tf.cast(response_text_len, dtype=tf.float32)))

    random_forward_embedding_shape = modeling.get_shape_list(
        random_forward_embedding, expected_rank=3)
    random_backward_embedding_shape = modeling.get_shape_list(
        random_backward_embedding, expected_rank=3)
    assert random_forward_embedding_shape[2] == random_backward_embedding_shape[2]
    random_forward_embedding = tf.transpose(random_forward_embedding, [1, 0, 2])
    random_backward_embedding = tf.transpose(random_backward_embedding, [1, 0, 2])
    random_forward_input_mask = tf.cast(
        tf.transpose(random_forward_input_mask, [1, 0]), tf.float32)
    random_backward_input_mask = tf.cast(
        tf.transpose(random_backward_input_mask, [1, 0]), tf.float32)

    swap_forward_embedding_shape = modeling.get_shape_list(
        swap_forward_embedding, expected_rank=3)
    swap_backward_embedding_shape = modeling.get_shape_list(
        swap_backward_embedding, expected_rank=3)
    assert swap_forward_embedding_shape[2] == swap_backward_embedding_shape[2]
    swap_forward_embedding = tf.transpose(swap_forward_embedding, [1, 0, 2])
    swap_backward_embedding = tf.transpose(swap_backward_embedding, [1, 0, 2])
    swap_forward_input_mask = tf.cast(
        tf.transpose(swap_forward_input_mask, [1, 0]), tf.float32)
    swap_backward_input_mask = tf.cast(
        tf.transpose(swap_backward_input_mask, [1, 0]), tf.float32)

    # generic_forward_embedding_shape = modeling.get_shape_list(
    #     generic_forward_embedding, expected_rank=3)
    # generic_backward_embedding_shape = modeling.get_shape_list(
    #     generic_backward_embedding, expected_rank=3)
    # assert generic_forward_embedding_shape[2] == generic_backward_embedding_shape[2]
    # generic_forward_embedding = tf.transpose(generic_forward_embedding, [1, 0, 2])
    # generic_backward_embedding = tf.transpose(generic_backward_embedding, [1, 0, 2])
    # generic_forward_input_mask = tf.cast(
    #     tf.transpose(generic_forward_input_mask, [1, 0]), tf.float32)
    # generic_backward_input_mask = tf.cast(
    #     tf.transpose(generic_backward_input_mask, [1, 0]), tf.float32)

    nli_forward_embedding_shape = modeling.get_shape_list(
        nli_forward_embedding, expected_rank=3)
    nli_backward_embedding_shape = modeling.get_shape_list(
        nli_backward_embedding, expected_rank=3)
    assert nli_forward_embedding_shape[2] == nli_backward_embedding_shape[2]
    nli_forward_embedding = tf.transpose(nli_forward_embedding, [1, 0, 2])
    nli_backward_embedding = tf.transpose(nli_backward_embedding, [1, 0, 2])
    nli_forward_input_mask = tf.cast(
        tf.transpose(nli_forward_input_mask, [1, 0]), tf.float32)
    nli_backward_input_mask = tf.cast(
        tf.transpose(nli_backward_input_mask, [1, 0]), tf.float32)

    model = HadeModel(
        x_random_forward=random_forward_embedding,
        x_random_mask_forward=random_forward_input_mask,
        x_random_length_forward=random_forward_text_len,
        x_random_backward=random_backward_embedding,
        x_random_mask_backward=random_backward_input_mask,
        x_random_length_backward=random_backward_text_len,
        y_random=random_labels,
        x_swap_forward=swap_forward_embedding,
        x_swap_mask_forward=swap_forward_input_mask,
        x_swap_length_forward=swap_forward_text_len,
        x_swap_backward=swap_backward_embedding,
        x_swap_mask_backward=swap_backward_input_mask,
        x_swap_length_backward=swap_backward_text_len,
        y_swap=swap_labels,
        # x_generic_forward=generic_forward_embedding,
        # x_generic_mask_forward=generic_forward_input_mask,
        # x_generic_length_forward=generic_forward_text_len,
        # x_generic_backward=generic_backward_embedding,
        # x_generic_mask_backward=generic_backward_input_mask,
        # x_generic_length_backward=generic_backward_text_len,
        # NOTE: `generic_labels` is not among this function's parameters; it is
        # assumed to come from the (disabled) generic-detection path elsewhere.
        y_generic=generic_labels,
        x_nli_forward=nli_forward_embedding,
        x_nli_mask_forward=nli_forward_input_mask,
        x_nli_length_forward=nli_forward_text_len,
        x_nli_backward=nli_backward_embedding,
        x_nli_mask_backward=nli_backward_input_mask,
        x_nli_length_backward=nli_backward_text_len,
        y_nli=nli_labels,
        embedding_dim=random_forward_embedding_shape[2],
        num_nli_labels=num_nli_labels,
        hidden_size=lstm_size,
        l2_reg_lambda=l2_reg_lambda,
        num_layers=num_layers,
        dropout_rate=dropout_rate,
        is_training=is_training)

    random_prob, swap_prob, nli_prob, total_cost = model.create_model()
    return random_prob, swap_prob, nli_prob, total_cost, final_lm_loss, perplexity