def get_diff_loss(bert_config, input_tensor, masked_lm_positions,
                  masked_lm_weights, loss_base, loss_target):
    # Regression target: the difference of the two models' per-position
    # probabilities, recovered from their losses via exp(-loss).
    base_prob = tf.exp(-loss_base)
    target_prob = tf.exp(-loss_target)
    prob_diff = base_prob - target_prob

    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)
    with tf.compat.v1.variable_scope("diff_loss"):
        hidden = bc.dense(bert_config.hidden_size,
                          bc.create_initializer(bert_config.initializer_range),
                          bc.get_activation(bert_config.hidden_act))(input_tensor)
        logits = bc.dense(1, bc.create_initializer(
            bert_config.initializer_range))(hidden)
        logits = tf.reshape(logits, prob_diff.shape)
        # L1 regression loss, masked to the positions that were actually predicted.
        per_example_loss = tf.abs(prob_diff - logits)
        per_example_loss = tf.cast(masked_lm_weights, tf.float32) * per_example_loss
        losses = tf.reduce_sum(per_example_loss, axis=1)
        loss = tf.reduce_mean(losses)
    return loss, per_example_loss, logits
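# Hedged illustration (not from the source): how the regression target used by
# get_diff_loss is formed. Per-position losses are mapped to probabilities with
# exp(-loss), and the head above is trained to predict their difference.
# The values below are toy placeholders.
import tensorflow as tf

toy_loss_base = tf.constant([[0.5, 2.0]])    # [batch=1, n_predictions=2]
toy_loss_target = tf.constant([[0.2, 3.0]])
toy_prob_diff = tf.exp(-toy_loss_base) - tf.exp(-toy_loss_target)
# For non-negative losses the target lies roughly in (-1, 1).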
def __init__(self, hidden_size, intermediate_size, hidden_act,
             hidden_dropout_prob, initializer):
    super(ResidualFeedforward, self).__init__()
    self.intermediate_ff = bc.dense(intermediate_size, initializer,
                                    activation=bc.get_activation(hidden_act))
    self.hidden_dropout_prob = hidden_dropout_prob
    self.output_ff = bc.dense(hidden_size, initializer)
def __init__(self, bert_config):
    initializer = bc.create_initializer(bert_config.initializer_range)
    self.layer1 = bc.dense(bert_config.hidden_size, initializer,
                           bc.get_activation(bert_config.hidden_act))
    self.logit_dense1 = bc.dense(2, initializer)
    self.logit_dense2 = bc.dense(2, initializer)
    self.graph_built = False
def __init__(self, config, initializer):
    self.config = config
    self.self_attention = SelfAttentionLayer(config)
    with tf.compat.v1.variable_scope("intermediate"):
        self.intermediate_ff = bc.dense(
            self.config.intermediate_size, initializer,
            activation=bc.get_activation(self.config.hidden_act))
    with tf.compat.v1.variable_scope("output"):
        self.output_ff = bc.dense(config.hidden_size, initializer)
def build_by_attention(self, key):
    hidden_size = self.config.hidden_size
    with tf.compat.v1.variable_scope("embeddings"):
        lexical_tensor = self.get_lexical_lookup()
        self.embedding_output = self.embedding_postprocessor(
            d_input_ids=self.input_ids,
            input_tensor=lexical_tensor,
            use_token_type=True,
            token_type_ids=self.segment_ids,
            token_type_vocab_size=self.config.type_vocab_size,
            token_type_embedding_name="token_type_embeddings",
            use_position_embeddings=True,
            position_embedding_name="position_embeddings",
            initializer_range=self.config.initializer_range,
            max_position_embeddings=self.config.max_position_embeddings,
            dropout_prob=self.config.hidden_dropout_prob)
    input_tensor = self.embedding_output  # [def_per_batch, seq_length, hidden_size]

    with tf.compat.v1.variable_scope("encoder"):
        num_key_tokens = self.ssdr_config.num_key_tokens
        project_dim = hidden_size * num_key_tokens
        raw_key = bc.dense(project_dim, self.initializer)(key)
        key_tokens = tf.reshape(
            raw_key, [self.batch_size, num_key_tokens, hidden_size])
        input_tensor = tf.concat([key_tokens, input_tensor], axis=1)
        input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

        mask_for_key = tf.ones([self.batch_size, num_key_tokens], dtype=tf.int64)
        self.input_mask = tf.cast(self.input_mask, tf.int64)
        self.input_mask = tf.concat([mask_for_key, self.input_mask], axis=1)
        self.seq_length = self.seq_length + num_key_tokens
        self.attention_mask = bc.create_attention_mask_from_input_mask(
            input_tensor, self.input_mask)

        prev_output = bc.reshape_to_matrix(input_tensor)
        for layer_idx in range(self.ssdr_config.num_hidden_layers):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                intermediate_output, prev_output = self.forward_layer(prev_output)
                self.all_layer_outputs.append(prev_output)

        final_output = bc.reshape_from_matrix(prev_output, input_shape)
        self.scores = bc.dense(1, self.initializer)(final_output[:, 0, :])

        if self.ssdr_config.info_pooling_method == "first_tokens":
            self.info_output = final_output[:, :num_key_tokens, :]
        elif self.ssdr_config.info_pooling_method == "max_pooling":
            self.info_output = tf.reduce_max(final_output, axis=1)

        return self.scores, self.info_output
def build_key(self):
    with tf.compat.v1.variable_scope("embeddings"):
        input_tensor = self.get_embeddings(self.input_ids, self.segment_ids)
    self.input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

    with tf.compat.v1.variable_scope("encoder"):
        self.attention_mask = bc.create_attention_mask_from_input_mask(
            input_tensor, self.input_mask)
        prev_output = bc.reshape_to_matrix(input_tensor)
        for layer_idx in range(self.layers_before_key_pooling):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                intermediate_output, prev_output = self.forward_layer(prev_output)
                intermediate_output = tf.reshape(intermediate_output, [
                    self.batch_size * self.seq_length,
                    self.config.intermediate_size
                ])
                final_output = bc.reshape_from_matrix(prev_output, self.input_shape)
                self.all_layer_outputs.append(final_output)

        self.last_intermediate_output = intermediate_output
        self.last_key_layer = prev_output
        with tf.compat.v1.variable_scope("mr_key"):
            key_vectors = bc.dense(self.key_dimension,
                                   self.initializer)(intermediate_output)
            self.debug1 = key_vectors
            key_vectors = tf.reshape(
                key_vectors,
                [self.batch_size, self.seq_length, self.key_dimension])
            key_output = self.key_pooling(key_vectors)
    return key_output
def sequence_index_prediction(bert_config, lookup_idx, input_tensor):
    logits = bert_common.dense(
        2, bert_common.create_initializer(
            bert_config.initializer_range))(input_tensor)
    log_probs = tf.nn.softmax(logits, axis=2)
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                            labels=lookup_idx)
    per_example_loss = tf.reduce_sum(losses, axis=1)
    loss = tf.reduce_mean(per_example_loss)
    return loss, per_example_loss, log_probs
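# Hedged illustration (not from the source): the shapes sequence_index_prediction
# expects. Logits are per-token two-class scores of shape [batch, seq_length, 2]
# and lookup_idx is an integer label tensor of shape [batch, seq_length].
# All values below are toy placeholders.
import tensorflow as tf

toy_logits = tf.zeros([2, 4, 2])                        # [batch, seq_length, 2]
toy_labels = tf.constant([[0, 1, 0, 0], [1, 0, 0, 1]])  # [batch, seq_length]
tok_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=toy_logits, labels=toy_labels)               # [batch, seq_length]
per_example = tf.reduce_sum(tok_losses, axis=1)         # sum over the sequence
mean_loss = tf.reduce_mean(per_example)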
def forward_layer_with_added(self, prev_output, added_value, locations):
    hidden_size = self.config.hidden_size
    layer_input = prev_output
    attention_output = self_attention_with_add(
        layer_input, self.attention_mask, self.config, self.batch_size,
        self.seq_length, hidden_size, self.initializer, added_value, locations)
    with tf.compat.v1.variable_scope("intermediate"):
        intermediate_output = bc.dense(
            self.config.intermediate_size, self.initializer,
            activation=bc.get_activation(self.config.hidden_act))(attention_output)
    with tf.compat.v1.variable_scope("output"):
        layer_output = bc.dense(hidden_size, self.initializer)(intermediate_output)
        layer_output = bc.dropout(layer_output, self.config.hidden_dropout_prob)
        layer_output = bc.layer_norm(layer_output + attention_output)
    prev_output = layer_output
    return intermediate_output, layer_output
def get_regression_and_loss(hidden_vector, loss_label):
    # Note: `bert_config` and `masked_lm_weights` are free variables here; this
    # helper is defined inside get_loss_independently and closes over them.
    logits = bc.dense(2, bc.create_initializer(
        bert_config.initializer_range))(hidden_vector)
    gold_prob = loss_to_prob_pair(loss_label)
    logits = tf.reshape(logits, gold_prob.shape)
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
        gold_prob, logits, axis=-1, name=None)
    per_example_loss = tf.cast(masked_lm_weights, tf.float32) * per_example_loss
    losses = tf.reduce_sum(per_example_loss, axis=1)
    loss = tf.reduce_mean(losses)
    return loss, per_example_loss, logits
def self_attention_with_add(layer_input, attention_mask, config, batch_size,
                            seq_length, hidden_size, initializer, values,
                            add_locations):
    attention_head_size = int(hidden_size / config.num_attention_heads)
    with tf.compat.v1.variable_scope("attention"):
        attention_heads = []
        with tf.compat.v1.variable_scope("self"):
            attention_head = bc.attention_layer(
                from_tensor=layer_input,
                to_tensor=layer_input,
                attention_mask=attention_mask,
                num_attention_heads=config.num_attention_heads,
                size_per_head=attention_head_size,
                attention_probs_dropout_prob=config.attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_2d_tensor=True,
                batch_size=batch_size,
                from_seq_length=seq_length,
                to_seq_length=seq_length)
            attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
            attention_output = attention_heads[0]
        else:
            # In the case where we have other sequences, we just concatenate
            # them to the self-attention head before the projection.
            attention_output = tf.concat(attention_heads, axis=-1)

        # attention_output: [batch*seq_length, hidden_dim]
        # add_locations:    [batch, n_locations]
        attention_output = tf.tensor_scatter_nd_add(attention_output,
                                                    add_locations, values)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.compat.v1.variable_scope("output"):
            attention_output = bc.dense(hidden_size, initializer)(attention_output)
            attention_output = bc.dropout(attention_output,
                                          config.hidden_dropout_prob)
            attention_output = bc.layer_norm(attention_output + layer_input)
    return attention_output
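# Hedged illustration (not from the source): how tf.tensor_scatter_nd_add injects
# `values` into selected rows of the flattened [batch*seq_length, hidden]
# attention output, as done above. Shapes below are toy placeholders.
import tensorflow as tf

flat_attention = tf.zeros([6, 4])         # [batch*seq_length=6, hidden=4]
row_indices = tf.constant([[1], [4]])     # rows to modify, shape [n_updates, 1]
added_values = tf.ones([2, 4])            # one hidden-size vector per selected row
patched = tf.tensor_scatter_nd_add(flat_attention, row_indices, added_values)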
def __init__(self, config, is_training, use_one_hot_embeddings):
    super(HorizontalAlpha, self).__init__()
    if not is_training:
        config.set_attrib("hidden_dropout_prob", 0.0)
        config.set_attrib("attention_probs_dropout_prob", 0.0)

    initializer = bc.create_initializer(config.initializer_range)
    self.embedding_layer = Embedding2()
    self.embedding_projector = bc.dense(config.hidden_size, initializer)
    self.config = config

    num_columns = config.num_columns
    self.column_list = []
    for tower_idx in range(num_columns):
        column = ForwardColumn(config)
        self.column_list.append(column)

    self.num_layers = config.num_hidden_layers
    self.num_columns = config.num_columns
    self.num_column_tokens = config.num_column_tokens
    self.column_embedding_list = []
    self.use_one_hot_embeddings = use_one_hot_embeddings

    column_mask = []
    for column_idx in range(1, self.num_columns):
        column_embedding = tf.Variable(
            lambda: initializer(shape=(self.num_column_tokens,
                                       config.hidden_size),
                                dtype=tf.float32),
            name="column_embedding_{}".format(column_idx))
        self.column_embedding_list.append(column_embedding)
        column_mask += [1] * self.num_column_tokens
    self.column_mask = tf.constant(column_mask)

    self.all_raw_layers = []
    self.all_main_layers = []
    self.sequence_output = None
    self.pooled_output = None
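# Hedged illustration (not from the source): tf.Variable accepts a zero-argument
# callable as its initial value, which is how each column embedding above is
# created from the initializer. The initializer and shape below are assumptions
# for demonstration only.
import tensorflow as tf

demo_init = tf.keras.initializers.TruncatedNormal(stddev=0.02)
demo_column_embedding = tf.Variable(
    lambda: demo_init(shape=(3, 8), dtype=tf.float32),
    name="column_embedding_demo")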
def build(self):
    with tf.compat.v1.variable_scope("dict"):
        with tf.compat.v1.variable_scope("embeddings"):
            input_tensor = self.get_embeddings(self.input_ids, self.segment_ids)

        with tf.compat.v1.variable_scope("encoder"):
            num_key_tokens = self.ssdr_config.num_key_tokens
            input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

            mask_for_key = tf.ones([self.batch_size, num_key_tokens],
                                   dtype=tf.int64)
            self.input_mask = tf.cast(self.input_mask, tf.int64)
            self.input_mask = tf.concat([mask_for_key, self.input_mask], axis=1)
            self.seq_length = self.seq_length + num_key_tokens
            self.attention_mask = bc.create_attention_mask_from_input_mask(
                input_tensor, self.input_mask)

            prev_output = bc.reshape_to_matrix(input_tensor)
            for layer_idx in range(self.ssdr_config.num_hidden_layers):
                with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                    intermediate_output, prev_output = self.forward_layer(
                        prev_output)
                    self.all_layer_outputs.append(prev_output)

            final_output = bc.reshape_from_matrix(prev_output, input_shape)
            self.scores = bc.dense(1, self.initializer)(final_output[:, 0, :])

            if self.ssdr_config.info_pooling_method == "first_tokens":
                self.info_output = final_output[:, :num_key_tokens, :]
            elif self.ssdr_config.info_pooling_method == "max_pooling":
                self.info_output = tf.reduce_max(final_output, axis=1)

            return self.scores, self.info_output
def get_loss_independently(bert_config, input_tensor, masked_lm_positions,
                           masked_lm_weights, loss_base, loss_target):
    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)
    hidden = bc.dense(bert_config.hidden_size,
                      bc.create_initializer(bert_config.initializer_range),
                      bc.get_activation(bert_config.hidden_act))(input_tensor)

    def get_regression_and_loss(hidden_vector, loss_label):
        logits = bc.dense(2, bc.create_initializer(
            bert_config.initializer_range))(hidden_vector)
        gold_prob = loss_to_prob_pair(loss_label)
        logits = tf.reshape(logits, gold_prob.shape)
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
            gold_prob, logits, axis=-1, name=None)
        per_example_loss = tf.cast(masked_lm_weights, tf.float32) * per_example_loss
        losses = tf.reduce_sum(per_example_loss, axis=1)
        loss = tf.reduce_mean(losses)
        return loss, per_example_loss, logits

    loss1, per_example_loss1, logits1 = get_regression_and_loss(hidden, loss_base)
    loss2, per_example_loss2, logits2 = get_regression_and_loss(hidden, loss_target)

    prob1 = tf.nn.softmax(logits1)[:, :, 0]
    prob2 = tf.nn.softmax(logits2)[:, :, 0]
    total_loss = loss1 + loss2
    return total_loss, loss1, loss2, per_example_loss1, per_example_loss2, prob1, prob2
def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    logging.info("*** Features ***")
    for name in sorted(features.keys()):
        logging.info("  name = %s, shape = %s" % (name, features[name].shape))

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    d_input_ids = features["d_input_ids"]
    d_input_mask = features["d_input_mask"]
    d_location_ids = features["d_location_ids"]
    next_sentence_labels = features["next_sentence_labels"]

    if dict_run_config.prediction_op == "loss":
        seed = 0
    else:
        seed = None

    if dict_run_config.prediction_op == "loss_fixed_mask" or train_config.fixed_mask:
        masked_input_ids = input_ids
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = tf.ones_like(masked_lm_positions, dtype=tf.float32)
    else:
        masked_input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights \
            = random_masking(input_ids, input_mask,
                             train_config.max_predictions_per_seq, MASK_ID, seed)

    if dict_run_config.use_d_segment_ids:
        d_segment_ids = features["d_segment_ids"]
    else:
        d_segment_ids = None

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
    model = model_class(
        config=bert_config,
        d_config=dbert_config,
        is_training=is_training,
        input_ids=masked_input_ids,
        input_mask=input_mask,
        d_input_ids=d_input_ids,
        d_input_mask=d_input_mask,
        d_location_ids=d_location_ids,
        use_target_pos_emb=dict_run_config.use_target_pos_emb,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=train_config.use_one_hot_embeddings,
        d_segment_ids=d_segment_ids,
        pool_dict_output=dict_run_config.pool_dict_output,
    )

    (masked_lm_loss, masked_lm_example_loss,
     masked_lm_log_probs) = get_masked_lm_output(
         bert_config, model.get_sequence_output(), model.get_embedding_table(),
         masked_lm_positions, masked_lm_ids, masked_lm_weights)

    (next_sentence_loss, next_sentence_example_loss,
     next_sentence_log_probs) = get_next_sentence_output(
         bert_config, model.get_pooled_output(), next_sentence_labels)

    total_loss = masked_lm_loss

    if dict_run_config.train_op == "entry_prediction":
        score_label = features["useful_entry"]  # [batch, 1]
        score_label = tf.reshape(score_label, [-1])
        entry_logits = bert_common.dense(
            2, bert_common.create_initializer(bert_config.initializer_range))(
                model.get_dict_pooled_output())
        print("entry_logits: ", entry_logits.shape)
        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=entry_logits, labels=score_label)
        loss = tf.reduce_mean(losses)
        total_loss = loss

    if dict_run_config.train_op == "lookup":
        lookup_idx = features["lookup_idx"]
        lookup_loss, lookup_example_loss, lookup_score = \
            sequence_index_prediction(bert_config, lookup_idx,
                                      model.get_sequence_output())
        total_loss += lookup_loss

    tvars = tf.compat.v1.trainable_variables()

    init_vars = {}
    scaffold_fn = None
    if train_config.init_checkpoint:
        if dict_run_config.is_bert_checkpoint:
            map1, map2, init_vars = get_bert_assignment_map_for_dict(
                tvars, train_config.init_checkpoint)

            def load_fn():
                tf.compat.v1.train.init_from_checkpoint(
                    train_config.init_checkpoint, map1)
                tf.compat.v1.train.init_from_checkpoint(
                    train_config.init_checkpoint, map2)
        else:
            map1, init_vars = get_assignment_map_as_is(
                tvars, train_config.init_checkpoint)

            def load_fn():
                tf.compat.v1.train.init_from_checkpoint(
                    train_config.init_checkpoint, map1)

        if train_config.use_tpu:
            def tpu_scaffold():
                load_fn()
                return tf.compat.v1.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            load_fn()

    logging.info("**** Trainable Variables ****")
    for var in tvars:
        init_string = ""
        if var.name in init_vars:
            init_string = ", *INIT_FROM_CKPT*"
        logging.info("  name = %s, shape = %s%s", var.name, var.shape, init_string)
    logging.info("Total parameters : %d" % get_param_num())

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        if train_config.gradient_accumulation == 1:
            train_op = optimization.create_optimizer_from_config(
                total_loss, train_config)
        else:
            logging.info("Using gradient accumulation : %d"
                         % train_config.gradient_accumulation)
            train_op = get_accumulated_optimizer_from_config(
                total_loss, train_config, tvars,
                train_config.gradient_accumulation)
        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:
        eval_metrics = (metric_fn, [
            masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
            masked_lm_weights, next_sentence_example_loss,
            next_sentence_log_probs, next_sentence_labels
        ])
        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)
    else:
        if dict_run_config.prediction_op == "gradient":
            logging.info("Fetching gradient")
            gradient = get_gradients(model, masked_lm_log_probs,
                                     train_config.max_predictions_per_seq,
                                     bert_config.vocab_size)
            predictions = {
                "masked_input_ids": masked_input_ids,
                #"input_ids": input_ids,
                "d_input_ids": d_input_ids,
                "masked_lm_positions": masked_lm_positions,
                "gradients": gradient,
            }
        elif dict_run_config.prediction_op == "loss" \
                or dict_run_config.prediction_op == "loss_fixed_mask":
            logging.info("Fetching loss")
            predictions = {
                "masked_lm_example_loss": masked_lm_example_loss,
            }
        else:
            raise Exception("prediction target not specified")

        output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            predictions=predictions,
            scaffold_fn=scaffold_fn)

    return output_spec
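# Hedged wiring sketch (not from the source): how a model_fn like the one above is
# typically handed to TPUEstimator. `run_config` and the batch-size fields of
# `train_config` are placeholders assumed for illustration.
#
# estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
#     model_fn=model_fn,
#     config=run_config,                 # tf.compat.v1.estimator.tpu.RunConfig
#     use_tpu=train_config.use_tpu,
#     train_batch_size=train_config.train_batch_size,
#     eval_batch_size=train_config.eval_batch_size)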
def binary_prediction(bert_config, input_tensor):
    logits = bert_common.dense(
        2, bert_common.create_initializer(
            bert_config.initializer_range))(input_tensor)
    log_probs = tf.nn.softmax(logits, axis=2)
    return logits, log_probs
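# Hedged illustration (not from the source): binary_prediction returns per-token
# two-class scores, so a hard label per position can be read off with argmax over
# the last axis. The input below is a toy stand-in for the returned scores.
import tensorflow as tf

toy_scores = tf.random.normal([2, 4, 2])        # [batch, seq_length, 2]
toy_probs = tf.nn.softmax(toy_scores, axis=2)   # per-token class probabilities
toy_preds = tf.argmax(toy_probs, axis=2)        # [batch, seq_length] hard labels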
def embedding_projection(self, input_tensor):
    with tf.compat.v1.variable_scope("embedding_projection", reuse=True):
        return bc.dense(self.config.hidden_size, self.initializer)(input_tensor)